if (mddev->level != LEVEL_MULTIPATH) {
rdev->faulty = 0;
+ rdev->flags = 0;
desc = sb->disks + rdev->desc_nr;
if (desc->state & (1<<MD_DISK_FAULTY))
rdev->in_sync = 1;
rdev->raid_disk = desc->raid_disk;
}
+ if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
} else /* MULTIPATH are always insync */
rdev->in_sync = 1;
return 0;
spare++;
working++;
}
+ if (test_bit(WriteMostly, &rdev2->flags))
+ d->state |= (1<<MD_DISK_WRITEMOSTLY);
}
/* now set the "removed" and "faulty" bits on any missing devices */
rdev->raid_disk = role;
break;
}
+ rdev->flags = 0;
+ if (sb->devflags & WriteMostly1)
+ set_bit(WriteMostly, &rdev->flags);
} else /* MULTIPATH are always insync */
rdev->in_sync = 1;
info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC);
}
+ if (test_bit(WriteMostly, &rdev->flags))
+ info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else {
info.major = info.minor = 0;
info.raid_disk = -1;
rdev->saved_raid_disk = rdev->raid_disk;
rdev->in_sync = 0; /* just to be sure */
+ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
else
rdev->in_sync = 0;
+ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+
err = bind_rdev_to_array(rdev, mddev);
if (err) {
export_rdev(rdev);
char b[BDEVNAME_SIZE];
seq_printf(seq, " %s[%d]",
bdevname(rdev->bdev,b), rdev->desc_nr);
+ if (test_bit(WriteMostly, &rdev->flags))
+ seq_printf(seq, "(W)");
if (rdev->faulty) {
seq_printf(seq, "(F)");
continue;
{
const unsigned long this_sector = r1_bio->sector;
int new_disk = conf->last_used, disk = new_disk;
+ int wonly_disk = -1;
const int sectors = r1_bio->sectors;
sector_t new_distance, current_distance;
- mdk_rdev_t *new_rdev, *rdev;
+ mdk_rdev_t *rdev;
rcu_read_lock();
/*
- * Check if it if we can balance. We can balance on the whole
+ * Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window.
*/
/* Choose the first operation device, for consistancy */
new_disk = 0;
- while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
- !new_rdev->in_sync) {
- new_disk++;
- if (new_disk == conf->raid_disks) {
- new_disk = -1;
+ for (rdev = conf->mirrors[new_disk].rdev;
+ !rdev || !rdev->in_sync
+ || test_bit(WriteMostly, &rdev->flags);
+ rdev = conf->mirrors[++new_disk].rdev) {
+
+ if (rdev && rdev->in_sync)
+ wonly_disk = new_disk;
+
+ if (new_disk == conf->raid_disks - 1) {
+ new_disk = wonly_disk;
break;
}
}
/* make sure the disk is operational */
- while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
- !new_rdev->in_sync) {
+ for (rdev = conf->mirrors[new_disk].rdev;
+ !rdev || !rdev->in_sync ||
+ test_bit(WriteMostly, &rdev->flags);
+ rdev = conf->mirrors[new_disk].rdev) {
+
+ if (rdev && rdev->in_sync)
+ wonly_disk = new_disk;
+
if (new_disk <= 0)
new_disk = conf->raid_disks;
new_disk--;
if (new_disk == disk) {
- new_disk = -1;
- goto rb_out;
+ new_disk = wonly_disk;
+ break;
}
}
+
+ if (new_disk < 0)
+ goto rb_out;
+
disk = new_disk;
/* now disk == new_disk == starting point for search */
disk = conf->raid_disks;
disk--;
- if ((rdev=conf->mirrors[disk].rdev) == NULL ||
- !rdev->in_sync)
+ rdev = conf->mirrors[disk].rdev;
+
+ if (!rdev ||
+ !rdev->in_sync ||
+ test_bit(WriteMostly, &rdev->flags))
continue;
if (!atomic_read(&rdev->nr_pending)) {
new_disk = disk;
- new_rdev = rdev;
break;
}
new_distance = abs(this_sector - conf->mirrors[disk].head_position);
if (new_distance < current_distance) {
current_distance = new_distance;
new_disk = disk;
- new_rdev = rdev;
}
} while (disk != conf->last_used);
-rb_out:
+ rb_out:
if (new_disk >= 0) {
- conf->next_seq_sect = this_sector + sectors;
- conf->last_used = new_disk;
- atomic_inc(&new_rdev->nr_pending);
- if (!new_rdev->in_sync) {
+ rdev = conf->mirrors[new_disk].rdev;
+ if (!rdev)
+ goto retry;
+ atomic_inc(&rdev->nr_pending);
+ if (!rdev->in_sync) {
/* cannot risk returning a device that failed
* before we inc'ed nr_pending
*/
- atomic_dec(&new_rdev->nr_pending);
+ atomic_dec(&rdev->nr_pending);
goto retry;
}
+ conf->next_seq_sect = this_sector + sectors;
+ conf->last_used = new_disk;
}
rcu_read_unlock();
sector_t max_sector, nr_sectors;
int disk;
int i;
+ int wonly;
int write_targets = 0;
int sync_blocks;
int still_degraded = 0;
*/
disk = conf->last_used;
/* make sure disk is operational */
-
+ wonly = disk;
while (conf->mirrors[disk].rdev == NULL ||
- !conf->mirrors[disk].rdev->in_sync) {
+ !conf->mirrors[disk].rdev->in_sync ||
+ test_bit(WriteMostly, &conf->mirrors[disk].rdev->flags)
+ ) {
+ if (conf->mirrors[disk].rdev &&
+ conf->mirrors[disk].rdev->in_sync)
+ wonly = disk;
if (disk <= 0)
disk = conf->raid_disks;
disk--;
- if (disk == conf->last_used)
+ if (disk == conf->last_used) {
+ disk = wonly;
break;
+ }
}
conf->last_used = disk;
atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
int faulty; /* if faulty do not issue IO requests */
int in_sync; /* device is a full member of the array */
+ unsigned long flags; /* Should include faulty and in_sync here. */
+#define WriteMostly 4 /* Avoid reading if at all possible */
+
int desc_nr; /* descriptor index in the superblock */
int raid_disk; /* role of device in array */
int saved_raid_disk; /* role that device used to have in the
#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */
+#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in a RAID1 config.
+ * Read requests will only be sent here in
+ * dire need.
+ */
+
typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */
__u32 major; /* 1 Device major number */
__u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/
__u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */
- __u32 layout; /* only for raid5 currently */
+ __u32 layout; /* only for raid5 and raid10 currently */
__u64 size; /* used size of component devices, in 512byte sectors */
__u32 chunksize; /* in 512byte sectors */
__u32 dev_number; /* permanent identifier of this device - not role in raid */
__u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
- __u8 pad2[64-56]; /* set to 0 when writing */
+ __u8 devflags; /* per-device flags. Only one defined...*/
+#define WriteMostly1 1 /* mask for writemostly flag in above */
+ __u8 pad2[64-57]; /* set to 0 when writing */
/* array state information - 64 bytes */
__u64 utime; /* 40 bits second, 24 btes microseconds */