err.no Git - linux-2.6/blobdiff - drivers/md/md.c
md: support 'external' metadata for md arrays
[linux-2.6] / drivers / md / md.c
index acf1b81b47cbeccf74d85cc6bc9f66ee998afbbb..e2782a04012da1e2869a2351dcbb689d88537524 100644 (file)
@@ -231,7 +231,7 @@ static void mddev_put(mddev_t *mddev)
                list_del(&mddev->all_mddevs);
                spin_unlock(&all_mddevs_lock);
                blk_cleanup_queue(mddev->queue);
-               kobject_unregister(&mddev->kobj);
+               kobject_put(&mddev->kobj);
        } else
                spin_unlock(&all_mddevs_lock);
 }
@@ -778,7 +778,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
                mddev->patch_version = sb->patch_version;
-               mddev->persistent = ! sb->not_persistent;
+               mddev->persistent = 1;
+               mddev->external = 0;
                mddev->chunk_size = sb->chunk_size;
                mddev->ctime = sb->ctime;
                mddev->utime = sb->utime;
@@ -904,7 +905,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        sb->size  = mddev->size;
        sb->raid_disks = mddev->raid_disks;
        sb->md_minor = mddev->md_minor;
-       sb->not_persistent = !mddev->persistent;
+       sb->not_persistent = 0;
        sb->utime = mddev->utime;
        sb->state = 0;
        sb->events_hi = (mddev->events>>32);
@@ -1158,6 +1159,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                mddev->major_version = 1;
                mddev->patch_version = 0;
                mddev->persistent = 1;
+               mddev->external = 0;
                mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
                mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
                mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
@@ -1383,22 +1385,19 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
                        return -EBUSY;
        }
        bdevname(rdev->bdev,b);
-       if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
-               return -ENOMEM;
-       while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
+       while ( (s=strchr(b, '/')) != NULL)
                *s = '!';
-                       
+
        rdev->mddev = mddev;
        printk(KERN_INFO "md: bind<%s>\n", b);
 
-       rdev->kobj.parent = &mddev->kobj;
-       if ((err = kobject_add(&rdev->kobj)))
+       if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
                goto fail;
 
        if (rdev->bdev->bd_part)
-               ko = &rdev->bdev->bd_part->kobj;
+               ko = &rdev->bdev->bd_part->dev.kobj;
        else
-               ko = &rdev->bdev->bd_disk->kobj;
+               ko = &rdev->bdev->bd_disk->dev.kobj;
        if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
                kobject_del(&rdev->kobj);
                goto fail;
@@ -1699,18 +1698,20 @@ repeat:
                MD_BUG();
                mddev->events --;
        }
-       sync_sbs(mddev, nospares);
 
        /*
         * do not write anything to disk if using
         * nonpersistent superblocks
         */
        if (!mddev->persistent) {
-               clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+               if (!mddev->external)
+                       clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+
                spin_unlock_irq(&mddev->write_lock);
                wake_up(&mddev->sb_wait);
                return;
        }
+       sync_sbs(mddev, nospares);
        spin_unlock_irq(&mddev->write_lock);
 
        dprintk(KERN_INFO 
@@ -2036,9 +2037,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
        if (err)
                goto abort_free;
 
-       rdev->kobj.parent = NULL;
-       rdev->kobj.ktype = &rdev_ktype;
-       kobject_init(&rdev->kobj);
+       kobject_init(&rdev->kobj, &rdev_ktype);
 
        rdev->desc_nr = -1;
        rdev->saved_raid_disk = -1;
@@ -2430,6 +2429,8 @@ array_state_show(mddev_t *mddev, char *page)
                case 0:
                        if (mddev->in_sync)
                                st = clean;
+                       else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
+                               st = write_pending;
                        else if (mddev->safemode)
                                st = active_idle;
                        else
@@ -2460,11 +2461,9 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
                break;
        case clear:
                /* stopping an active array */
-               if (mddev->pers) {
-                       if (atomic_read(&mddev->active) > 1)
-                               return -EBUSY;
-                       err = do_md_stop(mddev, 0);
-               }
+               if (atomic_read(&mddev->active) > 1)
+                       return -EBUSY;
+               err = do_md_stop(mddev, 0);
                break;
        case inactive:
                /* stopping an active array */
@@ -2472,7 +2471,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
                        if (atomic_read(&mddev->active) > 1)
                                return -EBUSY;
                        err = do_md_stop(mddev, 2);
-               }
+               } else
+                       err = 0; /* already inactive */
                break;
        case suspended:
                break; /* not supported yet */
@@ -2500,9 +2500,15 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
                        restart_array(mddev);
                        spin_lock_irq(&mddev->write_lock);
                        if (atomic_read(&mddev->writes_pending) == 0) {
-                               mddev->in_sync = 1;
-                               set_bit(MD_CHANGE_CLEAN, &mddev->flags);
-                       }
+                               if (mddev->in_sync == 0) {
+                                       mddev->in_sync = 1;
+                                       if (mddev->persistent)
+                                               set_bit(MD_CHANGE_CLEAN,
+                                                       &mddev->flags);
+                               }
+                               err = 0;
+                       } else
+                               err = -EBUSY;
                        spin_unlock_irq(&mddev->write_lock);
                } else {
                        mddev->ro = 0;
@@ -2513,7 +2519,8 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len)
        case active:
                if (mddev->pers) {
                        restart_array(mddev);
-                       clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
+                       if (mddev->external)
+                               clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
                        wake_up(&mddev->sb_wait);
                        err = 0;
                } else {
@@ -2664,7 +2671,9 @@ __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
 
 
 /* Metdata version.
- * This is either 'none' for arrays with externally managed metadata,
+ * This is one of
+ *   'none' for arrays with no metadata (good luck...)
+ *   'external' for arrays with externally managed metadata,
  * or N.M for internally known formats
  */
 static ssize_t
@@ -2673,6 +2682,8 @@ metadata_show(mddev_t *mddev, char *page)
        if (mddev->persistent)
                return sprintf(page, "%d.%d\n",
                               mddev->major_version, mddev->minor_version);
+       else if (mddev->external)
+               return sprintf(page, "external:%s\n", mddev->metadata_type);
        else
                return sprintf(page, "none\n");
 }
@@ -2687,6 +2698,21 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
 
        if (cmd_match(buf, "none")) {
                mddev->persistent = 0;
+               mddev->external = 0;
+               mddev->major_version = 0;
+               mddev->minor_version = 90;
+               return len;
+       }
+       if (strncmp(buf, "external:", 9) == 0) {
+               int namelen = len-9;
+               if (namelen >= sizeof(mddev->metadata_type))
+                       namelen = sizeof(mddev->metadata_type)-1;
+               strncpy(mddev->metadata_type, buf+9, namelen);
+               mddev->metadata_type[namelen] = 0;
+               if (namelen && mddev->metadata_type[namelen-1] == '\n')
+                       mddev->metadata_type[--namelen] = 0;
+               mddev->persistent = 0;
+               mddev->external = 1;
                mddev->major_version = 0;
                mddev->minor_version = 90;
                return len;
@@ -2703,6 +2729,7 @@ metadata_store(mddev_t *mddev, const char *buf, size_t len)
        mddev->major_version = major;
        mddev->minor_version = minor;
        mddev->persistent = 1;
+       mddev->external = 0;
        return len;
 }
 
@@ -2714,7 +2741,7 @@ action_show(mddev_t *mddev, char *page)
 {
        char *type = "idle";
        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
-           test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
+           (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
                if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                        type = "reshape";
                else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -2833,6 +2860,12 @@ sync_max_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_sync_max =
 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
 
+static ssize_t
+degraded_show(mddev_t *mddev, char *page)
+{
+       return sprintf(page, "%d\n", mddev->degraded); /* decimal value + newline */
+}
+static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
 
 static ssize_t
 sync_speed_show(mddev_t *mddev, char *page)
@@ -2976,6 +3009,7 @@ static struct attribute *md_redundancy_attrs[] = {
        &md_suspend_lo.attr,
        &md_suspend_hi.attr,
        &md_bitmap.attr,
+       &md_degraded.attr,
        NULL,
 };
 static struct attribute_group md_redundancy_group = {
@@ -3047,6 +3081,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
        int partitioned = (MAJOR(dev) != MD_MAJOR);
        int shift = partitioned ? MdpMinorShift : 0;
        int unit = MINOR(dev) >> shift;
+       int error;
 
        if (!mddev)
                return NULL;
@@ -3075,12 +3110,13 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
        add_disk(disk);
        mddev->gendisk = disk;
        mutex_unlock(&disks_mutex);
-       mddev->kobj.parent = &disk->kobj;
-       kobject_set_name(&mddev->kobj, "%s", "md");
-       mddev->kobj.ktype = &md_ktype;
-       if (kobject_register(&mddev->kobj))
+       error = kobject_init_and_add(&mddev->kobj, &md_ktype, &disk->dev.kobj,
+                                    "%s", "md");
+       if (error)
                printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
                       disk->disk_name);
+       else
+               kobject_uevent(&mddev->kobj, KOBJ_ADD);
        return NULL;
 }
 
@@ -3352,7 +3388,7 @@ static int do_md_run(mddev_t * mddev)
 
        mddev->changed = 1;
        md_new_event(mddev);
-       kobject_uevent(&mddev->gendisk->kobj, KOBJ_CHANGE);
+       kobject_uevent(&mddev->gendisk->dev.kobj, KOBJ_CHANGE);
        return 0;
 }
 
@@ -3463,7 +3499,6 @@ static int do_md_stop(mddev_t * mddev, int mode)
                        mddev->pers->stop(mddev);
                        mddev->queue->merge_bvec_fn = NULL;
                        mddev->queue->unplug_fn = NULL;
-                       mddev->queue->issue_flush_fn = NULL;
                        mddev->queue->backing_dev_info.congested_fn = NULL;
                        if (mddev->pers->sync_request)
                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
@@ -3521,6 +3556,7 @@ static int do_md_stop(mddev_t * mddev, int mode)
                mddev->raid_disks = 0;
                mddev->recovery_cp = 0;
                mddev->reshape_position = MaxSector;
+               mddev->external = 0;
 
        } else if (mddev->pers)
                printk(KERN_INFO "md: %s switched to read-only mode.\n",
@@ -4162,13 +4198,15 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
        else
                mddev->recovery_cp = 0;
        mddev->persistent    = ! info->not_persistent;
+       mddev->external      = 0;
 
        mddev->layout        = info->layout;
        mddev->chunk_size    = info->chunk_size;
 
        mddev->max_disks     = MD_SB_DISKS;
 
-       mddev->flags         = 0;
+       if (mddev->persistent)
+               mddev->flags         = 0;
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
        mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
@@ -4711,7 +4749,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 
 void md_unregister_thread(mdk_thread_t *thread)
 {
-       dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+       dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 
        kthread_stop(thread->tsk);
        kfree(thread);
@@ -4979,7 +5017,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
                                           mddev->major_version,
                                           mddev->minor_version);
                        }
-               } else
+               } else if (mddev->external)
+                       seq_printf(seq, " super external:%s",
+                                  mddev->metadata_type);
+               else
                        seq_printf(seq, " super non-persistent");
 
                if (mddev->pers) {
@@ -5439,7 +5480,7 @@ void md_do_sync(mddev_t *mddev)
                 * about not overloading the IO subsystem. (things like an
                 * e2fsck being done on the RAID array should execute fast)
                 */
-               mddev->queue->unplug_fn(mddev->queue);
+               blk_unplug(mddev->queue);
                cond_resched();
 
                currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
@@ -5458,7 +5499,7 @@ void md_do_sync(mddev_t *mddev)
         * this also signals 'finished resyncing' to md_stop
         */
  out:
-       mddev->queue->unplug_fn(mddev->queue);
+       blk_unplug(mddev->queue);
 
        wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
@@ -5586,7 +5627,7 @@ void md_check_recovery(mddev_t *mddev)
        }
 
        if ( ! (
-               mddev->flags ||
+               (mddev->flags && !mddev->external) ||
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
                (mddev->safemode == 1) ||
@@ -5602,7 +5643,8 @@ void md_check_recovery(mddev_t *mddev)
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
                    !mddev->in_sync && mddev->recovery_cp == MaxSector) {
                        mddev->in_sync = 1;
-                       set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+                       if (mddev->persistent)
+                               set_bit(MD_CHANGE_CLEAN, &mddev->flags);
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;
@@ -5771,26 +5813,47 @@ static int __init md_init(void)
  * Searches all registered partitions for autorun RAID arrays
  * at boot time.
  */
-static dev_t detected_devices[128];
-static int dev_cnt;
+
+static LIST_HEAD(all_detected_devices); /* filled by md_autodetect_dev() */
+struct detected_devices_node {
+       struct list_head list; /* link in all_detected_devices */
+       dev_t dev; /* device number recorded for this entry */
+};
 
 void md_autodetect_dev(dev_t dev)
 {
-       if (dev_cnt >= 0 && dev_cnt < 127)
-               detected_devices[dev_cnt++] = dev;
+       struct detected_devices_node *node_detected_dev; /* new entry for dev */
+
+       node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
+       if (node_detected_dev) { /* append dev to all_detected_devices */
+               node_detected_dev->dev = dev;
+               list_add_tail(&node_detected_dev->list, &all_detected_devices);
+       } else { /* allocation failed: log it and drop this device */
+               printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
+                       ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
+       }
 }
 
 
 static void autostart_arrays(int part)
 {
        mdk_rdev_t *rdev;
-       int i;
+       struct detected_devices_node *node_detected_dev;
+       dev_t dev;
+       int i_scanned, i_passed;
 
-       printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+       i_scanned = 0;
+       i_passed = 0;
 
-       for (i = 0; i < dev_cnt; i++) {
-               dev_t dev = detected_devices[i];
+       printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
 
+       while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
+               i_scanned++;
+               node_detected_dev = list_entry(all_detected_devices.next,
+                                       struct detected_devices_node, list);
+               list_del(&node_detected_dev->list);
+               dev = node_detected_dev->dev;
+               kfree(node_detected_dev);
                rdev = md_import_device(dev,0, 90);
                if (IS_ERR(rdev))
                        continue;
@@ -5800,8 +5863,11 @@ static void autostart_arrays(int part)
                        continue;
                }
                list_add(&rdev->same_set, &pending_raid_disks);
+               i_passed++;
        }
-       dev_cnt = 0;
+
+       printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
+                                               i_scanned, i_passed);
 
        autorun_devices(part);
 }