diff --git a/Create.c b/Create.c index 2721884..4080bf6 100644 --- a/Create.c +++ b/Create.c @@ -259,7 +259,8 @@ int Create(struct supertype *st, char *mddev, if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks, &s->chunk, s->size*2, data_offset, NULL, - &newsize, c->verbose>=0)) + &newsize, s->consistency_policy, + c->verbose>=0)) return 1; if (s->chunk && s->chunk != UnSet) { @@ -358,7 +359,8 @@ int Create(struct supertype *st, char *mddev, st, s->level, s->layout, s->raiddisks, &s->chunk, s->size*2, dv->data_offset, dname, - &freesize, c->verbose > 0)) { + &freesize, s->consistency_policy, + c->verbose > 0)) { case -1: /* Not valid, message printed, and not * worth checking any further */ exit(2); @@ -395,6 +397,7 @@ int Create(struct supertype *st, char *mddev, &s->chunk, s->size*2, dv->data_offset, dname, &freesize, + s->consistency_policy, c->verbose >= 0)) { pr_err("%s is not suitable for this array.\n", @@ -501,7 +504,8 @@ int Create(struct supertype *st, char *mddev, s->raiddisks, &s->chunk, minsize*2, data_offset, - NULL, NULL, 0)) { + NULL, NULL, + s->consistency_policy, 0)) { pr_err("devices too large for RAID level %d\n", s->level); return 1; } @@ -528,6 +532,12 @@ int Create(struct supertype *st, char *mddev, if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) s->bitmap_file = NULL; + if (s->consistency_policy == CONSISTENCY_POLICY_PPL && + !st->ss->write_init_ppl) { + pr_err("%s metadata does not support PPL\n", st->ss->name); + return 1; + } + if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) { if (c->runstop != 1 || c->verbose >= 0) pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n", @@ -720,7 +730,7 @@ int Create(struct supertype *st, char *mddev, name += 2; } } - if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid, + if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid, data_offset)) goto abort_locked; diff --git a/Kill.c b/Kill.c index 
f2fdb85..ff52561 100644 --- a/Kill.c +++ b/Kill.c @@ -63,7 +63,7 @@ int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl) rv = st->ss->load_super(st, fd, dev); if (rv == 0 || (force && rv >= 2)) { st->ss->free_super(st); - st->ss->init_super(st, NULL, 0, "", NULL, NULL, + st->ss->init_super(st, NULL, NULL, "", NULL, NULL, INVALID_SECTORS); if (st->ss->store_super(st, fd)) { if (verbose >= 0) diff --git a/ReadMe.c b/ReadMe.c index 50d3807..fc04c2c 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -78,11 +78,11 @@ char Version[] = "mdadm - v" VERSION " - " VERS_DATE "\n"; * found, it is started. */ -char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; +char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; char short_bitmap_options[]= - "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; char short_bitmap_auto_options[]= - "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:k:"; struct option long_options[] = { {"manage", 0, 0, ManageOpt}, @@ -148,6 +148,7 @@ struct option long_options[] = { {"nodes",1, 0, Nodes}, /* also for --assemble */ {"home-cluster",1, 0, ClusterName}, {"write-journal",1, 0, WriteJournal}, + {"consistency-policy", 1, 0, 'k'}, /* For assemble */ {"uuid", 1, 0, 'u'}, @@ -362,27 +363,29 @@ char Help_create[] = " other levels.\n" "\n" " Options that are valid with --create (-C) are:\n" -" --bitmap= : Create a bitmap for the array with the given filename\n" -" : or an internal bitmap is 'internal' is given\n" -" --chunk= -c : chunk size in kibibytes\n" -" --rounding= : rounding factor for linear array (==chunk size)\n" -" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n" -" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" -" --layout= : same as --parity, for RAID10: [fno]NN \n" 
-" --raid-devices= -n : number of active devices in array\n" -" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" -" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" -" --data-offset= : Space to leave between start of device and start\n" -" : of array data.\n" -" --force -f : Honour devices as listed on command line. Don't\n" -" : insert a missing drive for RAID5.\n" -" --run -R : insist of running the array even if not all\n" -" : devices are present or some look odd.\n" -" --readonly -o : start the array readonly - not supported yet.\n" -" --name= -N : Textual name for array - max 32 characters\n" -" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" -" --delay= -d : bitmap update delay in seconds.\n" -" --write-journal= : Specify journal device for RAID-4/5/6 array\n" +" --bitmap= -b : Create a bitmap for the array with the given filename\n" +" : or an internal bitmap if 'internal' is given\n" +" --chunk= -c : chunk size in kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n" +" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" +" --raid-devices= -n : number of active devices in array\n" +" --spare-devices= -x : number of spare (eXtra) devices in initial array\n" +" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" +" --data-offset= : Space to leave between start of device and start\n" +" : of array data.\n" +" --force -f : Honour devices as listed on command line. 
Don't\n" +" : insert a missing drive for RAID5.\n" +" --run -R : insist of running the array even if not all\n" +" : devices are present or some look odd.\n" +" --readonly -o : start the array readonly - not supported yet.\n" +" --name= -N : Textual name for array - max 32 characters\n" +" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" +" --delay= -d : bitmap update delay in seconds.\n" +" --write-journal= : Specify journal device for RAID-4/5/6 array\n" +" --consistency-policy= : Specify the policy that determines how the array\n" +" -k : maintains consistency in case of unexpected shutdown.\n" "\n" ; diff --git a/maps.c b/maps.c index 64f1df2..d9ee7de 100644 --- a/maps.c +++ b/maps.c @@ -129,6 +129,16 @@ mapping_t faultylayout[] = { { NULL, 0} }; +mapping_t consistency_policies[] = { + { "unknown", CONSISTENCY_POLICY_UNKNOWN}, + { "none", CONSISTENCY_POLICY_NONE}, + { "resync", CONSISTENCY_POLICY_RESYNC}, + { "bitmap", CONSISTENCY_POLICY_BITMAP}, + { "journal", CONSISTENCY_POLICY_JOURNAL}, + { "ppl", CONSISTENCY_POLICY_PPL}, + { NULL, 0} +}; + char *map_num(mapping_t *map, int num) { while (map->name) { diff --git a/mdadm.8.in b/mdadm.8.in index df1d460..cad5db5 100644 --- a/mdadm.8.in +++ b/mdadm.8.in @@ -724,7 +724,9 @@ When creating an array on devices which are 100G or larger, .I mdadm automatically adds an internal bitmap as it will usually be beneficial. This can be suppressed with -.B "\-\-bitmap=none". +.B "\-\-bitmap=none" +or by selecting a different consistency policy with +.BR \-\-consistency\-policy . .TP .BR \-\-bitmap\-chunk= @@ -1020,6 +1022,36 @@ should be a SSD with reasonable lifetime. Auto creation of symlinks in /dev to /dev/md, option --symlinks must be 'no' or 'yes' and work with --create and --build. +.TP +.BR \-k ", " \-\-consistency\-policy= +Specify how the array maintains consistency in case of unexpected shutdown. +Only relevant for RAID levels with redundancy. 
+Currently supported options are: +.RS + +.TP +.B resync +Full resync is performed and all redundancy is regenerated when the array is +started after unclean shutdown. + +.TP +.B bitmap +Resync assisted by a write-intent bitmap. Implicitly selected when using +.BR \-\-bitmap . + +.TP +.B journal +For RAID levels 4/5/6, a journal device is used to log transactions and replay +after unclean shutdown. Implicitly selected when using +.BR \-\-write\-journal . + +.TP +.B ppl +For RAID5 only, Partial Parity Log is used to close the write hole and +eliminate resync. PPL is stored in the metadata region of RAID member drives; +no additional journal drive is needed. +.RE + .SH For assemble: @@ -2153,8 +2185,10 @@ in the array exceed 100G is size, an internal write-intent bitmap will automatically be added unless some other option is explicitly requested with the .B \-\-bitmap -option. In any case space for a bitmap will be reserved so that one -can be added layer with +option or a different consistency policy is selected with the +.B \-\-consistency\-policy +option. In any case space for a bitmap will be reserved so that one +can be added later with .BR "\-\-grow \-\-bitmap=internal" . If the metadata type supports it (currently only 1.x metadata), space diff --git a/mdadm.c b/mdadm.c index 08ddcab..d4e8286 100644 --- a/mdadm.c +++ b/mdadm.c @@ -78,6 +78,7 @@ int main(int argc, char *argv[]) .level = UnSet, .layout = UnSet, .bitmap_chunk = UnSet, + .consistency_policy = UnSet, }; char sys_hostname[256]; @@ -1215,6 +1216,16 @@ int main(int argc, char *argv[]) s.journaldisks = 1; continue; + case O(CREATE, 'k'): + s.consistency_policy = map_name(consistency_policies, + optarg); + if (s.consistency_policy == UnSet || + s.consistency_policy < CONSISTENCY_POLICY_RESYNC) { + pr_err("Invalid consistency policy: %s\n", + optarg); + exit(2); + } + continue; } /* We have now processed all the valid options. 
Anything else is * an error @@ -1242,9 +1253,47 @@ int main(int argc, char *argv[]) exit(0); } - if (s.journaldisks && (s.level < 4 || s.level > 6)) { - pr_err("--write-journal is only supported for RAID level 4/5/6.\n"); - exit(2); + if (s.journaldisks) { + if (s.level < 4 || s.level > 6) { + pr_err("--write-journal is only supported for RAID level 4/5/6.\n"); + exit(2); + } + if (s.consistency_policy != UnSet && + s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) { + pr_err("--write-journal is not supported with consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } + } + + if (mode == CREATE && s.consistency_policy != UnSet) { + if (s.level <= 0) { + pr_err("--consistency-policy not meaningful with level %s.\n", + map_num(pers, s.level)); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_JOURNAL && + !s.journaldisks) { + pr_err("--write-journal is required for consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_PPL && + s.level != 5) { + pr_err("PPL consistency policy is only supported for RAID level 5.\n"); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_BITMAP && + (!s.bitmap_file || + strcmp(s.bitmap_file, "none") == 0)) { + pr_err("--bitmap is required for consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } else if (s.bitmap_file && + strcmp(s.bitmap_file, "none") != 0 && + s.consistency_policy != CONSISTENCY_POLICY_BITMAP && + s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) { + pr_err("--bitmap is not compatible with consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } } if (!mode && devs_found) { diff --git a/mdadm.h b/mdadm.h index cebc0c0..b52d4d3 100644 --- a/mdadm.h +++ b/mdadm.h @@ -279,6 +279,15 @@ struct mdinfo { int journal_device_required; int journal_clean; + enum { + 
CONSISTENCY_POLICY_UNKNOWN, + CONSISTENCY_POLICY_NONE, + CONSISTENCY_POLICY_RESYNC, + CONSISTENCY_POLICY_BITMAP, + CONSISTENCY_POLICY_JOURNAL, + CONSISTENCY_POLICY_PPL, + } consistency_policy; + /* During reshape we can sometimes change the data_offset to avoid * over-writing still-valid data. We need to know if there is space. * So getinfo_super will fill in space_before and space_after in sectors. @@ -426,6 +435,7 @@ enum special_options { ClusterName, ClusterConfirm, WriteJournal, + ConsistencyPolicy, }; enum prefix_standard { @@ -527,6 +537,7 @@ struct shape { int assume_clean; int write_behind; unsigned long long size; + int consistency_policy; }; /* List of device names - wildcards expanded */ @@ -618,6 +629,7 @@ enum sysfs_read_flags { GET_STATE = (1 << 23), GET_ERROR = (1 << 24), GET_ARRAY_STATE = (1 << 25), + GET_CONSISTENCY_POLICY = (1 << 26), }; /* If fd >= 0, get the array it is open on, @@ -701,7 +713,7 @@ extern int restore_stripes(int *dest, unsigned long long *offsets, extern char *map_num(mapping_t *map, int num); extern int map_name(mapping_t *map, char *name); -extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[]; +extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[], consistency_policies[]; extern char *map_dev_preferred(int major, int minor, int create, char *prefer); @@ -863,7 +875,7 @@ extern struct superswitch { * metadata. */ int (*init_super)(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, + struct shape *s, char *name, char *homehost, int *uuid, unsigned long long data_offset); @@ -961,7 +973,7 @@ extern struct superswitch { int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose); + int consistency_policy, int verbose); /* Return a linked list of 'mdinfo' structures for all arrays * in the container. 
For non-containers, it is like @@ -1059,6 +1071,9 @@ extern struct superswitch { /* validate container after assemble */ int (*validate_container)(struct mdinfo *info); + /* write initial empty PPL on device */ + int (*write_init_ppl)(struct supertype *st, struct mdinfo *info, int fd); + /* records new bad block in metadata */ int (*record_bad_block)(struct active_array *a, int n, unsigned long long sector, int length); diff --git a/super-ddf.c b/super-ddf.c index 1707ad1..cdd16a4 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -2290,7 +2290,7 @@ static unsigned int find_vde_by_guid(const struct ddf_super *ddf, static int init_super_ddf(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, char *homehost, + struct shape *s, char *name, char *homehost, int *uuid, unsigned long long data_offset) { /* This is primarily called by Create when creating a new array. @@ -2328,7 +2328,7 @@ static int init_super_ddf(struct supertype *st, struct virtual_disk *vd; if (st->sb) - return init_super_ddf_bvd(st, info, size, name, homehost, uuid, + return init_super_ddf_bvd(st, info, s->size, name, homehost, uuid, data_offset); if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { @@ -3347,7 +3347,7 @@ static int validate_geometry_ddf(struct supertype *st, int *chunk, unsigned long long size, unsigned long long data_offset, char *dev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { int fd; struct mdinfo *sra; diff --git a/super-gpt.c b/super-gpt.c index 8b080a0..bb38a97 100644 --- a/super-gpt.c +++ b/super-gpt.c @@ -205,7 +205,7 @@ static int validate_geometry(struct supertype *st, int level, int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { pr_err("gpt metadata cannot be used this way\n"); return 0; diff --git a/super-intel.c b/super-intel.c index e1618f1..5d0f131 100644 --- a/super-intel.c 
+++ b/super-intel.c @@ -5155,7 +5155,7 @@ static int check_name(struct intel_super *super, char *name, int quiet) } static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, + struct shape *s, char *name, char *homehost, int *uuid, long long data_offset) { @@ -5250,7 +5250,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN); array_blocks = calc_array_size(info->level, info->raid_disks, info->layout, info->chunk_size, - size * 2); + s->size * 2); /* round array size down to closest MB */ array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; @@ -5264,7 +5264,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, vol->curr_migr_unit = 0; map = get_imsm_map(dev, MAP_0); set_pba_of_lba0(map, super->create_offset); - set_blocks_per_member(map, info_to_blocks_per_member(info, size)); + set_blocks_per_member(map, info_to_blocks_per_member(info, s->size)); map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); map->failed_disk_num = ~0; if (info->level > 0) @@ -5292,7 +5292,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, map->num_domains = 1; /* info->size is only int so use the 'size' parameter instead */ - num_data_stripes = (size * 2) / info_to_blocks_per_strip(info); + num_data_stripes = (s->size * 2) / info_to_blocks_per_strip(info); num_data_stripes /= map->num_domains; set_num_data_stripes(map, num_data_stripes); @@ -5314,7 +5314,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, } static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, + struct shape *s, char *name, char *homehost, int *uuid, unsigned long long data_offset) { @@ -5337,7 +5337,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, } if 
(st->sb) - return init_super_imsm_volume(st, info, size, name, homehost, uuid, + return init_super_imsm_volume(st, info, s, name, homehost, uuid, data_offset); if (info) @@ -6914,7 +6914,7 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout, int raiddisks, int *chunk, unsigned long long size, unsigned long long data_offset, char *dev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { int fd, cfd; struct mdinfo *sra; @@ -10953,7 +10953,7 @@ enum imsm_reshape_type imsm_analyze_change(struct supertype *st, geo->raid_disks + devNumChange, &chunk, geo->size, INVALID_SECTORS, - 0, 0, 1)) + 0, 0, info.consistency_policy, 1)) change = -1; if (check_devs) { diff --git a/super-mbr.c b/super-mbr.c index f5e4cea..1bbe57a 100644 --- a/super-mbr.c +++ b/super-mbr.c @@ -193,7 +193,7 @@ static int validate_geometry(struct supertype *st, int level, int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { pr_err("mbr metadata cannot be used this way\n"); return 0; diff --git a/super0.c b/super0.c index f5b4507..7a555e3 100644 --- a/super0.c +++ b/super0.c @@ -725,7 +725,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info, * We use the first 8 bytes (64bits) of the sha1 of the host name */ static int init_super0(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *ignored_name, + struct shape *s, char *ignored_name, char *homehost, int *uuid, unsigned long long data_offset) { @@ -764,8 +764,8 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info, sb->gvalid_words = 0; /* ignored */ sb->ctime = time(0); sb->level = info->level; - sb->size = size; - if (size != (unsigned long long)sb->size) + sb->size = s->size; + if (s->size != (unsigned long long)sb->size) return 0; sb->nr_disks = info->nr_disks; sb->raid_disks = info->raid_disks; @@ -1267,7 
+1267,7 @@ static int validate_geometry0(struct supertype *st, int level, int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { unsigned long long ldsize; int fd; diff --git a/super1.c b/super1.c index f3520ac..4a0f041 100644 --- a/super1.c +++ b/super1.c @@ -1397,7 +1397,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info, } static int init_super1(struct supertype *st, mdu_array_info_t *info, - unsigned long long size, char *name, char *homehost, + struct shape *s, char *name, char *homehost, int *uuid, unsigned long long data_offset) { struct mdp_superblock_1 *sb; @@ -1450,7 +1450,7 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info, sb->ctime = __cpu_to_le64((unsigned long long)time(0)); sb->level = __cpu_to_le32(info->level); sb->layout = __cpu_to_le32(info->layout); - sb->size = __cpu_to_le64(size*2ULL); + sb->size = __cpu_to_le64(s->size*2ULL); sb->chunksize = __cpu_to_le32(info->chunk_size>>9); sb->raid_disks = __cpu_to_le32(info->raid_disks); @@ -2487,7 +2487,7 @@ static int validate_geometry1(struct supertype *st, int level, int *chunk, unsigned long long size, unsigned long long data_offset, char *subdev, unsigned long long *freesize, - int verbose) + int consistency_policy, int verbose) { unsigned long long ldsize, devsize; int bmspace; diff --git a/sysfs.c b/sysfs.c index b0657a0..53589a7 100644 --- a/sysfs.c +++ b/sysfs.c @@ -242,6 +242,17 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) } else sra->sysfs_array_state[0] = 0; + if (options & GET_CONSISTENCY_POLICY) { + strcpy(base, "consistency_policy"); + if (load_sys(fname, buf, sizeof(buf))) { + sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN; + } else { + sra->consistency_policy = map_name(consistency_policies, buf); + if (sra->consistency_policy == UnSet) + sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN; + } + } 
+ if (! (options & GET_DEVS)) return sra;