mdadm/restripe.c
NeilBrown e0d95aac96 restripe: add support for new layouts including DDF
md supports new raid6 layouts to support conversion to and from
raid5 as well as DDF.  Make sure restripe handles those, including
getting the order right for Q-syndrome calculation.

Signed-off-by: NeilBrown <neilb@suse.de>
2009-05-25 10:52:31 +10:00

553 lines
14 KiB
C

/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
* Copyright (C) 2006 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Author: Neil Brown
* Email: <neilb@suse.de>
*/
#include "mdadm.h"
/* To restripe, we read from old geometry to a buffer, and
* write from buffer to new geometry.
* When reading we don't worry about parity. When writing we do.
*
*/
/*
* geo_map - map a logical block of one stripe to a device index.
*
* block:      data block number within the stripe (0-based), or -1 for
*             the parity (P) block, or -2 for the Q syndrome block.
* stripe:     stripe number; the rotating layouts use it to move parity.
* raid_disks: number of devices in the array.
* level, layout: combined as level*100 + layout to select the mapping.
*
* Returns the index of the device holding the requested block, or -1
* when the level/layout combination matches none of the cases below.
*/
static int geo_map(int block, unsigned long long stripe, int raid_disks,
int level, int layout)
{
/* On the given stripe, find which disk in the array will have
* block numbered 'block'.
* '-1' means the parity block.
* '-2' means the Q syndrome.
*/
int pd;
switch(level*100 + layout) {
case 000:
case 400:
case 500 + ALGORITHM_PARITY_N:
/* raid 4 isn't messed around by parity blocks */
if (block == -1)
return raid_disks-1; /* parity block */
return block;
case 500 + ALGORITHM_LEFT_ASYMMETRIC:
/* Parity starts on the last device and moves down one device per
* stripe; data blocks keep device order and step over parity.
*/
pd = (raid_disks-1) - stripe % raid_disks;
if (block == -1) return pd;
if (block >= pd)
block++;
return block;
case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
/* Parity starts on device 0 and moves up one device per stripe;
* data blocks keep device order and step over parity.
*/
pd = stripe % raid_disks;
if (block == -1) return pd;
if (block >= pd)
block++;
return block;
case 500 + ALGORITHM_LEFT_SYMMETRIC:
/* Parity moves as in left-asymmetric, but data block 0 is placed
* on the device right after parity and wraps around.
*/
pd = (raid_disks - 1) - stripe % raid_disks;
if (block == -1) return pd;
return (pd + 1 + block) % raid_disks;
case 500 + ALGORITHM_RIGHT_SYMMETRIC:
/* Parity moves as in right-asymmetric, but data block 0 is placed
* on the device right after parity and wraps around.
*/
pd = stripe % raid_disks;
if (block == -1) return pd;
return (pd + 1 + block) % raid_disks;
case 500 + ALGORITHM_PARITY_0:
/* Data starts at device 1; block == -1 yields 0, the parity device */
return block + 1;
case 600 + ALGORITHM_PARITY_N_6:
/* Q on the last device, P on the one before it, data in order */
if (block == -2)
return raid_disks - 1;
if (block == -1)
return raid_disks - 2; /* parity block */
return block;
case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
/* Q is fixed on the last device; the remaining raid_disks-1
* devices use the same arithmetic as the raid5 left-asymmetric case.
*/
if (block == -2)
return raid_disks - 1;
raid_disks--;
pd = (raid_disks-1) - stripe % raid_disks;
if (block == -1) return pd;
if (block >= pd)
block++;
return block;
case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
/* Q is fixed on the last device; the remaining raid_disks-1
* devices use the same arithmetic as the raid5 right-asymmetric case.
*/
if (block == -2)
return raid_disks - 1;
raid_disks--;
pd = stripe % raid_disks;
if (block == -1) return pd;
if (block >= pd)
block++;
return block;
case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
/* Q is fixed on the last device; the remaining raid_disks-1
* devices use the same arithmetic as the raid5 left-symmetric case.
*/
if (block == -2)
return raid_disks - 1;
raid_disks--;
pd = (raid_disks - 1) - stripe % raid_disks;
if (block == -1) return pd;
return (pd + 1 + block) % raid_disks;
case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
/* Q is fixed on the last device; the remaining raid_disks-1
* devices use the same arithmetic as the raid5 right-symmetric case.
*/
if (block == -2)
return raid_disks - 1;
raid_disks--;
pd = stripe % raid_disks;
if (block == -1) return pd;
return (pd + 1 + block) % raid_disks;
case 600 + ALGORITHM_PARITY_0_6:
/* Q on the last device; block == -1 yields 0, data starts at 1 */
if (block == -2)
return raid_disks - 1;
return block + 1;
case 600 + ALGORITHM_PARITY_0:
/* P on device 0, Q on device 1, data from device 2 onwards */
if (block == -1)
return 0;
if (block == -2)
return 1;
return block + 2;
case 600 + ALGORITHM_LEFT_ASYMMETRIC:
/* P moves down one device per stripe, Q is on the device after P
* (wrapping to device 0). When P is on the last device, Q is on
* device 0 so data only skips Q (+1); otherwise data at or after
* P skips both P and Q (+2).
*/
pd = raid_disks - 1 - (stripe % raid_disks);
if (block == -1) return pd;
if (block == -2) return (pd+1) % raid_disks;
if (pd == raid_disks - 1)
return block+1;
if (block >= pd)
return block+2;
return block;
case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
/* Different order for calculating Q, otherwise same as ... */
case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
/* P moves up one device per stripe, Q is on the device after P
* (wrapping to device 0); data skips them as in the
* left-asymmetric raid6 case.
*/
pd = stripe % raid_disks;
if (block == -1) return pd;
if (block == -2) return (pd+1) % raid_disks;
if (pd == raid_disks - 1)
return block+1;
if (block >= pd)
return block+2;
return block;
case 600 + ALGORITHM_LEFT_SYMMETRIC:
/* P then Q on consecutive devices (wrapping); data block 0 is
* on the device after Q and wraps around.
*/
pd = raid_disks - 1 - (stripe % raid_disks);
if (block == -1) return pd;
if (block == -2) return (pd+1) % raid_disks;
return (pd + 2 + block) % raid_disks;
case 600 + ALGORITHM_RIGHT_SYMMETRIC:
/* As left-symmetric raid6, but P moves up one device per stripe */
pd = stripe % raid_disks;
if (block == -1) return pd;
if (block == -2) return (pd+1) % raid_disks;
return (pd + 2 + block) % raid_disks;
case 600 + ALGORITHM_ROTATING_N_RESTART:
/* Same as left_asymmetric, but first stripe is
* D D D P Q rather than
* Q D D D P
*/
pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
if (block == -1) return pd;
if (block == -2) return (pd+1) % raid_disks;
if (pd == raid_disks - 1)
return block+1;
if (block >= pd)
return block+2;
return block;
case 600 + ALGORITHM_ROTATING_N_CONTINUE:
/* Same as left_symmetric but Q is before P */
pd = raid_disks - 1 - (stripe % raid_disks);
if (block == -1) return pd;
/* adding raid_disks-1 is "pd - 1" modulo raid_disks without going negative */
if (block == -2) return (pd+raid_disks-1) % raid_disks;
return (pd + 1 + block) % raid_disks;
}
/* No case matched: unknown level/layout combination */
return -1;
}
/*
 * Return 1 when 'layout' is one of the three ALGORITHM_ROTATING_*
 * layouts, and 0 for every other value.  restore_stripes() uses the
 * result to select the DDF ordering for the Q-syndrome calculation.
 */
static int is_ddf(int layout)
{
	if (layout == ALGORITHM_ROTATING_N_CONTINUE ||
	    layout == ALGORITHM_ROTATING_N_RESTART ||
	    layout == ALGORITHM_ROTATING_ZERO_RESTART)
		return 1;

	return 0;
}
/*
 * Store in 'target' the byte-wise XOR of 'disks' source buffers.
 *
 * target:  output buffer; bytes 0..size-1 are overwritten.
 * sources: array of 'disks' input buffers; bytes 0..size-1 of each are read.
 * disks:   number of entries in 'sources'.  With none, 'target' is zeroed.
 * size:    number of bytes to process.
 */
static void xor_blocks(char *target, char **sources, int disks, int size)
{
	int pos;

	/* Amazingly inefficient... */
	for (pos = 0; pos < size; pos++) {
		char acc = 0;
		int src = 0;

		while (src < disks) {
			acc ^= sources[src][pos];
			src++;
		}
		target[pos] = acc;
	}
}
/*
 * Compute the P (XOR) and Q syndrome blocks over 'disks' source buffers.
 *
 * p, q:    output buffers; bytes 0..size-1 of each are overwritten.
 * sources: array of 'disks' input buffers, bytes 0..size-1 of each are read.
 *          Order matters for Q: sources[0] is the lowest-order term.
 * disks:   number of entries in 'sources'.  With none, p and q are zeroed.
 * size:    number of bytes to process.
 *
 * For every byte position Q is evaluated by Horner's rule from the last
 * source down to the first:  q = (q * 2) ^ source, where "* 2" is a left
 * shift by one with 0x1d folded back in whenever the top bit was set.
 *
 * All arithmetic is done on unsigned char: with a signed plain 'char',
 * shifting a byte whose top bit is set would left-shift a negative value,
 * which is undefined behaviour.
 */
static void qsyndrome(char *p, char *q, char **sources, int disks, int size)
{
	int d, z;

	for (d = 0; d < size; d++) {
		unsigned char wp = 0;
		unsigned char wq = 0;

		/* Starting from zero makes the first step load sources[disks-1]
		 * into both accumulators, and handles disks == 0 for free.
		 */
		for (z = disks - 1; z >= 0; z--) {
			unsigned char wd = (unsigned char)sources[z][d];
			unsigned char fold = (wq & 0x80) ? 0x1d : 0x00;

			wp ^= wd;
			wq = (unsigned char)(((wq << 1) & 0xff) ^ fold ^ wd);
		}
		p[d] = (char)wp;
		q[d] = (char)wq;
	}
}
/* Save data:
 * We are given:
 *  A list of 'fds' of the active disks. For now we require all to be present.
 *  A geometry: raid_disks, chunk_size, level, layout
 *  A list of 'fds' for mirrored targets.  They are already seeked to
 *    the right (Write) location
 *  A start and length
 *
 * The byte range [start, start+length) of the array's data is read from
 * the member devices (source[i] at offsets[i] plus the in-device offset)
 * and each piece is written to every one of the 'nwrites' fds in 'dest'.
 *
 * Returns 0 on success, -1 if the geometry is unusable (non-positive
 * chunk_size or data-disk count, or a level/layout geo_map() does not
 * know) or if any seek, read or write fails or is short.
 */
int save_stripes(int *source, unsigned long long *offsets,
		 int raid_disks, int chunk_size, int level, int layout,
		 int nwrites, int *dest,
		 unsigned long long start, unsigned long long length)
{
	char abuf[8192+512];
	/* round up to a 512-byte boundary inside abuf */
	char *buf = (char*)(((unsigned long)abuf+511)&~511UL);
	int cpos;	/* where in chunk we are up to */
	int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);

	/* Both values are used as divisors below */
	if (chunk_size <= 0 || data_disks <= 0)
		return -1;

	cpos = start % chunk_size;

	while (length > 0) {
		unsigned long long stripe = start / chunk_size / data_disks;
		unsigned long long offset = stripe * chunk_size + cpos;
		int len = chunk_size - cpos;
		int disk;
		int i;

		/* len bytes to be moved from one device: never past the end
		 * of the chunk, the bounce buffer, or the requested range.
		 */
		if (len > 8192)
			len = 8192;
		if ((unsigned long long)len > length)
			len = length;

		disk = geo_map(start / chunk_size % data_disks, stripe,
			       raid_disks, level, layout);
		if (disk < 0)
			/* unknown level/layout: don't index source[] with -1 */
			return -1;

		if (lseek64(source[disk], offsets[disk]+offset, 0) < 0)
			return -1;
		if (read(source[disk], buf, len) != len)
			return -1;
		for (i = 0; i < nwrites; i++)
			if (write(dest[i], buf, len) != len)
				return -1;

		length -= len;
		start += len;
		cpos += len;
		while (cpos >= chunk_size)
			cpos -= chunk_size;
	}
	return 0;
}
/* Restore data:
 * We are given:
 *  A list of 'fds' of the active disks. Some may be '-1' for not-available.
 *  A geometry: raid_disks, chunk_size, level, layout
 *  An 'fd' to read from.  The data is read starting at 'read_offset'.
 *  A start and length.
 * The length must be a multiple of the stripe size.
 *
 * We build a full stripe in memory and then write it out.
 * We assume that there are enough working devices.
 *
 * Returns:
 *   0  on success
 *  -1  on a seek/read/write failure, or when the geometry is unusable
 *      (non-positive chunk_size or data-disk count, or a level/layout
 *      that geo_map() does not know)
 *  -2  when memory could not be allocated
 *  -3  when the remaining length is smaller than one full stripe
 *
 * All working buffers are released on every return path.
 */
int restore_stripes(int *dest, unsigned long long *offsets,
		    int raid_disks, int chunk_size, int level, int layout,
		    int source, unsigned long long read_offset,
		    unsigned long long start, unsigned long long length)
{
	char *stripe_buf = NULL;
	char **stripes = NULL;
	char **blocks = NULL;
	char *zero = NULL;
	int i;
	int rv = -1;
	int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);

	/* Both values are used as divisors and allocation sizes below */
	if (chunk_size <= 0 || data_disks <= 0)
		goto out;

	stripe_buf = malloc(raid_disks * chunk_size);
	stripes = malloc(raid_disks * sizeof(char*));
	blocks = malloc(raid_disks * sizeof(char*));
	zero = malloc(chunk_size);
	if (stripe_buf == NULL || stripes == NULL || blocks == NULL
	    || zero == NULL) {
		rv = -2;
		goto out;
	}

	memset(zero, 0, chunk_size);
	/* stripes[i] is the chunk that will be written to device i */
	for (i = 0; i < raid_disks; i++)
		stripes[i] = stripe_buf + i * chunk_size;

	while (length > 0) {
		int len = data_disks * chunk_size;
		unsigned long long stripe = start / chunk_size / data_disks;
		unsigned long long offset = stripe * chunk_size;
		int disk, qdisk;

		if (length < (unsigned long long)len) {
			rv = -3;
			goto out;
		}
		rv = -1;	/* any failure below is an I/O or geometry error */

		/* Read the data chunks into the slot of the device they map to */
		for (i = 0; i < data_disks; i++) {
			int ddisk = geo_map(i, stripe, raid_disks, level, layout);

			if (ddisk < 0)
				/* unknown level/layout: don't index stripes[-1] */
				goto out;
			if (lseek64(source, read_offset, 0) != read_offset)
				goto out;
			if (read(source, stripes[ddisk], chunk_size) != chunk_size)
				goto out;
			read_offset += chunk_size;
		}

		/* We have the data, now do the parity */
		switch (level) {
		case 4:
		case 5:
			disk = geo_map(-1, stripe, raid_disks, level, layout);
			/* the data chunks are every device other than 'disk' */
			for (i = 0; i < data_disks; i++)
				blocks[i] = stripes[(disk+1+i) % raid_disks];
			xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
			break;
		case 6:
			disk = geo_map(-1, stripe, raid_disks, level, layout);
			qdisk = geo_map(-2, stripe, raid_disks, level, layout);
			if (is_ddf(layout)) {
				/* q over 'raid_disks' blocks, in device order.
				 * 'p' and 'q' get to be all zero
				 */
				for (i = 0; i < raid_disks; i++)
					if (i == disk || i == qdisk)
						blocks[i] = zero;
					else
						blocks[i] = stripes[i];
				qsyndrome(stripes[disk], stripes[qdisk],
					  blocks, raid_disks, chunk_size);
			} else {
				/* for md, q is over 'data_disks' blocks,
				 * starting immediately after 'q'
				 */
				for (i = 0; i < data_disks; i++)
					blocks[i] = stripes[(qdisk+1+i) % raid_disks];
				qsyndrome(stripes[disk], stripes[qdisk], blocks,
					  data_disks, chunk_size);
			}
			break;
		}

		/* Write the finished stripe to every available device */
		for (i = 0; i < raid_disks; i++)
			if (dest[i] >= 0) {
				if (lseek64(dest[i], offsets[i]+offset, 0) < 0)
					goto out;
				if (write(dest[i], stripes[i], chunk_size) != chunk_size)
					goto out;
			}

		length -= len;
		start += len;
	}
	rv = 0;

out:
	/* free(NULL) is a no-op, so this is safe from every jump above */
	free(stripe_buf);
	free(stripes);
	free(blocks);
	free(zero);
	return rv;
}
#ifdef MAIN
/*
 * Read back stripes from the member devices and check that the stored
 * P and Q blocks match what qsyndrome() computes from the data blocks.
 * Only level 6 is checked; other levels are read but not verified.
 *
 * source/offsets: one fd and base offset per device.
 * start/length:   per-device byte range; one chunk is examined per step,
 *                 and start/chunk_size is used as the stripe number.
 *
 * Mismatches are reported on stdout and do not affect the return value.
 * Returns 0 when the range was processed, -1 on unusable geometry or a
 * seek/read failure, -2 when memory could not be allocated.
 */
int test_stripes(int *source, unsigned long long *offsets,
		 int raid_disks, int chunk_size, int level, int layout,
		 unsigned long long start, unsigned long long length)
{
	/* ready the data and p (and q) blocks, and check we got them right */
	char *stripe_buf = NULL;
	char **stripes = NULL;
	char **blocks = NULL;
	char *p = NULL;
	char *q = NULL;
	int i;
	int rv = -1;
	int data_disks = raid_disks - (level == 5 ? 1: 2);

	/* chunk_size is a divisor and an allocation size below */
	if (chunk_size <= 0 || raid_disks <= 0 || data_disks <= 0)
		goto out;

	stripe_buf = malloc(raid_disks * chunk_size);
	stripes = malloc(raid_disks * sizeof(char*));
	blocks = malloc(raid_disks * sizeof(char*));
	p = malloc(chunk_size);
	q = malloc(chunk_size);
	if (stripe_buf == NULL || stripes == NULL || blocks == NULL ||
	    p == NULL || q == NULL) {
		rv = -2;
		goto out;
	}

	for (i = 0; i < raid_disks; i++)
		stripes[i] = stripe_buf + i * chunk_size;

	while (length > 0) {
		int disk;

		/* Load this stripe's chunk from every device */
		for (i = 0; i < raid_disks; i++) {
			if (lseek64(source[i], offsets[i]+start, 0) < 0)
				goto out;
			if (read(source[i], stripes[i], chunk_size) != chunk_size)
				goto out;
		}
		/* Collect the data chunks in logical block order */
		for (i = 0; i < data_disks; i++) {
			int ddisk = geo_map(i, start/chunk_size, raid_disks,
					    level, layout);

			if (ddisk < 0)
				/* unknown level/layout: don't index stripes[-1] */
				goto out;
			blocks[i] = stripes[ddisk];
			printf("%d->%d\n", i, ddisk);
		}
		switch (level) {
		case 6:
			qsyndrome(p, q, blocks, data_disks, chunk_size);
			disk = geo_map(-1, start/chunk_size, raid_disks,
				       level, layout);
			if (memcmp(p, stripes[disk], chunk_size) != 0) {
				printf("P(%d) wrong at %llu\n", disk,
				       start / chunk_size);
			}
			disk = geo_map(-2, start/chunk_size, raid_disks,
				       level, layout);
			if (memcmp(q, stripes[disk], chunk_size) != 0) {
				printf("Q(%d) wrong at %llu\n", disk,
				       start / chunk_size);
			}
			break;
		}
		/* 'length' is unsigned: stop at zero instead of wrapping when
		 * it is not a multiple of chunk_size.
		 */
		if (length <= (unsigned long long)chunk_size)
			length = 0;
		else
			length -= chunk_size;
		start += chunk_size;
	}
	rv = 0;

out:
	free(stripe_buf);
	free(stripes);
	free(blocks);
	free(p);
	free(q);
	return rv;
}
/*
 * Parse 'str' as a base-10 unsigned number using strtoull().
 *
 * On success the parsed value is returned and '*err' is left untouched.
 * If no characters were consumed, or anything follows the number,
 * '*err' is set to 'str' and 0 is returned.
 */
unsigned long long getnum(char *str, char **err)
{
	char *end;
	unsigned long long value = strtoull(str, &end, 10);

	/* valid only if something was consumed and nothing trails it */
	if (end != str && *end == '\0')
		return value;

	*err = str;
	return 0;
}
/*
 * test_stripe command-line driver (only built when MAIN is defined).
 *
 *   test_stripe save|restore|test file raid_disks chunk_size level layout
 *               start length devices...
 *
 * 'save' copies array data from the devices into 'file', 'restore' writes
 * 'file' back to the devices with parity, 'test' checks stored P/Q blocks.
 *
 * Exit status: 0 on success, 1 on a usage error or when the selected
 * operation fails, 2 on a bad mode/number/device count, 3 when a file
 * cannot be opened or memory cannot be allocated.
 */
int main(int argc, char *argv[])
{
	/* save/restore/test file raid_disks chunk_size level layout start length devices...
	 */
	int save;
	int *fds;
	char *file;
	int storefd;
	unsigned long long *offsets;
	int raid_disks, chunk_size, level, layout;
	unsigned long long start, length;
	int i;
	int rv;
	char *err = NULL;

	if (argc < 10) {
		fprintf(stderr, "Usage: test_stripe save/restore/test file raid_disks"
			" chunk_size level layout start length devices...\n");
		exit(1);
	}
	if (strcmp(argv[1], "save") == 0)
		save = 1;
	else if (strcmp(argv[1], "restore") == 0)
		save = 0;
	else if (strcmp(argv[1], "test") == 0)
		save = 2;
	else {
		fprintf(stderr, "test_stripe: must give 'save', 'restore' or 'test'.\n");
		exit(2);
	}

	file = argv[2];
	raid_disks = getnum(argv[3], &err);
	chunk_size = getnum(argv[4], &err);
	level = getnum(argv[5], &err);
	layout = getnum(argv[6], &err);
	start = getnum(argv[7], &err);
	length = getnum(argv[8], &err);
	if (err) {
		fprintf(stderr, "test_stripe: Bad number: %s\n", err);
		exit(2);
	}
	/* Both are used as divisors and allocation sizes by the workers */
	if (raid_disks <= 0 || chunk_size <= 0) {
		fprintf(stderr, "test_stripe: raid_disks and chunk_size must be positive\n");
		exit(2);
	}
	if (argc != raid_disks + 9) {
		fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
			raid_disks, argc-9);
		exit(2);
	}

	fds = malloc(raid_disks * sizeof(*fds));
	/* calloc gives every device a zero base offset */
	offsets = calloc(raid_disks, sizeof(*offsets));
	if (fds == NULL || offsets == NULL) {
		fprintf(stderr, "test_stripe: out of memory\n");
		exit(3);
	}

	storefd = open(file, O_RDWR);
	if (storefd < 0) {
		perror(file);
		fprintf(stderr, "test_stripe: could not open %s.\n", file);
		exit(3);
	}
	for (i = 0; i < raid_disks; i++) {
		fds[i] = open(argv[9+i], O_RDWR);
		if (fds[i] < 0) {
			perror(argv[9+i]);
			fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
			exit(3);
		}
	}

	if (save == 1) {
		rv = save_stripes(fds, offsets,
				  raid_disks, chunk_size, level, layout,
				  1, &storefd,
				  start, length);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: save_stripes returned %d\n", rv);
			exit(1);
		}
	} else if (save == 2) {
		rv = test_stripes(fds, offsets,
				  raid_disks, chunk_size, level, layout,
				  start, length);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: test_stripes returned %d\n", rv);
			exit(1);
		}
	} else {
		rv = restore_stripes(fds, offsets,
				     raid_disks, chunk_size, level, layout,
				     storefd, 0ULL,
				     start, length);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: restore_stripes returned %d\n",
				rv);
			exit(1);
		}
	}
	exit(0);
}
#endif /* MAIN */