2006-03-13 06:51:32 +01:00
|
|
|
/*
|
|
|
|
* mdadm - manage Linux "md" devices aka RAID arrays.
|
|
|
|
*
|
2009-06-02 06:35:45 +02:00
|
|
|
* Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
|
2006-03-13 06:51:32 +01:00
|
|
|
*
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program; if not, write to the Free Software
|
|
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
*
|
|
|
|
* Author: Neil Brown
|
|
|
|
* Email: <neilb@suse.de>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "mdadm.h"
|
2009-07-14 07:12:30 +02:00
|
|
|
#include <stdint.h>
|
2006-03-13 06:51:32 +01:00
|
|
|
|
|
|
|
/* To restripe, we read from old geometry to a buffer, and
|
|
|
|
* write from buffer to new geometry.
|
2009-07-14 07:12:30 +02:00
|
|
|
* When reading, we might have missing devices and so could need
|
|
|
|
* to reconstruct.
|
|
|
|
* When writing, we need to create correct parity and Q.
|
2006-03-13 06:51:32 +01:00
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
RAID-6 check standalone
Hi Neil,
please find attached a patch, to mdadm-3.2 base, including
a standalone versione of the raid-6 check.
This is basically a re-working (and hopefully improvement)
of the already implemented check in "restripe.c".
I splitted the check function into "collect" and "stats",
so that the second one could be easily replaced.
The API is also simplified.
The command line option are reduced, since we only level
is raid-6, but the ":offset" option is included.
The output reports the block/stripe rotation, P/Q errors
and the possible HDD (or unknown).
BTW, the patch applies also to the already patched "restripe.c",
including the last ":offset" patch (which is not yet in git).
Other item is that due to "sysfs.c" linking (see below) the
"Makefile" needed some changes, I hope this is not a problem.
Next steps (TODO list you like) would be:
1) Add the "sysfs.c" code in order to retrieve the HDDs info
from the MD device. It is already linked, together with the
whole (mdadm) universe, since it seems it cannot leave alone.
I'll need some advice or hint on how to do use it. I checked
"sysfs.c", but before I dig deep into it maybe better to
have some advice (maybe just one function call will do it).
2) Add the suspend lo/hi control. Fellow John Robinson was
suggesting to look into "Grow.c", which I did, but I guess
the same story as 1) is valid: better to have some hint on
where to look before wasting time.
3) Add a repair option (future). This should have different
levels, like "all", "disk", "stripe". That is, fix everything
(more or less like "repair"), fix only if a disk is clearly
having problems, fix each stripe which has clearly a problem
(but maybe different stripes may belong to different HDDs).
So, for the point 1) and 2) would be nice to have some more
detail on where to look what. Point 3) we will discuss later.
Thanks, please consider for inclusion,
bye,
pg
Signed-off-by: NeilBrown <neilb@suse.de>
2011-03-21 03:52:44 +01:00
|
|
|
int geo_map(int block, unsigned long long stripe, int raid_disks,
|
2009-05-25 02:52:31 +02:00
|
|
|
int level, int layout)
|
2006-03-13 06:51:32 +01:00
|
|
|
{
|
2007-02-22 04:59:19 +01:00
|
|
|
/* On the given stripe, find which disk in the array will have
|
2006-03-13 06:51:32 +01:00
|
|
|
* block numbered 'block'.
|
2007-02-22 04:59:19 +01:00
|
|
|
* '-1' means the parity block.
|
|
|
|
* '-2' means the Q syndrome.
|
2006-03-13 06:51:32 +01:00
|
|
|
*/
|
|
|
|
int pd;
|
|
|
|
|
2010-12-03 05:03:25 +01:00
|
|
|
/* layout is not relevant for raid0 and raid4 */
|
|
|
|
if ((level == 0) ||
|
|
|
|
(level == 4))
|
|
|
|
layout = 0;
|
|
|
|
|
2006-03-13 06:51:32 +01:00
|
|
|
switch(level*100 + layout) {
|
|
|
|
case 000:
|
|
|
|
case 400:
|
2009-05-25 02:52:31 +02:00
|
|
|
case 500 + ALGORITHM_PARITY_N:
|
2006-03-13 06:51:32 +01:00
|
|
|
/* raid 4 isn't messed around by parity blocks */
|
|
|
|
if (block == -1)
|
|
|
|
return raid_disks-1; /* parity block */
|
|
|
|
return block;
|
|
|
|
case 500 + ALGORITHM_LEFT_ASYMMETRIC:
|
|
|
|
pd = (raid_disks-1) - stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
|
|
|
if (block >= pd)
|
|
|
|
block++;
|
|
|
|
return block;
|
|
|
|
|
|
|
|
case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
|
|
|
|
pd = stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
|
|
|
if (block >= pd)
|
|
|
|
block++;
|
|
|
|
return block;
|
|
|
|
|
|
|
|
case 500 + ALGORITHM_LEFT_SYMMETRIC:
|
|
|
|
pd = (raid_disks - 1) - stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
|
|
|
return (pd + 1 + block) % raid_disks;
|
|
|
|
|
|
|
|
case 500 + ALGORITHM_RIGHT_SYMMETRIC:
|
|
|
|
pd = stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
|
|
|
return (pd + 1 + block) % raid_disks;
|
|
|
|
|
2009-05-25 02:52:31 +02:00
|
|
|
case 500 + ALGORITHM_PARITY_0:
|
|
|
|
return block + 1;
|
|
|
|
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_PARITY_N_6:
|
|
|
|
if (block == -2)
|
|
|
|
return raid_disks - 1;
|
|
|
|
if (block == -1)
|
|
|
|
return raid_disks - 2; /* parity block */
|
|
|
|
return block;
|
|
|
|
case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
|
|
|
|
if (block == -2)
|
|
|
|
return raid_disks - 1;
|
|
|
|
raid_disks--;
|
|
|
|
pd = (raid_disks-1) - stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
|
|
|
if (block >= pd)
|
|
|
|
block++;
|
|
|
|
return block;
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
|
|
|
|
if (block == -2)
|
|
|
|
return raid_disks - 1;
|
|
|
|
raid_disks--;
|
|
|
|
pd = stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
|
|
|
if (block >= pd)
|
|
|
|
block++;
|
|
|
|
return block;
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
|
|
|
|
if (block == -2)
|
|
|
|
return raid_disks - 1;
|
|
|
|
raid_disks--;
|
|
|
|
pd = (raid_disks - 1) - stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
|
|
|
return (pd + 1 + block) % raid_disks;
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
|
|
|
|
if (block == -2)
|
|
|
|
return raid_disks - 1;
|
|
|
|
raid_disks--;
|
|
|
|
pd = stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
|
|
|
return (pd + 1 + block) % raid_disks;
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_PARITY_0_6:
|
|
|
|
if (block == -2)
|
|
|
|
return raid_disks - 1;
|
|
|
|
return block + 1;
|
|
|
|
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_PARITY_0:
|
|
|
|
if (block == -1)
|
|
|
|
return 0;
|
|
|
|
if (block == -2)
|
|
|
|
return 1;
|
|
|
|
return block + 2;
|
|
|
|
|
2006-03-13 06:51:32 +01:00
|
|
|
case 600 + ALGORITHM_LEFT_ASYMMETRIC:
|
|
|
|
pd = raid_disks - 1 - (stripe % raid_disks);
|
|
|
|
if (block == -1) return pd;
|
2007-02-22 04:59:19 +01:00
|
|
|
if (block == -2) return (pd+1) % raid_disks;
|
2006-03-13 06:51:32 +01:00
|
|
|
if (pd == raid_disks - 1)
|
|
|
|
return block+1;
|
|
|
|
if (block >= pd)
|
|
|
|
return block+2;
|
|
|
|
return block;
|
|
|
|
|
2009-05-25 02:52:31 +02:00
|
|
|
case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
|
|
|
|
/* Different order for calculating Q, otherwize same as ... */
|
2006-03-13 06:51:32 +01:00
|
|
|
case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
|
|
|
|
pd = stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
2007-02-22 04:59:19 +01:00
|
|
|
if (block == -2) return (pd+1) % raid_disks;
|
2006-03-13 06:51:32 +01:00
|
|
|
if (pd == raid_disks - 1)
|
|
|
|
return block+1;
|
|
|
|
if (block >= pd)
|
|
|
|
return block+2;
|
|
|
|
return block;
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_LEFT_SYMMETRIC:
|
|
|
|
pd = raid_disks - 1 - (stripe % raid_disks);
|
|
|
|
if (block == -1) return pd;
|
2007-02-22 04:59:19 +01:00
|
|
|
if (block == -2) return (pd+1) % raid_disks;
|
2006-03-13 06:51:32 +01:00
|
|
|
return (pd + 2 + block) % raid_disks;
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_RIGHT_SYMMETRIC:
|
|
|
|
pd = stripe % raid_disks;
|
|
|
|
if (block == -1) return pd;
|
2007-02-22 04:59:19 +01:00
|
|
|
if (block == -2) return (pd+1) % raid_disks;
|
2006-03-13 06:51:32 +01:00
|
|
|
return (pd + 2 + block) % raid_disks;
|
2009-05-25 02:52:31 +02:00
|
|
|
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_ROTATING_N_RESTART:
|
|
|
|
/* Same a left_asymmetric, by first stripe is
|
|
|
|
* D D D P Q rather than
|
|
|
|
* Q D D D P
|
|
|
|
*/
|
|
|
|
pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
|
|
|
|
if (block == -1) return pd;
|
|
|
|
if (block == -2) return (pd+1) % raid_disks;
|
|
|
|
if (pd == raid_disks - 1)
|
|
|
|
return block+1;
|
|
|
|
if (block >= pd)
|
|
|
|
return block+2;
|
|
|
|
return block;
|
|
|
|
|
|
|
|
case 600 + ALGORITHM_ROTATING_N_CONTINUE:
|
|
|
|
/* Same as left_symmetric but Q is before P */
|
|
|
|
pd = raid_disks - 1 - (stripe % raid_disks);
|
|
|
|
if (block == -1) return pd;
|
|
|
|
if (block == -2) return (pd+raid_disks-1) % raid_disks;
|
|
|
|
return (pd + 1 + block) % raid_disks;
|
2006-03-13 06:51:32 +01:00
|
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
2009-05-25 02:52:31 +02:00
|
|
|
static int is_ddf(int layout)
|
|
|
|
{
|
|
|
|
switch (layout)
|
|
|
|
{
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
case ALGORITHM_ROTATING_N_CONTINUE:
|
|
|
|
case ALGORITHM_ROTATING_N_RESTART:
|
|
|
|
case ALGORITHM_ROTATING_ZERO_RESTART:
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
2006-03-13 06:51:32 +01:00
|
|
|
|
|
|
|
|
|
|
|
/*
 * XOR 'disks' source blocks of 'size' bytes together into 'target'.
 *
 * Each output byte is written only after the matching byte of every
 * source has been read, so 'target' may be one of the sources.
 * With disks == 0 the target is filled with zeros.
 */
static void xor_blocks(char *target, char **sources, int disks, int size)
{
	int i, j;

	/* Amazingly inefficient... */
	for (i = 0; i < size; i++) {
		char c = 0;

		for (j = 0; j < disks; j++)
			c ^= sources[j][i];
		target[i] = c;
	}
}
|
|
|
|
|
RAID-6 check standalone
Hi Neil,
please find attached a patch, to mdadm-3.2 base, including
a standalone versione of the raid-6 check.
This is basically a re-working (and hopefully improvement)
of the already implemented check in "restripe.c".
I splitted the check function into "collect" and "stats",
so that the second one could be easily replaced.
The API is also simplified.
The command line option are reduced, since we only level
is raid-6, but the ":offset" option is included.
The output reports the block/stripe rotation, P/Q errors
and the possible HDD (or unknown).
BTW, the patch applies also to the already patched "restripe.c",
including the last ":offset" patch (which is not yet in git).
Other item is that due to "sysfs.c" linking (see below) the
"Makefile" needed some changes, I hope this is not a problem.
Next steps (TODO list you like) would be:
1) Add the "sysfs.c" code in order to retrieve the HDDs info
from the MD device. It is already linked, together with the
whole (mdadm) universe, since it seems it cannot leave alone.
I'll need some advice or hint on how to do use it. I checked
"sysfs.c", but before I dig deep into it maybe better to
have some advice (maybe just one function call will do it).
2) Add the suspend lo/hi control. Fellow John Robinson was
suggesting to look into "Grow.c", which I did, but I guess
the same story as 1) is valid: better to have some hint on
where to look before wasting time.
3) Add a repair option (future). This should have different
levels, like "all", "disk", "stripe". That is, fix everything
(more or less like "repair"), fix only if a disk is clearly
having problems, fix each stripe which has clearly a problem
(but maybe different stripes may belong to different HDDs).
So, for the point 1) and 2) would be nice to have some more
detail on where to look what. Point 3) we will discuss later.
Thanks, please consider for inclusion,
bye,
pg
Signed-off-by: NeilBrown <neilb@suse.de>
2011-03-21 03:52:44 +01:00
|
|
|
/*
 * Compute the RAID-6 P and Q blocks over a set of source blocks.
 *
 * p, q    : output buffers, 'size' bytes each.
 * sources : 'disks' pointers to blocks of 'size' bytes; sources[0]
 *           gets the lowest power of 2 in the Q sum.
 * disks   : number of source blocks.  With disks <= 0 both outputs
 *           are zero-filled.
 * size    : number of bytes per block.
 *
 * P is the plain XOR of the sources.  Q is built with Horner's rule:
 * walking from the last source to the first, the running value is
 * multiplied by 2 in GF(2^8) (reduction constant 0x1d) and the next
 * source byte is XORed in.
 */
void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
{
	int d, z;
	uint8_t wq0, wp0, wd0, w10, w20;

	for (d = 0; d < size; d++) {
		/*
		 * Starting from zero gives the same result as seeding
		 * with sources[disks-1][d] (2 * 0 == 0), without reading
		 * sources[-1] when there are no sources at all.
		 */
		wq0 = wp0 = 0;
		for (z = disks-1; z >= 0; z--) {
			wd0 = sources[z][d];
			wp0 ^= wd0;
			/* wq0 *= 2: shift, then reduce if the top bit fell off */
			w20 = (wq0 & 0x80) ? 0x1d : 0x00;
			w10 = (wq0 << 1) & 0xff;
			w10 ^= w20;
			wq0 = w10 ^ wd0;
		}
		p[d] = wp0;
		q[d] = wq0;
	}
}
|
|
|
|
|
2009-07-14 07:12:30 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The following was taken from linux/drivers/md/mktables.c, and modified
|
|
|
|
* to create in-memory tables rather than C code
|
|
|
|
*/
|
|
|
|
/*
 * Multiply a by b in GF(2^8), shift-and-add style: for every set bit
 * of b add (XOR) the matching multiple of a, doubling a each step and
 * reducing with 0x1d whenever its top bit is shifted out.
 */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t v = 0;

	while (b) {
		if (b & 1)
			v ^= a;
		/* the uint8_t assignment drops the bit shifted past bit 7 */
		a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
		b >>= 1;
	}

	return v;
}
|
|
|
|
|
|
|
|
/*
 * Raise a to the power b in GF(2^8) by square-and-multiply.
 * The exponent is first reduced modulo 255 into the range 0..254,
 * so negative exponents are accepted.
 */
static uint8_t gfpow(uint8_t a, int b)
{
	uint8_t v = 1;

	b %= 255;
	if (b < 0)
		b += 255;

	while (b) {
		if (b & 1)
			v = gfmul(v, a);
		a = gfmul(a, a);
		b >>= 1;
	}

	return v;
}
|
|
|
|
|
|
|
|
/* Set to 1 by make_tables() once all the tables below are filled in. */
int tables_ready = 0;
/* raid6_gfmul[a][b] : a * b in GF(2^8) */
uint8_t raid6_gfmul[256][256];
/* raid6_gfexp[i] : 2^i; entry 255 is 0 and is not a real entry */
uint8_t raid6_gfexp[256];
/* raid6_gfinv[x] : x^254, i.e. the multiplicative inverse of x */
uint8_t raid6_gfinv[256];
/* raid6_gfexi[i] : inverse of (2^i ^ 1) */
uint8_t raid6_gfexi[256];
/* raid6_gflog[x] : i such that 2^i == x; entry 0 is a placeholder 0 */
uint8_t raid6_gflog[256];
/* raid6_gfilog[i] : 2^i for i < 255; entry 255 is 0 */
uint8_t raid6_gfilog[256];
|
2009-07-14 07:12:30 +02:00
|
|
|
void make_tables(void)
|
|
|
|
{
|
|
|
|
int i, j;
|
|
|
|
uint8_t v;
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
uint32_t b, log;
|
2009-07-14 07:12:30 +02:00
|
|
|
|
|
|
|
/* Compute multiplication table */
|
|
|
|
for (i = 0; i < 256; i++)
|
|
|
|
for (j = 0; j < 256; j++)
|
|
|
|
raid6_gfmul[i][j] = gfmul(i, j);
|
|
|
|
|
|
|
|
/* Compute power-of-2 table (exponent) */
|
|
|
|
v = 1;
|
|
|
|
for (i = 0; i < 256; i++) {
|
|
|
|
raid6_gfexp[i] = v;
|
|
|
|
v = gfmul(v, 2);
|
|
|
|
if (v == 1)
|
|
|
|
v = 0; /* For entry 255, not a real entry */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Compute inverse table x^-1 == x^254 */
|
|
|
|
for (i = 0; i < 256; i++)
|
|
|
|
raid6_gfinv[i] = gfpow(i, 254);
|
|
|
|
|
|
|
|
/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
|
|
|
|
for (i = 0; i < 256; i ++)
|
|
|
|
raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1];
|
|
|
|
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
/* Compute log and inverse log */
|
|
|
|
/* Modified code from:
|
|
|
|
* http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
|
|
|
|
*/
|
|
|
|
b = 1;
|
|
|
|
raid6_gflog[0] = 0;
|
|
|
|
raid6_gfilog[255] = 0;
|
|
|
|
|
|
|
|
for (log = 0; log < 255; log++) {
|
|
|
|
raid6_gflog[b] = (uint8_t) log;
|
|
|
|
raid6_gfilog[log] = (uint8_t) b;
|
|
|
|
b = b << 1;
|
|
|
|
if (b & 256) b = b ^ 0435;
|
|
|
|
}
|
|
|
|
|
2009-07-14 07:12:30 +02:00
|
|
|
tables_ready = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Shared all-zero block, substituted for missing blocks during RAID-6
 * recovery.  save_stripes() allocates it and grows it when a larger
 * chunk size is requested; zero_size is its current size in bytes.
 */
uint8_t *zero;
int zero_size;
|
2009-07-14 07:12:30 +02:00
|
|
|
/* Following was taken from linux/drivers/md/raid6recov.c */
|
|
|
|
|
|
|
|
/* Recover two failed data blocks. */
|
|
|
|
void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
|
|
|
|
uint8_t **ptrs)
|
|
|
|
{
|
|
|
|
uint8_t *p, *q, *dp, *dq;
|
|
|
|
uint8_t px, qx, db;
|
|
|
|
const uint8_t *pbmul; /* P multiplier table for B data */
|
|
|
|
const uint8_t *qmul; /* Q multiplier table (for both) */
|
|
|
|
|
|
|
|
p = ptrs[disks-2];
|
|
|
|
q = ptrs[disks-1];
|
|
|
|
|
|
|
|
/* Compute syndrome with zero for the missing data pages
|
|
|
|
Use the dead data pages as temporary storage for
|
|
|
|
delta p and delta q */
|
|
|
|
dp = ptrs[faila];
|
|
|
|
ptrs[faila] = zero;
|
|
|
|
dq = ptrs[failb];
|
|
|
|
ptrs[failb] = zero;
|
|
|
|
|
|
|
|
qsyndrome(dp, dq, ptrs, disks-2, bytes);
|
|
|
|
|
|
|
|
/* Restore pointer table */
|
|
|
|
ptrs[faila] = dp;
|
|
|
|
ptrs[failb] = dq;
|
|
|
|
|
|
|
|
/* Now, pick the proper data tables */
|
|
|
|
pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
|
|
|
|
qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
|
|
|
|
|
|
|
|
/* Now do it... */
|
|
|
|
while ( bytes-- ) {
|
|
|
|
px = *p ^ *dp;
|
|
|
|
qx = qmul[*q ^ *dq];
|
|
|
|
*dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
|
|
|
|
*dp++ = db ^ px; /* Reconstructed A */
|
|
|
|
p++; q++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Recover failure of one data block plus the P block */
|
|
|
|
void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs)
|
|
|
|
{
|
|
|
|
uint8_t *p, *q, *dq;
|
|
|
|
const uint8_t *qmul; /* Q multiplier table */
|
|
|
|
|
|
|
|
p = ptrs[disks-2];
|
|
|
|
q = ptrs[disks-1];
|
|
|
|
|
|
|
|
/* Compute syndrome with zero for the missing data page
|
|
|
|
Use the dead data page as temporary storage for delta q */
|
|
|
|
dq = ptrs[faila];
|
|
|
|
ptrs[faila] = zero;
|
|
|
|
|
|
|
|
qsyndrome(p, dq, ptrs, disks-2, bytes);
|
|
|
|
|
|
|
|
/* Restore pointer table */
|
|
|
|
ptrs[faila] = dq;
|
|
|
|
|
|
|
|
/* Now, pick the proper data tables */
|
|
|
|
qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
|
|
|
|
|
|
|
|
/* Now do it... */
|
|
|
|
while ( bytes-- ) {
|
|
|
|
*p++ ^= *dq = qmul[*q ^ *dq];
|
|
|
|
q++; dq++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
/* Try to find out if a specific disk has a problem */
|
|
|
|
int raid6_check_disks(int data_disks, int start, int chunk_size,
|
|
|
|
int level, int layout, int diskP, int diskQ,
|
|
|
|
char *p, char *q, char **stripes)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int data_id, diskD;
|
|
|
|
uint8_t Px, Qx;
|
|
|
|
int curr_broken_disk = -1;
|
|
|
|
int prev_broken_disk = -1;
|
|
|
|
int broken_status = 0;
|
|
|
|
|
|
|
|
for(i = 0; i < chunk_size; i++) {
|
|
|
|
Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i];
|
|
|
|
Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i];
|
|
|
|
|
|
|
|
if((Px != 0) && (Qx == 0))
|
|
|
|
curr_broken_disk = diskP;
|
|
|
|
|
|
|
|
|
|
|
|
if((Px == 0) && (Qx != 0))
|
|
|
|
curr_broken_disk = diskQ;
|
|
|
|
|
|
|
|
|
|
|
|
if((Px != 0) && (Qx != 0)) {
|
2011-02-18 13:51:19 +01:00
|
|
|
data_id = (raid6_gflog[Qx] - raid6_gflog[Px]);
|
|
|
|
if(data_id < 0) data_id += 255;
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
diskD = geo_map(data_id, start/chunk_size,
|
|
|
|
data_disks + 2, level, layout);
|
|
|
|
curr_broken_disk = diskD;
|
|
|
|
}
|
|
|
|
|
|
|
|
if((Px == 0) && (Qx == 0))
|
|
|
|
curr_broken_disk = curr_broken_disk;
|
|
|
|
|
2011-02-18 13:51:19 +01:00
|
|
|
if(curr_broken_disk >= data_disks + 2)
|
|
|
|
broken_status = 2;
|
|
|
|
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
switch(broken_status) {
|
|
|
|
case 0:
|
|
|
|
if(curr_broken_disk != -1) {
|
|
|
|
prev_broken_disk = curr_broken_disk;
|
|
|
|
broken_status = 1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 1:
|
|
|
|
if(curr_broken_disk != prev_broken_disk)
|
|
|
|
broken_status = 2;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case 2:
|
|
|
|
default:
|
|
|
|
curr_broken_disk = prev_broken_disk = -2;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return curr_broken_disk;
|
|
|
|
}
|
|
|
|
|
2011-06-08 08:24:48 +02:00
|
|
|
/*******************************************************************************
|
|
|
|
* Function: save_stripes
|
|
|
|
* Description:
|
|
|
|
* Function reads data (only data without P and Q) from array and writes
|
|
|
|
* it to buf and optionally to backup files
|
|
|
|
* Parameters:
|
|
|
|
* source : A list of 'fds' of the active disks.
|
|
|
|
* Some may be absent
|
|
|
|
* offsets : A list of offsets on disk belonging
|
|
|
|
* to the array [bytes]
|
|
|
|
* raid_disks : geometry: number of disks in the array
|
|
|
|
* chunk_size : geometry: chunk size [bytes]
|
|
|
|
* level : geometry: RAID level
|
|
|
|
* layout : geometry: layout
|
|
|
|
* nwrites : number of backup files
|
|
|
|
* dest : A list of 'fds' for mirrored targets
|
|
|
|
* (e.g. backup files). They are already seeked to right
|
|
|
|
* (write) location. If NULL, data will be written
|
|
|
|
* to the buf only
|
|
|
|
* start : start address of data to read (must be stripe-aligned)
|
|
|
|
* [bytes]
|
|
|
|
* length - : length of data to read (must be stripe-aligned)
|
|
|
|
* [bytes]
|
|
|
|
* buf : buffer for data. It is large enough to hold
|
|
|
|
* one stripe. It is stripe aligned
|
|
|
|
* Returns:
|
|
|
|
* 0 : success
|
|
|
|
* -1 : fail
|
|
|
|
******************************************************************************/
|
2006-03-13 06:51:32 +01:00
|
|
|
int save_stripes(int *source, unsigned long long *offsets,
|
|
|
|
int raid_disks, int chunk_size, int level, int layout,
|
|
|
|
int nwrites, int *dest,
|
2009-07-14 07:12:30 +02:00
|
|
|
unsigned long long start, unsigned long long length,
|
|
|
|
char *buf)
|
2006-03-13 06:51:32 +01:00
|
|
|
{
|
|
|
|
int len;
|
|
|
|
int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
|
|
|
|
int disk;
|
2009-07-14 07:12:30 +02:00
|
|
|
int i;
|
2011-06-08 08:24:48 +02:00
|
|
|
unsigned long long length_test;
|
2006-03-13 06:51:32 +01:00
|
|
|
|
2009-07-14 07:12:30 +02:00
|
|
|
if (!tables_ready)
|
|
|
|
make_tables();
|
|
|
|
|
2011-04-05 13:43:52 +02:00
|
|
|
if (zero == NULL || chunk_size > zero_size) {
|
|
|
|
if (zero)
|
|
|
|
free(zero);
|
2009-07-14 07:12:30 +02:00
|
|
|
zero = malloc(chunk_size);
|
2011-04-05 13:43:52 +02:00
|
|
|
if (zero)
|
|
|
|
memset(zero, 0, chunk_size);
|
|
|
|
zero_size = chunk_size;
|
2009-07-14 07:12:30 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
len = data_disks * chunk_size;
|
2011-06-08 08:24:48 +02:00
|
|
|
length_test = length / len;
|
|
|
|
length_test *= len;
|
|
|
|
|
|
|
|
if (length != length_test) {
|
|
|
|
dprintf("Error: save_stripes(): Data are not alligned. EXIT\n");
|
|
|
|
dprintf("\tArea for saving stripes (length) = %llu\n", length);
|
|
|
|
dprintf("\tWork step (len) = %i\n", len);
|
|
|
|
dprintf("\tExpected save area (length_test) = %llu\n",
|
|
|
|
length_test);
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
|
2006-03-13 06:51:32 +01:00
|
|
|
while (length > 0) {
|
2009-07-14 07:12:30 +02:00
|
|
|
int failed = 0;
|
|
|
|
int fdisk[3], fblock[3];
|
|
|
|
for (disk = 0; disk < raid_disks ; disk++) {
|
|
|
|
unsigned long long offset;
|
|
|
|
int dnum;
|
|
|
|
|
|
|
|
offset = (start/chunk_size/data_disks)*chunk_size;
|
|
|
|
dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1,
|
|
|
|
start/chunk_size/data_disks,
|
|
|
|
raid_disks, level, layout);
|
2009-08-11 05:02:49 +02:00
|
|
|
if (dnum < 0) abort();
|
2009-07-14 07:12:30 +02:00
|
|
|
if (source[dnum] < 0 ||
|
2009-10-12 07:57:22 +02:00
|
|
|
lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 ||
|
2009-08-11 05:02:49 +02:00
|
|
|
read(source[dnum], buf+disk * chunk_size, chunk_size)
|
|
|
|
!= chunk_size)
|
2009-07-14 07:12:30 +02:00
|
|
|
if (failed <= 2) {
|
|
|
|
fdisk[failed] = dnum;
|
|
|
|
fblock[failed] = disk;
|
|
|
|
failed++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (failed == 0 || fblock[0] >= data_disks)
|
|
|
|
/* all data disks are good */
|
|
|
|
;
|
|
|
|
else if (failed == 1 || fblock[1] >= data_disks+1) {
|
|
|
|
/* one failed data disk and good parity */
|
|
|
|
char *bufs[data_disks];
|
|
|
|
for (i=0; i < data_disks; i++)
|
|
|
|
if (fblock[0] == i)
|
|
|
|
bufs[i] = buf + data_disks*chunk_size;
|
|
|
|
else
|
|
|
|
bufs[i] = buf + i*chunk_size;
|
|
|
|
|
|
|
|
xor_blocks(buf + fblock[0]*chunk_size,
|
|
|
|
bufs, data_disks, chunk_size);
|
|
|
|
} else if (failed > 2 || level != 6)
|
|
|
|
/* too much failure */
|
2006-03-13 06:51:32 +01:00
|
|
|
return -1;
|
2009-07-14 07:12:30 +02:00
|
|
|
else {
|
|
|
|
/* RAID6 computations needed. */
|
|
|
|
uint8_t *bufs[data_disks+4];
|
|
|
|
int qdisk;
|
|
|
|
int syndrome_disks;
|
|
|
|
disk = geo_map(-1, start/chunk_size/data_disks,
|
|
|
|
raid_disks, level, layout);
|
|
|
|
qdisk = geo_map(-2, start/chunk_size/data_disks,
|
|
|
|
raid_disks, level, layout);
|
|
|
|
if (is_ddf(layout)) {
|
|
|
|
/* q over 'raid_disks' blocks, in device order.
|
|
|
|
* 'p' and 'q' get to be all zero
|
|
|
|
*/
|
|
|
|
for (i = 0; i < raid_disks; i++)
|
2009-10-12 07:57:22 +02:00
|
|
|
bufs[i] = zero;
|
|
|
|
for (i = 0; i < data_disks; i++) {
|
|
|
|
int dnum = geo_map(i,
|
|
|
|
start/chunk_size/data_disks,
|
|
|
|
raid_disks, level, layout);
|
|
|
|
int snum;
|
|
|
|
/* i is the logical block number, so is index to 'buf'.
|
|
|
|
* dnum is physical disk number
|
|
|
|
* and thus the syndrome number.
|
|
|
|
*/
|
|
|
|
snum = dnum;
|
|
|
|
bufs[snum] = (uint8_t*)buf + chunk_size * i;
|
|
|
|
}
|
2009-07-14 07:12:30 +02:00
|
|
|
syndrome_disks = raid_disks;
|
|
|
|
} else {
|
|
|
|
/* for md, q is over 'data_disks' blocks,
|
|
|
|
* starting immediately after 'q'
|
2009-10-16 08:50:06 +02:00
|
|
|
* Note that for the '_6' variety, the p block
|
|
|
|
* makes a hole that we need to be careful of.
|
2009-07-14 07:12:30 +02:00
|
|
|
*/
|
2009-10-16 08:50:06 +02:00
|
|
|
int j;
|
|
|
|
int snum = 0;
|
|
|
|
for (j = 0; j < raid_disks; j++) {
|
|
|
|
int dnum = (qdisk + 1 + j) % raid_disks;
|
|
|
|
if (dnum == disk || dnum == qdisk)
|
|
|
|
continue;
|
|
|
|
for (i = 0; i < data_disks; i++)
|
|
|
|
if (geo_map(i,
|
|
|
|
start/chunk_size/data_disks,
|
|
|
|
raid_disks, level, layout) == dnum)
|
|
|
|
break;
|
2009-10-12 07:57:22 +02:00
|
|
|
/* i is the logical block number, so is index to 'buf'.
|
|
|
|
* dnum is physical disk number
|
|
|
|
* snum is syndrome disk for which 0 is immediately after Q
|
|
|
|
*/
|
|
|
|
bufs[snum] = (uint8_t*)buf + chunk_size * i;
|
2009-10-16 08:50:06 +02:00
|
|
|
|
|
|
|
if (fblock[0] == i)
|
|
|
|
fdisk[0] = snum;
|
|
|
|
if (fblock[1] == i)
|
|
|
|
fdisk[1] = snum;
|
|
|
|
snum++;
|
2009-10-12 07:57:22 +02:00
|
|
|
}
|
2009-07-14 07:12:30 +02:00
|
|
|
|
|
|
|
syndrome_disks = data_disks;
|
|
|
|
}
|
2009-10-12 07:57:22 +02:00
|
|
|
|
|
|
|
/* Place P and Q blocks at end of bufs */
|
|
|
|
bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks;
|
|
|
|
bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1);
|
|
|
|
|
2009-07-14 07:12:30 +02:00
|
|
|
if (fblock[1] == data_disks)
|
|
|
|
/* One data failed, and parity failed */
|
|
|
|
raid6_datap_recov(syndrome_disks+2, chunk_size,
|
|
|
|
fdisk[0], bufs);
|
2009-10-12 07:57:22 +02:00
|
|
|
else {
|
|
|
|
if (fdisk[0] > fdisk[1]) {
|
|
|
|
int t = fdisk[0];
|
|
|
|
fdisk[0] = fdisk[1];
|
|
|
|
fdisk[1] = t;
|
|
|
|
}
|
2009-07-14 07:12:30 +02:00
|
|
|
/* Two data blocks failed, P,Q OK */
|
|
|
|
raid6_2data_recov(syndrome_disks+2, chunk_size,
|
|
|
|
fdisk[0], fdisk[1], bufs);
|
2009-10-12 07:57:22 +02:00
|
|
|
}
|
2009-07-14 07:12:30 +02:00
|
|
|
}
|
2011-06-09 05:00:55 +02:00
|
|
|
if (dest) {
|
2011-06-08 08:24:48 +02:00
|
|
|
for (i = 0; i < nwrites; i++)
|
|
|
|
if (write(dest[i], buf, len) != len)
|
|
|
|
return -1;
|
2011-06-09 05:00:55 +02:00
|
|
|
} else {
|
|
|
|
/* build next stripe in buffer */
|
|
|
|
buf += len;
|
|
|
|
}
|
2006-03-13 06:51:32 +01:00
|
|
|
length -= len;
|
|
|
|
start += len;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Restore data:
|
|
|
|
* We are given:
|
|
|
|
* A list of 'fds' of the active disks. Some may be '-1' for not-available.
|
2006-03-20 04:17:31 +01:00
|
|
|
* A geometry: raid_disks, chunk_size, level, layout
|
2006-03-13 06:51:32 +01:00
|
|
|
* An 'fd' to read from. It is already seeked to the right (Read) location.
|
|
|
|
* A start and length.
|
|
|
|
* The length must be a multiple of the stripe size.
|
|
|
|
*
|
|
|
|
* We build a full stripe in memory and then write it out.
|
|
|
|
* We assume that there are enough working devices.
|
|
|
|
*/
|
|
|
|
int restore_stripes(int *dest, unsigned long long *offsets,
|
|
|
|
int raid_disks, int chunk_size, int level, int layout,
|
2006-03-20 04:17:31 +01:00
|
|
|
int source, unsigned long long read_offset,
|
2011-06-08 08:24:48 +02:00
|
|
|
unsigned long long start, unsigned long long length,
|
|
|
|
char *src_buf)
|
2006-03-13 06:51:32 +01:00
|
|
|
{
|
2009-08-13 03:12:54 +02:00
|
|
|
char *stripe_buf;
|
2006-03-13 06:51:32 +01:00
|
|
|
char **stripes = malloc(raid_disks * sizeof(char*));
|
|
|
|
char **blocks = malloc(raid_disks * sizeof(char*));
|
|
|
|
int i;
|
|
|
|
|
2009-07-14 07:12:30 +02:00
|
|
|
int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
|
2006-03-13 06:51:32 +01:00
|
|
|
|
2010-03-03 00:54:17 +01:00
|
|
|
if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size))
|
|
|
|
stripe_buf = NULL;
|
2011-04-05 13:43:52 +02:00
|
|
|
|
|
|
|
if (zero == NULL || chunk_size > zero_size) {
|
|
|
|
if (zero)
|
|
|
|
free(zero);
|
2009-07-14 07:12:30 +02:00
|
|
|
zero = malloc(chunk_size);
|
|
|
|
if (zero)
|
|
|
|
memset(zero, 0, chunk_size);
|
2011-04-05 13:43:52 +02:00
|
|
|
zero_size = chunk_size;
|
2009-07-14 07:12:30 +02:00
|
|
|
}
|
2011-04-05 13:43:52 +02:00
|
|
|
|
2009-05-25 02:52:31 +02:00
|
|
|
if (stripe_buf == NULL || stripes == NULL || blocks == NULL
|
|
|
|
|| zero == NULL) {
|
2006-03-13 06:51:32 +01:00
|
|
|
free(stripe_buf);
|
|
|
|
free(stripes);
|
|
|
|
free(blocks);
|
2009-05-25 02:52:31 +02:00
|
|
|
free(zero);
|
2006-03-13 06:51:32 +01:00
|
|
|
return -2;
|
|
|
|
}
|
2011-06-08 08:24:48 +02:00
|
|
|
for (i = 0; i < raid_disks; i++)
|
2006-03-13 06:51:32 +01:00
|
|
|
stripes[i] = stripe_buf + i * chunk_size;
|
|
|
|
while (length > 0) {
|
2010-08-05 05:13:02 +02:00
|
|
|
unsigned int len = data_disks * chunk_size;
|
2006-03-13 06:51:32 +01:00
|
|
|
unsigned long long offset;
|
2007-02-22 04:59:19 +01:00
|
|
|
int disk, qdisk;
|
2009-07-14 07:12:30 +02:00
|
|
|
int syndrome_disks;
|
2006-03-13 06:51:32 +01:00
|
|
|
if (length < len)
|
|
|
|
return -3;
|
2011-06-08 08:24:48 +02:00
|
|
|
for (i = 0; i < data_disks; i++) {
|
2006-03-13 06:51:32 +01:00
|
|
|
int disk = geo_map(i, start/chunk_size/data_disks,
|
|
|
|
raid_disks, level, layout);
|
2011-06-08 08:24:48 +02:00
|
|
|
if (src_buf == NULL) {
|
|
|
|
/* read from file */
|
|
|
|
if (lseek64(source,
|
|
|
|
read_offset, 0) != (off64_t)read_offset)
|
|
|
|
return -1;
|
|
|
|
if (read(source,
|
|
|
|
stripes[disk],
|
|
|
|
chunk_size) != chunk_size)
|
|
|
|
return -1;
|
|
|
|
} else {
|
|
|
|
/* read from input buffer */
|
|
|
|
memcpy(stripes[disk],
|
|
|
|
src_buf + read_offset,
|
|
|
|
chunk_size);
|
|
|
|
}
|
2006-03-20 04:17:31 +01:00
|
|
|
read_offset += chunk_size;
|
2006-03-13 06:51:32 +01:00
|
|
|
}
|
|
|
|
/* We have the data, now do the parity */
|
|
|
|
offset = (start/chunk_size/data_disks) * chunk_size;
|
2007-02-22 04:59:19 +01:00
|
|
|
switch (level) {
|
|
|
|
case 4:
|
|
|
|
case 5:
|
|
|
|
disk = geo_map(-1, start/chunk_size/data_disks,
|
2006-03-13 06:51:32 +01:00
|
|
|
raid_disks, level, layout);
|
2009-05-25 02:52:31 +02:00
|
|
|
for (i = 0; i < data_disks; i++)
|
|
|
|
blocks[i] = stripes[(disk+1+i) % raid_disks];
|
2006-03-13 06:51:32 +01:00
|
|
|
xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
|
2007-02-22 04:59:19 +01:00
|
|
|
break;
|
|
|
|
case 6:
|
|
|
|
disk = geo_map(-1, start/chunk_size/data_disks,
|
|
|
|
raid_disks, level, layout);
|
|
|
|
qdisk = geo_map(-2, start/chunk_size/data_disks,
|
|
|
|
raid_disks, level, layout);
|
2009-05-25 02:52:31 +02:00
|
|
|
if (is_ddf(layout)) {
|
|
|
|
/* q over 'raid_disks' blocks, in device order.
|
|
|
|
* 'p' and 'q' get to be all zero
|
|
|
|
*/
|
|
|
|
for (i = 0; i < raid_disks; i++)
|
|
|
|
if (i == disk || i == qdisk)
|
2009-07-14 07:12:30 +02:00
|
|
|
blocks[i] = (char*)zero;
|
2009-05-25 02:52:31 +02:00
|
|
|
else
|
|
|
|
blocks[i] = stripes[i];
|
2009-07-14 07:12:30 +02:00
|
|
|
syndrome_disks = raid_disks;
|
2009-05-25 02:52:31 +02:00
|
|
|
} else {
|
2009-07-14 07:12:30 +02:00
|
|
|
/* for md, q is over 'data_disks' blocks,
|
2009-05-25 02:52:31 +02:00
|
|
|
* starting immediately after 'q'
|
|
|
|
*/
|
|
|
|
for (i = 0; i < data_disks; i++)
|
|
|
|
blocks[i] = stripes[(qdisk+1+i) % raid_disks];
|
2007-02-22 04:59:19 +01:00
|
|
|
|
2009-07-14 07:12:30 +02:00
|
|
|
syndrome_disks = data_disks;
|
2009-05-25 02:52:31 +02:00
|
|
|
}
|
2009-07-14 07:12:30 +02:00
|
|
|
qsyndrome((uint8_t*)stripes[disk],
|
|
|
|
(uint8_t*)stripes[qdisk],
|
|
|
|
(uint8_t**)blocks,
|
|
|
|
syndrome_disks, chunk_size);
|
2007-02-22 04:59:19 +01:00
|
|
|
break;
|
2006-03-13 06:51:32 +01:00
|
|
|
}
|
|
|
|
for (i=0; i < raid_disks ; i++)
|
|
|
|
if (dest[i] >= 0) {
|
|
|
|
if (lseek64(dest[i], offsets[i]+offset, 0) < 0)
|
|
|
|
return -1;
|
|
|
|
if (write(dest[i], stripes[i], chunk_size) != chunk_size)
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
length -= len;
|
|
|
|
start += len;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef MAIN
|
|
|
|
|
2007-02-22 04:59:19 +01:00
|
|
|
int test_stripes(int *source, unsigned long long *offsets,
|
|
|
|
int raid_disks, int chunk_size, int level, int layout,
|
|
|
|
unsigned long long start, unsigned long long length)
|
|
|
|
{
|
|
|
|
/* ready the data and p (and q) blocks, and check we got them right */
|
|
|
|
char *stripe_buf = malloc(raid_disks * chunk_size);
|
|
|
|
char **stripes = malloc(raid_disks * sizeof(char*));
|
|
|
|
char **blocks = malloc(raid_disks * sizeof(char*));
|
|
|
|
char *p = malloc(chunk_size);
|
|
|
|
char *q = malloc(chunk_size);
|
|
|
|
|
|
|
|
int i;
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
int diskP, diskQ;
|
2007-02-22 04:59:19 +01:00
|
|
|
int data_disks = raid_disks - (level == 5 ? 1: 2);
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
|
|
|
|
if (!tables_ready)
|
|
|
|
make_tables();
|
|
|
|
|
2007-02-22 04:59:19 +01:00
|
|
|
for ( i = 0 ; i < raid_disks ; i++)
|
|
|
|
stripes[i] = stripe_buf + i * chunk_size;
|
|
|
|
|
|
|
|
while (length > 0) {
|
|
|
|
int disk;
|
|
|
|
|
|
|
|
for (i = 0 ; i < raid_disks ; i++) {
|
|
|
|
lseek64(source[i], offsets[i]+start, 0);
|
|
|
|
read(source[i], stripes[i], chunk_size);
|
|
|
|
}
|
|
|
|
for (i = 0 ; i < data_disks ; i++) {
|
|
|
|
int disk = geo_map(i, start/chunk_size, raid_disks,
|
|
|
|
level, layout);
|
|
|
|
blocks[i] = stripes[disk];
|
|
|
|
printf("%d->%d\n", i, disk);
|
|
|
|
}
|
|
|
|
switch(level) {
|
|
|
|
case 6:
|
2009-10-12 08:00:23 +02:00
|
|
|
qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
diskP = geo_map(-1, start/chunk_size, raid_disks,
|
2007-02-22 04:59:19 +01:00
|
|
|
level, layout);
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
if (memcmp(p, stripes[diskP], chunk_size) != 0) {
|
|
|
|
printf("P(%d) wrong at %llu\n", diskP,
|
2007-02-22 04:59:19 +01:00
|
|
|
start / chunk_size);
|
|
|
|
}
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
diskQ = geo_map(-2, start/chunk_size, raid_disks,
|
2007-02-22 04:59:19 +01:00
|
|
|
level, layout);
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
|
|
|
|
printf("Q(%d) wrong at %llu\n", diskQ,
|
2007-02-22 04:59:19 +01:00
|
|
|
start / chunk_size);
|
|
|
|
}
|
User space RAID-6 access
> test_stripe assumes that the data starts at the start of each device.
> AS you are using 1.2 metadata (the default), data starts about 1M in to
> the device (I think - you can check with --examine)
>
> You could fix test_stripe to put the right value in the 'offsets' array,
> or you could create the array with 1.0 or 0.90 metadata.
Hi Neil,
thanks for the info, maybe this should be a second patch.
In the meantime, please find attached a patch to restripe.c
of mdadm 3.2 (latest, I hope).
This should add the functionality to detect, in RAID-6,
which of the disks potentially has problems, in case of
parity errors.
Some checks take place in order to avoid false positives,
I hope these are correct and enough.
I'm not 100% happy of the interface (too much redundancy),
but for the time being it could be OK.
Of course, any improvement is welcome.
Please consider to include these changes to the next mdadm
whatever release.
bye,
Signed-off-by: NeilBrown <neilb@suse.de>
2011-02-08 01:44:23 +01:00
|
|
|
disk = raid6_check_disks(data_disks, start, chunk_size,
|
|
|
|
level, layout, diskP, diskQ,
|
|
|
|
p, q, stripes);
|
|
|
|
if(disk >= 0) {
|
|
|
|
printf("Possible failed disk: %d\n", disk);
|
|
|
|
}
|
|
|
|
if(disk == -2) {
|
|
|
|
printf("Failure detected, but disk unknown\n");
|
|
|
|
}
|
2007-02-22 04:59:19 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
length -= chunk_size;
|
|
|
|
start += chunk_size;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-03-13 06:51:32 +01:00
|
|
|
/* Parse 'str' as an unsigned decimal number.
 * The whole string must be consumed by strtoull(); otherwise *err is
 * set to 'str' and 0 is returned.  *err is left untouched on success,
 * so one 'err' can be shared across several calls and tested once.
 */
unsigned long long getnum(char *str, char **err)
{
	char *end;
	unsigned long long value = strtoull(str, &end, 10);

	if (end != str && *end == '\0')
		return value;

	*err = str;
	return 0;
}
|
|
|
|
|
|
|
|
/* Stand-alone driver (built only when MAIN is defined):
 *
 *   test_stripe save|restore|test file raid_disks chunk_size level
 *               layout start length devices...
 *
 * Each device may be written as "path:N"; N is multiplied by 512 and
 * used as that device's data offset.
 *
 * Exit status: 0 on success, 1 for a usage error or when the selected
 * operation reports failure, 2 for bad arguments, 3 when a file cannot
 * be opened or memory cannot be allocated.
 */
int main(int argc, char *argv[])
{
	/* save/restore/test file raid_disks chunk_size level layout start length devices...
	 */
	int save;
	int *fds;
	char *file;
	char *buf;
	int storefd;
	unsigned long long *offsets;
	int raid_disks, chunk_size, level, layout;
	unsigned long long start, length;
	int i;

	char *err = NULL;
	if (argc < 10) {
		fprintf(stderr, "Usage: test_stripe save/restore/test file raid_disks"
			" chunk_size level layout start length devices...\n");
		exit(1);
	}
	if (strcmp(argv[1], "save") == 0)
		save = 1;
	else if (strcmp(argv[1], "restore") == 0)
		save = 0;
	else if (strcmp(argv[1], "test") == 0)
		save = 2;
	else {
		fprintf(stderr, "test_stripe: must give 'save', 'restore' or 'test'.\n");
		exit(2);
	}

	file = argv[2];
	raid_disks = getnum(argv[3], &err);
	chunk_size = getnum(argv[4], &err);
	level = getnum(argv[5], &err);
	layout = getnum(argv[6], &err);
	start = getnum(argv[7], &err);
	length = getnum(argv[8], &err);
	if (err) {
		fprintf(stderr, "test_stripe: Bad number: %s\n", err);
		exit(2);
	}
	if (argc != raid_disks + 9) {
		fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
			raid_disks, argc-9);
		exit(2);
	}
	/* chunk_size is used as a divisor by the stripe routines */
	if (chunk_size <= 0) {
		fprintf(stderr, "test_stripe: chunk_size must be positive\n");
		exit(2);
	}
	fds = malloc(raid_disks * sizeof(*fds));
	/* calloc gives every device a default offset of 0 */
	offsets = calloc(raid_disks, sizeof(*offsets));
	if (fds == NULL || offsets == NULL) {
		fprintf(stderr, "test_stripe: out of memory\n");
		exit(3);
	}

	storefd = open(file, O_RDWR);
	if (storefd < 0) {
		perror(file);
		fprintf(stderr, "test_stripe: could not open %s.\n", file);
		exit(3);
	}
	for (i = 0; i < raid_disks; i++) {
		char *p;
		p = strchr(argv[9+i], ':');

		if (p != NULL) {
			/* split "path:N" in place; N is scaled by 512 */
			*p++ = '\0';
			offsets[i] = getnum(p, &err) * 512;
			if (err) {
				fprintf(stderr, "test_stripe: Bad number: %s\n", err);
				exit(2);
			}
		}

		fds[i] = open(argv[9+i], O_RDWR);
		if (fds[i] < 0) {
			perror(argv[9+i]);
			fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
			exit(3);
		}
	}

	buf = malloc((size_t)raid_disks * chunk_size);
	if (buf == NULL) {
		fprintf(stderr, "test_stripe: out of memory\n");
		exit(3);
	}

	if (save == 1) {
		int rv = save_stripes(fds, offsets,
				      raid_disks, chunk_size, level, layout,
				      1, &storefd,
				      start, length, buf);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: save_stripes returned %d\n", rv);
			exit(1);
		}
	} else if (save == 2) {
		int rv = test_stripes(fds, offsets,
				      raid_disks, chunk_size, level, layout,
				      start, length);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: test_stripes returned %d\n", rv);
			exit(1);
		}
	} else {
		int rv = restore_stripes(fds, offsets,
					 raid_disks, chunk_size, level, layout,
					 storefd, 0ULL,
					 start, length, NULL);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: restore_stripes returned %d\n",
				rv);
			exit(1);
		}
	}
	exit(0);
}
|
|
|
|
|
|
|
|
#endif /* MAIN */
|