diff --git a/Create.c b/Create.c
index 420b9136..2ce50ff3 100644
--- a/Create.c
+++ b/Create.c
@@ -48,6 +48,29 @@ static int round_size_and_verify(unsigned long long *size, int chunk)
 	return 0;
 }
 
+/**
+ * check_logical_block_size_compatibility() - warn that the array may not work on older kernels.
+ * @info: array info handed to the sysfs helpers.
+ *
+ * Prints a notice when the "logical_block_size" sysfs attribute is available
+ * and the "metadata_version" attribute starts with "1.".
+ */
+static void check_logical_block_size_compatibility(struct mdinfo *info)
+{
+	char buf[10] = {0};
+
+	/*
+	 * The logical block size feature is only used with metadata 1.x,
+	 * so the metadata version also needs to be checked.
+	 */
+	if (sysfs_attribute_available(info, NULL, "logical_block_size") &&
+	    sysfs_get_str(info, NULL, "metadata_version", buf, sizeof(buf)) > 0 &&
+	    !memcmp(buf, "1.", 2)) {
+		pr_info("Arrays created by newer kernels have the logical block size feature enabled "
+			"by default. Such arrays can only be used on kernel versions later than 6.18\n");
+	}
+}
+
 /**
  * default_layout() - Get default layout for level.
  * @st: metadata requested, could be NULL.
@@ -1281,6 +1304,14 @@ int Create(struct supertype *st, struct mddev_ident *ident, int subdevs,
 				goto abort;
 			}
 		} else {
+			if (s->logical_block_size &&
+			    sysfs_set_num(&info, NULL, "logical_block_size",
+					  s->logical_block_size)) {
+				pr_err("Failed to set logical_block_size %u\n",
+				       s->logical_block_size);
+				goto abort;
+			}
+
 			/* param is not actually used */
 			mdu_param_t param;
 			if (ioctl(mdfd, RUN_ARRAY, &param)) {
@@ -1295,6 +1326,14 @@ int Create(struct supertype *st, struct mddev_ident *ident, int subdevs,
 				ioctl(mdfd, STOP_ARRAY, NULL);
 				goto abort;
 			}
+
+			/* After the kernel started supporting the logical block size feature,
+			 * it modified the array metadata, which may lead to compatibility issues.
+			 * As a result, arrays created on newer kernels cannot be used on older
+			 * kernel versions, so users need to be warned.
+			 */
+			check_logical_block_size_compatibility(&info);
+
 			/* if start_ro module parameter is set, array is
 			 * auto-read-only, which is bad as the resync won't
 			 * start. So lets make it read-write now.
diff --git a/ReadMe.c b/ReadMe.c
index c2415c26..137a80c9 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -149,6 +149,7 @@ struct option long_options[] = {
     {"home-cluster", 1, 0, ClusterName},
     {"write-journal", 1, 0, WriteJournal},
     {"consistency-policy", 1, 0, 'k'},
+    {"logical-block-size", 1, 0, LogicalBlockSize},
 
     /* For assemble */
     {"uuid", 1, 0, 'u'},
@@ -331,6 +332,7 @@ char Help_create[] =
 " --consistency-policy= : Specify the policy that determines how the array\n"
 " -k : maintains consistency in case of unexpected shutdown.\n"
 " --write-zeroes : Write zeroes to the disks before creating. This will bypass initial sync.\n"
+" --logical-block-size= : Set the logical block size (in Byte) for the RAID.\n"
 "\n"
 ;
 
diff --git a/mdadm.8.in b/mdadm.8.in
index 2a71e322..448ff2c9 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -989,6 +989,22 @@ Can be used with \-\-grow to change the consistency policy of an active
 array in some cases. See CONSISTENCY POLICY CHANGES below.
 .RE
 
+.TP
+.BR \-\-logical\-block\-size=
+Set the logical block size (LBS) of the array, in bytes. This option can only
+be used in Create mode with metadata version 1.x. If the member disks have
+mixed LBS values (512 and 4K) at creation time, the array defaults to an LBS
+of 4K. However, if the 4K disk is removed and the system is rebooted, the
+array's LBS falls back to 512. If the array's LBS is 512, a disk with a 4K
+LBS cannot be added to the array.
+
+In general, choosing a large LBS for the array is beneficial, but it can
+introduce write amplification: the larger the array's LBS, the larger the
+write size to each member disk. Users should therefore evaluate their actual
+workload and choose an appropriate LBS accordingly.
+
+Set this option according to your own usage scenario, choosing the LBS value
+that best matches your needs.
 
.SH For assemble: diff --git a/mdadm.c b/mdadm.c index 14649a40..c3773292 100644 --- a/mdadm.c +++ b/mdadm.c @@ -30,6 +30,7 @@ #include "xmalloc.h" #include +#include /** * set_bitmap_value() - set bitmap value. @@ -76,6 +77,33 @@ static mdadm_status_t set_bitmap_value(struct shape *s, struct context *c, char return MDADM_STATUS_ERROR; } +/* + * Logical block size settings only support metadata 1.x. + */ +static mdadm_status_t shape_set_logical_block_size(struct shape *s, char *optarg) +{ + char *end; + unsigned long size = strtoul(optarg, &end, 10); + + if (end != optarg + strlen(optarg)) { + pr_err("logical block size [%s] can't be converted to an integer\n", optarg); + return MDADM_STATUS_ERROR; + } else if (errno == ERANGE) { + pr_err("logical block size [%s] more than ULONG_MAX\n", optarg); + return MDADM_STATUS_ERROR; + } + + /* Here only perform a simple check, while detailed check will be handled in kernel */ + if (size == 0 || size > UINT_MAX) { + pr_err("The range of logical-block-size is (0, %u], current is %lu\n", + UINT_MAX, size); + return MDADM_STATUS_ERROR; + } + + s->logical_block_size = size; + return MDADM_STATUS_SUCCESS; +} + static int scan_assemble(struct supertype *ss, struct context *c, struct mddev_ident *ident); @@ -116,6 +144,7 @@ int main(int argc, char *argv[]) .consistency_policy = CONSISTENCY_POLICY_UNKNOWN, .data_offset = INVALID_SECTORS, .btype = BitmapUnknown, + .logical_block_size = 0, }; char sys_hostname[256]; @@ -1185,6 +1214,10 @@ int main(int argc, char *argv[]) exit(2); } continue; + case O(CREATE, LogicalBlockSize): + if (shape_set_logical_block_size(&s, optarg) != MDADM_STATUS_SUCCESS) + exit(2); + continue; } /* We have now processed all the valid options. Anything else is * an error @@ -1200,6 +1233,18 @@ int main(int argc, char *argv[]) } + /* When metadata is not specified using the -e option, + * metadata version is 1.2 by default. So the logical + * block size can be configured. 
+ * When using the -e option, need to check if the + * metadata version is 1.x. + */ + if (s.logical_block_size && ss && strcmp(ss->ss->name, "1.x")){ + pr_err("The logical block size is only supported for metadata 1.x.\n"); + pr_err("Current metadata version is %s\n", ss->ss->name); + exit(2); + } + if (print_help) { char *help_text; if (print_help == 2) diff --git a/mdadm.h b/mdadm.h index 84bd2c91..13b4821e 100644 --- a/mdadm.h +++ b/mdadm.h @@ -496,6 +496,7 @@ enum special_options { ClusterConfirm, WriteJournal, ConsistencyPolicy, + LogicalBlockSize, }; enum update_opt { @@ -657,6 +658,7 @@ struct shape { unsigned long long data_offset; int consistency_policy; change_dir_t direction; + unsigned int logical_block_size; }; /* List of device names - wildcards expanded */