Commit bbb21bb

Author: DocMAX
scrub: improve stability during device disconnections
1 parent a522c59 commit bbb21bb

File tree: 1 file changed (+81, -4 lines)


cmds/scrub.c

Lines changed: 81 additions & 4 deletions
@@ -816,7 +816,18 @@ static struct scrub_progress *scrub_resumed_stats(struct scrub_progress *data,
 	_SCRUB_SUM(dest, data, malloc_errors);
 	_SCRUB_SUM(dest, data, uncorrectable_errors);
 	_SCRUB_SUM(dest, data, corrected_errors);
-	_SCRUB_COPY(dest, data, last_physical);
+
+	/*
+	 * Preserve the maximum last_physical position from resumed or current data.
+	 * This handles the case where last_physical was reset to 0 due to device
+	 * disconnection but we still want to resume from the highest position
+	 * we actually reached.
+	 */
+	if (data->resumed->p.last_physical > data->scrub_args.progress.last_physical)
+		dest->scrub_args.progress.last_physical = data->resumed->p.last_physical;
+	else
+		dest->scrub_args.progress.last_physical = data->scrub_args.progress.last_physical;
+
 	dest->stats.canceled = data->stats.canceled;
 	dest->stats.finished = data->stats.finished;
 	dest->stats.t_resumed = data->stats.t_start;
@@ -968,10 +979,23 @@ static void *scrub_one_dev(void *ctx)
 	sp->stats.duration = tv.tv_sec - sp->stats.t_start;
 	sp->stats.canceled = !!ret;
 	sp->ioctl_errno = errno;
+
+	/*
+	 * For device disconnection errors, preserve the progress by marking
+	 * as interrupted rather than canceled, to allow resume to continue
+	 * from the last position
+	 */
+	if (ret && (errno == ENODEV || errno == ENOTCONN || errno == EIO)) {
+		sp->stats.canceled = 0;
+		sp->stats.finished = 0; /* Mark as interrupted for resume */
+	} else {
+		sp->stats.canceled = !!ret;
+		sp->stats.finished = 1;
+	}
+
 	ret = pthread_mutex_lock(&sp->progress_mutex);
 	if (ret)
 		return ERR_PTR(-ret);
-	sp->stats.finished = 1;
 	ret = pthread_mutex_unlock(&sp->progress_mutex);
 	if (ret)
 		return ERR_PTR(-ret);
@@ -1051,12 +1075,26 @@ static void *scrub_progress_cycle(void *ctx)
 		gettimeofday(&tv, NULL);
 		this = (this + 1)%2;
 		last = (last + 1)%2;
+
 		for (i = 0; i < ndev; ++i) {
 			sp = &spc->progress[this * ndev + i];
 			sp_last = &spc->progress[last * ndev + i];
 			sp_shared = &spc->shared_progress[i];
+
 			if (sp->stats.finished)
 				continue;
+
+			/*
+			 * For devices with recent connection issues, try to
+			 * reconnect by retrying the progress ioctl a few times
+			 * in case the device comes back online
+			 */
+			int retry_count = 0;
+			if (sp_last->ioctl_errno == ENODEV || sp_last->ioctl_errno == ENOTCONN) {
+				retry_count = 3;
+			}
+
+retry_progress:
 			progress_one_dev(sp);
 			sp->stats.duration = tv.tv_sec - sp->stats.t_start;
 			if (!sp->ret)
@@ -1066,11 +1104,27 @@ static void *scrub_progress_cycle(void *ctx)
 				ret = -sp->ioctl_errno;
 				goto out;
 			}
+
+			/*
+			 * If device is temporarily unavailable and we have retries left,
+			 * wait a moment and try again
+			 */
+			if (retry_count > 0 && (sp->ioctl_errno == ENODEV || sp->ioctl_errno == ENOTCONN)) {
+				struct timespec sleep_time = {0, 500000000}; /* 0.5 seconds */
+				nanosleep(&sleep_time, NULL);
+				retry_count--;
+				goto retry_progress;
+			}
+
 			/*
 			 * scrub finished or device removed, check the
 			 * finished flag. if unset, just use the last
 			 * result we got for the current write and go
 			 * on. flag should be set on next cycle, then.
+			 *
+			 * For device removal (ENODEV), preserve the last_physical
+			 * position in case this was caused by a temporary
+			 * disconnection like USB hub reset.
 			 */
 			perr = pthread_setcancelstate(
 					PTHREAD_CANCEL_DISABLE, &old);
@@ -1080,6 +1134,13 @@
 			if (perr)
 				goto out;
 			if (!sp_shared->stats.finished) {
+				/*
+				 * Preserve the last_physical position to avoid
+				 * losing progress on temporary disconnections
+				 */
+				if (sp->ioctl_errno == ENODEV && sp_last->scrub_args.progress.last_physical > 0) {
+					sp_shared->scrub_args.progress.last_physical = sp_last->scrub_args.progress.last_physical;
+				}
 				perr = pthread_mutex_unlock(
 						&sp_shared->progress_mutex);
 				if (perr)
@@ -1120,8 +1181,24 @@
 		}
 		if (!spc->do_record)
 			continue;
-		ret = scrub_write_progress(spc->write_mutex, fsid,
-				&spc->progress[this * ndev], ndev);
+
+		/*
+		 * Force progress saving more frequently if we have device issues
+		 * to prevent data loss during temporary disconnections
+		 */
+		int force_write = 0;
+		for (i = 0; i < ndev; ++i) {
+			struct scrub_progress *sp_check = &spc->progress[this * ndev + i];
+			if (sp_check->ioctl_errno == ENODEV || sp_check->ioctl_errno == ENOTCONN) {
+				force_write = 1;
+				break;
+			}
+		}
+
+		if (force_write || (tv.tv_sec % 30) == 0) {
+			ret = scrub_write_progress(spc->write_mutex, fsid,
+					&spc->progress[this * ndev], ndev);
+		}
 		if (ret)
 			goto out;
 	}
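
The heart of the change is the retry around the scrub-progress ioctl when a device briefly drops off the bus. Below is a minimal, self-contained sketch of that idea; it is not part of the commit. It assumes the caller has opened a file descriptor on the mounted filesystem and filled in sa->devid, and it retries BTRFS_IOC_SCRUB_PROGRESS up to three times with the same 0.5 s pause the patch uses.

#include <errno.h>
#include <time.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Retry the scrub-progress ioctl a few times if the device looks disconnected. */
static int poll_scrub_progress(int fs_fd, struct btrfs_ioctl_scrub_args *sa)
{
	int retries = 3;	/* same retry budget as the patch */

	for (;;) {
		if (ioctl(fs_fd, BTRFS_IOC_SCRUB_PROGRESS, sa) == 0)
			return 0;
		if (retries-- <= 0 || (errno != ENODEV && errno != ENOTCONN))
			return -errno;
		/* give a re-enumerating device (e.g. after a USB hub reset) ~0.5 s */
		struct timespec half_sec = { 0, 500000000 };
		nanosleep(&half_sec, NULL);
	}
}

Such a helper would replace a bare ioctl(2) call wherever a transient ENODEV or ENOTCONN from a re-enumerating device should not abort the progress cycle.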

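The first hunk replaces the unconditional _SCRUB_COPY of last_physical with a max-preserving merge of the resumed and current records. As an illustration only (the helper name below is hypothetical, not from the commit), the policy boils down to:

#include <linux/btrfs.h>

/*
 * Keep the larger resume position so a counter that was reset to 0 by a
 * disconnection cannot move the resume point backwards.
 */
static __u64 merged_last_physical(const struct btrfs_scrub_progress *resumed,
				  const struct btrfs_scrub_progress *current)
{
	return resumed->last_physical > current->last_physical ?
	       resumed->last_physical : current->last_physical;
}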