From 1490ff235388daf85aba13f0e2b9d402fc5493c8 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Fri, 8 Jun 2018 11:32:58 +0100 Subject: [PATCH 01/60] #428: make host port customisable through args to cluster init --- cmd/dm/pkg/commands/cluster.go | 6 ++++++ cmd/dotmesh-server/require_zfs.sh | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cmd/dm/pkg/commands/cluster.go b/cmd/dm/pkg/commands/cluster.go index 569e658fa..e7c4cd308 100644 --- a/cmd/dm/pkg/commands/cluster.go +++ b/cmd/dm/pkg/commands/cluster.go @@ -67,6 +67,7 @@ var ( usePoolDir string usePoolName string discoveryUrl string + port int ) // names of environment variables we pass from the content of `dm cluster {init,join}` @@ -250,6 +251,10 @@ func NewCmdClusterInit(out io.Writer) *cobra.Command { &serverCount, "count", 1, "Initial cluster size", ) + cmd.Flags().IntVar( + &port, "port", 0, + "Port to run cluster on" + ) return cmd } @@ -662,6 +667,7 @@ func startDotmeshContainer(pkiPath string) error { "-e", fmt.Sprintf("DOTMESH_DOCKER_IMAGE=%s", dotmeshDockerImage), "-e", fmt.Sprintf("DOTMESH_UPGRADES_URL=%s", checkpointUrl), "-e", fmt.Sprintf("DOTMESH_UPGRADES_INTERVAL_SECONDS=%d", checkpointInterval), + "-e", fmt.Sprintf("DOTMESH_SERVER_PORT=%d", port), } // inject the inherited env variables from the context of the dm binary into require_zfs.sh diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 4e095750f..8f923a359 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -206,8 +206,8 @@ pki_volume_mount="" if [ "$PKI_PATH" != "" ]; then pki_volume_mount="-v $PKI_PATH:/pki" fi - -net="-p 32607:32607 -p 32608:32608" +PORT=${DOTMESH_SERVER_PORT:-32607} +net="-p ${PORT}:32607 -p 32608:32608" link="" # this setting means we have set DOTMESH_ETCD_ENDPOINT to a known working From 94e4ddbaeb209d317e58d44deefe59ade4a952a7 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Fri, 8 Jun 2018 16:19:39 +0100 Subject: [PATCH 02/60] #428: corrected default if port not supplied --- cmd/dm/pkg/commands/cluster.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/dm/pkg/commands/cluster.go b/cmd/dm/pkg/commands/cluster.go index e7c4cd308..720549453 100644 --- a/cmd/dm/pkg/commands/cluster.go +++ b/cmd/dm/pkg/commands/cluster.go @@ -252,7 +252,7 @@ func NewCmdClusterInit(out io.Writer) *cobra.Command { "Initial cluster size", ) cmd.Flags().IntVar( - &port, "port", 0, + &port, "port", 32607, "Port to run cluster on" ) return cmd From 3dd7f00d70e48d55c02323cfb137a3a46f1ed089 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Fri, 8 Jun 2018 16:36:36 +0100 Subject: [PATCH 03/60] #428: fix syntax error --- cmd/dm/pkg/commands/cluster.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/dm/pkg/commands/cluster.go b/cmd/dm/pkg/commands/cluster.go index 720549453..e48cc78ba 100644 --- a/cmd/dm/pkg/commands/cluster.go +++ b/cmd/dm/pkg/commands/cluster.go @@ -253,7 +253,7 @@ func NewCmdClusterInit(out io.Writer) *cobra.Command { ) cmd.Flags().IntVar( &port, "port", 32607, - "Port to run cluster on" + "Port to run cluster on", ) return cmd } From 653df690a27843a633ebd39432adc5db54d7b29f Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Fri, 8 Jun 2018 18:27:44 +0100 Subject: [PATCH 04/60] #248: put back default + cut out hardcoding of port 0 --- cmd/dm/pkg/commands/cluster.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmd/dm/pkg/commands/cluster.go b/cmd/dm/pkg/commands/cluster.go index e48cc78ba..24c4c4891 100644 --- a/cmd/dm/pkg/commands/cluster.go +++ b/cmd/dm/pkg/commands/cluster.go @@ -251,8 +251,9 @@ func NewCmdClusterInit(out io.Writer) *cobra.Command { &serverCount, "count", 1, "Initial cluster size", ) + // TODO: need to block using 32608, and probably block anything lower than 32xx for host port reasons? cmd.Flags().IntVar( - &port, "port", 32607, + &port, "port", 0, "Port to run cluster on", ) return cmd @@ -883,7 +884,7 @@ func clusterCommonSetup(clusterUrl, adminPassword, adminKey, pkiPath string) err return err } } - err = config.AddRemote("local", "admin", getHostFromEnv(), 0, adminKey) + err = config.AddRemote("local", "admin", getHostFromEnv(), port, adminKey) if err != nil { return err } @@ -919,16 +920,19 @@ func clusterCommonSetup(clusterUrl, adminPassword, adminKey, pkiPath string) err time.Sleep(250 * time.Millisecond) } if err != nil { + fmt.Printf("Errored creating api") e() return false } var response bool response, err = dm.PingLocal() if err != nil { + fmt.Printf("Errored pinging, %#v", dm.Configuration.Remotes["local"]) e() return false } if !response { + fmt.Printf("Response failure...") e() } fmt.Printf("\n") From 41edda5d240be956528d634977ae7ef703ec2444 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Fri, 8 Jun 2018 18:28:07 +0100 Subject: [PATCH 05/60] #248: make test use the port option to check cluster init --- tests/acceptance_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/acceptance_test.go b/tests/acceptance_test.go index a1120d359..c94b2beb9 100644 --- a/tests/acceptance_test.go +++ b/tests/acceptance_test.go @@ -1332,8 +1332,8 @@ func TestTwoSingleNodeClusters(t *testing.T) { citools.TeardownFinishedTestRuns() f := citools.Federation{ - citools.NewCluster(1), // cluster_0_node_0 - citools.NewCluster(1), // cluster_1_node_0 + citools.NewCluster(1), // cluster_0_node_0 + citools.NewClusterWithArgs(1, map[string]string{}, " --port 32609"), // cluster_1_node_0 } defer citools.TestMarkForCleanup(f) citools.AddFuncToCleanups(func() { citools.TestMarkForCleanup(f) }) @@ -1347,7 +1347,7 @@ func TestTwoSingleNodeClusters(t *testing.T) { node2 := f[1].GetNode(0).Container t.Run("SpecifyPort", func(t *testing.T) { - citools.RunOnNode(t, node1, "echo "+f[1].GetNode(0).ApiKey+" | dm remote add funny_port_remote admin@"+f[1].GetNode(0).IP+":32607") + citools.RunOnNode(t, node1, "echo "+f[1].GetNode(0).ApiKey+" | dm remote add funny_port_remote admin@"+f[1].GetNode(0).IP+":32609") }) t.Run("PushCommitBranchExtantBase", func(t *testing.T) { From 6f6da8d129e885501d3033209b0408a8db7bbf69 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Fri, 8 Jun 2018 19:03:38 +0100 Subject: [PATCH 06/60] #428: only add the port to the args list if it isn't the default (0) --- cmd/dm/pkg/commands/cluster.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmd/dm/pkg/commands/cluster.go b/cmd/dm/pkg/commands/cluster.go index 24c4c4891..8a61a2351 100644 --- a/cmd/dm/pkg/commands/cluster.go +++ b/cmd/dm/pkg/commands/cluster.go @@ -668,7 +668,10 @@ func startDotmeshContainer(pkiPath string) error { "-e", fmt.Sprintf("DOTMESH_DOCKER_IMAGE=%s", dotmeshDockerImage), "-e", fmt.Sprintf("DOTMESH_UPGRADES_URL=%s", checkpointUrl), "-e", fmt.Sprintf("DOTMESH_UPGRADES_INTERVAL_SECONDS=%d", checkpointInterval), - "-e", fmt.Sprintf("DOTMESH_SERVER_PORT=%d", port), + } + if port != 0 { + args = append(args, "-e") + args = append(args, fmt.Sprintf("DOTMESH_SERVER_PORT=%d", port)) } // inject the inherited env variables from the context of the dm binary into require_zfs.sh @@ -885,6 +888,8 @@ func clusterCommonSetup(clusterUrl, adminPassword, adminKey, pkiPath string) err } } err = config.AddRemote("local", "admin", getHostFromEnv(), port, adminKey) + fmt.Printf("Port: %d", port) + fmt.Printf("Local remote: %#v", config.Remotes["local"]) if err != nil { return err } From d85a6d02a94ab521b75e574624b1c9ebe23f6afe Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Mon, 11 Jun 2018 16:05:32 +0100 Subject: [PATCH 07/60] #428: use new method --- tests/acceptance_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/acceptance_test.go b/tests/acceptance_test.go index c94b2beb9..ca42022c6 100644 --- a/tests/acceptance_test.go +++ b/tests/acceptance_test.go @@ -1332,8 +1332,8 @@ func TestTwoSingleNodeClusters(t *testing.T) { citools.TeardownFinishedTestRuns() f := citools.Federation{ - citools.NewCluster(1), // cluster_0_node_0 - citools.NewClusterWithArgs(1, map[string]string{}, " --port 32609"), // cluster_1_node_0 + citools.NewCluster(1), // cluster_0_node_0 + citools.NewClusterOnPort(32609, 1), // cluster_1_node_0 } defer citools.TestMarkForCleanup(f) citools.AddFuncToCleanups(func() { citools.TestMarkForCleanup(f) }) From 0cecf880c3562065f49c056b59bb3d76edab5a4a Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Tue, 12 Jun 2018 16:07:57 +0100 Subject: [PATCH 08/60] #448: Make the operator replace pods in the "succeeded" state --- cmd/dotmesh-server/pkg/operator/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/dotmesh-server/pkg/operator/main.go b/cmd/dotmesh-server/pkg/operator/main.go index 4cbf45aad..a2aaa1cfa 100644 --- a/cmd/dotmesh-server/pkg/operator/main.go +++ b/cmd/dotmesh-server/pkg/operator/main.go @@ -599,7 +599,7 @@ func (c *dotmeshController) process() error { continue } - if status == v1.PodFailed { + if status == v1.PodFailed || status == v1.PodSucceeded { // We're deleting the pod, so the user can't "kubectl describe" it, so let's log lots of stuff glog.Infof("Observing pod %s - status %s: FAILED (Message: %s) (Reason: %s)", podName, From a07fe107a7ee76c923e8572b30b6b714ed9ee8a7 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Tue, 12 Jun 2018 16:33:09 +0100 Subject: [PATCH 09/60] #449: Don't trap EXIT now we have cleanup code at the end; it was causing us to `exit 0` and discard the error code. Still doesn't explain why the container was dying (with exit code 2, as it happens); just why the exit code failure was being reported as successful completion. --- cmd/dotmesh-server/require_zfs.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 5e369e65c..cc84edc05 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -329,7 +329,6 @@ shutdown() { exit 0 } -trap 'shutdown EXIT' EXIT trap 'shutdown SIGTERM' SIGTERM trap 'shutdown SIGINT' SIGINT trap 'shutdown SIGQUIT' SIGQUIT From 27fea40f5e4825fc6fe5ea8d3ae72d12d26449a1 Mon Sep 17 00:00:00 2001 From: Priya Samuel Date: Tue, 12 Jun 2018 15:54:07 +0000 Subject: [PATCH 10/60] FIX: hold the lock on rpcTracker until the end of the function --- cmd/dotmesh-server/pkg/main/http.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/dotmesh-server/pkg/main/http.go b/cmd/dotmesh-server/pkg/main/http.go index 994060a30..8aaf46f35 100644 --- a/cmd/dotmesh-server/pkg/main/http.go +++ b/cmd/dotmesh-server/pkg/main/http.go @@ -332,12 +332,12 @@ func rpcAfterFunc(reqInfo *rpc.RequestInfo) { return } rpcTracker.mutex.Lock() + defer rpcTracker.mutex.Unlock() startedAt, found := rpcTracker.rpcDuration[reqUUID] if !found { fmt.Printf("Error: Unable to find requestUUID in requestTracker: %s", reqUUID) return } - rpcTracker.mutex.Unlock() duration := time.Since(startedAt) url := fmt.Sprintf("%s", reqInfo.Request.URL) statusCode := fmt.Sprintf("%v", reqInfo.StatusCode) From 19cd9d1ffdbfd84ea6da18ab10fa66face2ccf47 Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 17:28:37 +0100 Subject: [PATCH 11/60] #450: fail forever if our attempt to align mount states fails. We really ought to be able to mount or unmount filesystems, if we can't, something is horribly wrong with that ZFS filesystem, and we should avoid thrashing. --- cmd/dotmesh-server/pkg/main/statemachines.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 933fcdc39..b28aaf46a 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -1330,6 +1330,12 @@ func backoffState(f *fsMachine) stateFn { return discoveringState } +func failedState(f *fsMachine) stateFn { + f.transitionedTo("failed", "never coming back") + log.Printf("entering failed state for %s", f.filesystemId) + select {} +} + func (f *fsMachine) discover() error { // discover system state synchronously filesystem, err := discoverSystem(f.filesystemId) @@ -1384,11 +1390,12 @@ func discoveringState(f *fsMachine) stateFn { err := f.state.alignMountStateWithMasters(f.filesystemId) if err != nil { log.Printf( - "[discoveringState:%s] error trying to align mount state with masters: %v", + "[discoveringState:%s] error trying to align mount state with masters: %v, "+ + "going into failed state forever", f.filesystemId, err, ) - return backoffState + return failedState } // TODO do we need to acquire some locks here? if f.filesystem.mounted { From 7acbce6cb97a5af02cbeb56408617520c772e48a Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 18:12:16 +0100 Subject: [PATCH 12/60] #450: stop looping as hard as possible on NoFromSnaps --- cmd/dotmesh-server/pkg/main/statemachines.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index b28aaf46a..e65bdbc2a 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -1465,15 +1465,14 @@ func receivingState(f *fsMachine) stateFn { if err != nil { switch err := err.(type) { - // TODO should the following 'discoveringState's be 'backoffState's? case *ToSnapsUpToDate: log.Printf("receivingState: ToSnapsUpToDate %s got %s", f.filesystemId, err) // this is fine, we're up-to-date - return discoveringState + return backoffState case *NoFromSnaps: log.Printf("receivingState: NoFromSnaps %s got %s", f.filesystemId, err) // this is fine, no snaps; can't replicate yet, but will - return discoveringState + return backoffState case *ToSnapsAhead: log.Printf("receivingState: ToSnapsAhead %s got %s", f.filesystemId, err) // erk, slave is ahead of master @@ -1482,7 +1481,7 @@ func receivingState(f *fsMachine) stateFn { log.Printf("receivingState(%s): Unable to recover from divergence: %+v", f.filesystemId, errx) } // Go to discovering state, to update the world with our recent ZFS actions. - return discoveringState + return backoffState case *ToSnapsDiverged: log.Printf("receivingState: ToSnapsDiverged %s got %s", f.filesystemId, err) errx := f.recoverFromDivergence(err.latestCommonSnapshot) @@ -1490,16 +1489,16 @@ func receivingState(f *fsMachine) stateFn { log.Printf("receivingState(%s): Unable to recover from divergence: %+v", f.filesystemId, errx) } // Go to discovering state, to update the world with our recent ZFS actions. - return discoveringState + return backoffState case *NoCommonSnapshots: log.Printf("receivingState: NoCommonSnapshots %s got %s", f.filesystemId, err) // erk, no common snapshots between master and slave // TODO: create a new local clone (branch), then delete the current // filesystem to enable replication to continue - return discoveringState + return backoffState default: log.Printf("receivingState: default error handler %s got %s", f.filesystemId, err) - return discoveringState + return backoffState } } From 3f45d5c1849f37567558e752a8f555a550ec4056 Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 19:00:42 +0100 Subject: [PATCH 13/60] #450: fail immediately if you're a peer of a failed filesystem --- cmd/dotmesh-server/pkg/main/statemachines.go | 36 ++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index e65bdbc2a..063d58dcc 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -1162,6 +1162,32 @@ func transferRequestify(in interface{}) (TransferRequest, error) { }, nil } +func (f *fsMachine) failedOnMaster() bool { + master := f.state.masterFor(f.filesystemId) + if master == "" { + log.Printf( + "[failedOnMaster:%s] unable to determine whether failed on master: %s has empty master", + f.filesystemId, + ) + return false + } + f.state.globalStateCacheLock.Lock() + defer f.state.globalStateCacheLock.Lock() + state, ok := (*f.state.globalStateCache)[master][f.filesystemId] + if !ok { + log.Printf( + "[failedOnMaster:%s] unable to determine whether failed on master: "+ + "%s has no entry in globalStateCache[%s][%s]", + f.filesystemId, + master, + f.filesystemId, + ) + // Don't know, say it's not failed. + return false + } + return state["state"] == "failed" +} + // either missing because you're about to be locally created or because the // filesystem exists somewhere else in the cluster func missingState(f *fsMachine) stateFn { @@ -1378,6 +1404,16 @@ func discoveringState(f *fsMachine) stateFn { f.transitionedTo("discovering", "loading") log.Printf("entering discovering state for %s", f.filesystemId) + if f.failedOnMaster() { + // abandon hope, all ye who enter here. this fsMachine is failed until + // further notice... + f.transitionedTo( + "discovering", + "going to failed because were failed on the master", + ) + return failedState + } + err := f.discover() if err != nil { log.Printf("%v while discovering state", err) From 63016bebdf85c6e9519b50036f02b6152914282d Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 19:02:31 +0100 Subject: [PATCH 14/60] #450: fixup logging --- cmd/dotmesh-server/pkg/main/statemachines.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 063d58dcc..84cbeaa1a 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -1166,7 +1166,7 @@ func (f *fsMachine) failedOnMaster() bool { master := f.state.masterFor(f.filesystemId) if master == "" { log.Printf( - "[failedOnMaster:%s] unable to determine whether failed on master: %s has empty master", + "[failedOnMaster:%s] unable to determine whether failed on master: got empty master", f.filesystemId, ) return false @@ -1177,7 +1177,7 @@ func (f *fsMachine) failedOnMaster() bool { if !ok { log.Printf( "[failedOnMaster:%s] unable to determine whether failed on master: "+ - "%s has no entry in globalStateCache[%s][%s]", + "no entry in globalStateCache[%s][%s]", f.filesystemId, master, f.filesystemId, From 83584bc086dcc262f0d10d59116f719f2923ab56 Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 19:45:50 +0100 Subject: [PATCH 15/60] #450: fixup locking --- cmd/dotmesh-server/pkg/main/statemachines.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 84cbeaa1a..8adec3260 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -1172,7 +1172,7 @@ func (f *fsMachine) failedOnMaster() bool { return false } f.state.globalStateCacheLock.Lock() - defer f.state.globalStateCacheLock.Lock() + defer f.state.globalStateCacheLock.Unlock() state, ok := (*f.state.globalStateCache)[master][f.filesystemId] if !ok { log.Printf( From 46fd890bd1b4c1dd98524881d51d0781a3fbfec2 Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 22:59:33 +0100 Subject: [PATCH 16/60] #450: repair unmountable filesystems by unmounting them from the root namespace --- cmd/dotmesh-server/pkg/main/controller.go | 4 +- cmd/dotmesh-server/pkg/main/statemachines.go | 61 +++++++++----------- 2 files changed, 29 insertions(+), 36 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/controller.go b/cmd/dotmesh-server/pkg/main/controller.go index 8d96a7853..502f08c7e 100644 --- a/cmd/dotmesh-server/pkg/main/controller.go +++ b/cmd/dotmesh-server/pkg/main/controller.go @@ -139,13 +139,13 @@ func (s *InMemoryState) alignMountStateWithMasters(filesystemId string) error { fs, ok := (*s.filesystems)[filesystemId] if !ok { log.Printf( - "[maybeMountFilesystem] not doing anything - cannot find %v in fsMachines", + "[alignMountStateWithMasters] not doing anything - cannot find %v in fsMachines", filesystemId, ) return nil, false, fmt.Errorf("cannot find %v in fsMachines", filesystemId) } log.Printf( - "[maybeMountFilesystem] called for %v; masterFor=%v, myNodeId=%v; mounted=%b", + "[alignMountStateWithMasters] called for %v; masterFor=%v, myNodeId=%v; mounted=%b", filesystemId, s.masterFor(filesystemId), s.myNodeId, diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 8adec3260..c33b502e3 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -520,46 +520,15 @@ waitingForSlaveSnapshot: } func (f *fsMachine) mount() (responseEvent *Event, nextState stateFn) { - /* - out, err := exec.Command( - "mkdir", "-p", mnt(f.filesystemId)).CombinedOutput() - if err != nil { - log.Printf("%v while trying to mkdir mountpoint %s", err, fq(f.filesystemId)) - return &Event{ - Name: "failed-mkdir-mountpoint", - Args: &EventArgs{"err": err, "combined-output": string(out)}, - }, backoffState - } - out, err = exec.Command("mount.zfs", "-o", "noatime", - fq(f.filesystemId), mnt(f.filesystemId)).CombinedOutput() - if err != nil { - log.Printf("%v while trying to mount %s", err, fq(f.filesystemId)) - return &Event{ - Name: "failed-mount", - Args: &EventArgs{"err": err, "combined-output": string(out)}, - }, backoffState - } - // trust that zero exit codes from mkdir && mount.zfs means - // that it worked and that the filesystem now exists and is - // mounted - f.snapshotsLock.Lock() - defer f.snapshotsLock.Unlock() - f.filesystem.exists = true // needed in create case - f.filesystem.mounted = true - return &Event{Name: "mounted", Args: &EventArgs{}}, activeState - */ - mountPath := mnt(f.filesystemId) zfsPath := fq(f.filesystemId) - // only try to make the folder if it doesn't already exist - // if there is an error - it means we could not mount so don't - // update the filesystem with mounted = true + // only try to make the directory if it doesn't already exist if _, err := os.Stat(mountPath); os.IsNotExist(err) { out, err := exec.Command( "mkdir", "-p", mountPath).CombinedOutput() if err != nil { - log.Printf("%v while trying to mkdir mountpoint %s", err, zfsPath) + log.Printf("[mount:%s] %v while trying to mkdir mountpoint %s", f.filesystemId, err, zfsPath) return &Event{ Name: "failed-mkdir-mountpoint", Args: &EventArgs{"err": err, "combined-output": string(out)}, @@ -567,7 +536,7 @@ func (f *fsMachine) mount() (responseEvent *Event, nextState stateFn) { } } - // omly try to use mount.zfs if it's not already present in the output + // only try to use mount.zfs if it's not already present in the output // of calling "mount" mounted, err := isFilesystemMounted(f.filesystemId) if err != nil { @@ -579,7 +548,31 @@ func (f *fsMachine) mount() (responseEvent *Event, nextState stateFn) { if !mounted { out, err := exec.Command("mount.zfs", "-o", "noatime", zfsPath, mountPath).CombinedOutput() + // if there is an error - it means we could not mount so don't + // update the filesystem with mounted = true if err != nil { + log.Printf("[mount:%s] %v while trying to mount %s", f.filesystemId, err, zfsPath) + if strings.Contains(string(out), "already mounted") { + // This can happen when the filesystem is mounted in another + // namespace. Try unmounting it from the root... + log.Printf( + "[mount:%s] attempting recovery-unmount in ns 1 after %v/%v", + f.filesystemId, err, string(out), + ) + rout, rerr := exec.Command( + "nsenter", "-t", "1", "-m", "-u", "-n", "-i", + "umount", mountPath, + ).CombinedOutput() + if rerr != nil { + return &Event{ + Name: "failed-recovery-unmount", + Args: &EventArgs{"err": rerr, "combined-output": string(rout)}, + }, backoffState + } + // recurse + // TODO limit recursion depth + return f.mount() + } log.Printf("%v while trying to mount %s", err, zfsPath) return &Event{ Name: "failed-mount", From 54b6f093b640b8fbe15b0ce8e25f518f67815e3d Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 23:10:08 +0100 Subject: [PATCH 17/60] #450: globalStateCache is unreliable --- cmd/dotmesh-server/pkg/main/statemachines.go | 36 -------------------- 1 file changed, 36 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index c33b502e3..36d68e8c0 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -1155,32 +1155,6 @@ func transferRequestify(in interface{}) (TransferRequest, error) { }, nil } -func (f *fsMachine) failedOnMaster() bool { - master := f.state.masterFor(f.filesystemId) - if master == "" { - log.Printf( - "[failedOnMaster:%s] unable to determine whether failed on master: got empty master", - f.filesystemId, - ) - return false - } - f.state.globalStateCacheLock.Lock() - defer f.state.globalStateCacheLock.Unlock() - state, ok := (*f.state.globalStateCache)[master][f.filesystemId] - if !ok { - log.Printf( - "[failedOnMaster:%s] unable to determine whether failed on master: "+ - "no entry in globalStateCache[%s][%s]", - f.filesystemId, - master, - f.filesystemId, - ) - // Don't know, say it's not failed. - return false - } - return state["state"] == "failed" -} - // either missing because you're about to be locally created or because the // filesystem exists somewhere else in the cluster func missingState(f *fsMachine) stateFn { @@ -1397,16 +1371,6 @@ func discoveringState(f *fsMachine) stateFn { f.transitionedTo("discovering", "loading") log.Printf("entering discovering state for %s", f.filesystemId) - if f.failedOnMaster() { - // abandon hope, all ye who enter here. this fsMachine is failed until - // further notice... - f.transitionedTo( - "discovering", - "going to failed because were failed on the master", - ) - return failedState - } - err := f.discover() if err != nil { log.Printf("%v while discovering state", err) From c8a86e41cbb5f7431d92764fe8e80bc11962fa0f Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 23:48:32 +0100 Subject: [PATCH 18/60] FIX: Don't ever log to stdout, that's where JSON goes. --- cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go b/cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go index c14ed20a4..cb1e2191d 100644 --- a/cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go +++ b/cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go @@ -23,7 +23,6 @@ import ( "encoding/json" "errors" "fmt" - "log" "os" "os/exec" "strconv" @@ -38,7 +37,7 @@ var flexVolumeDebug = false const DIND_SHARED_FOLDER = "/dotmesh-test-pools/dind-flexvolume" func System(cmd string, args ...string) error { - log.Printf("[system] running %s %s", cmd, args) + logger.Printf("[system] running %s %s", cmd, args) c := exec.Command(cmd, args...) c.Stdout = os.Stdout c.Stderr = os.Stderr From b9a966918368da10c471c56243ce409145f6d35d Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Tue, 12 Jun 2018 23:50:39 +0100 Subject: [PATCH 19/60] FIX: Don't ever log to stdout, that's where JSON goes - fixup. --- cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go b/cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go index cb1e2191d..b00437b3e 100644 --- a/cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go +++ b/cmd/dotmesh-server/pkg/dind-flexvolume/flexvolume.go @@ -23,6 +23,7 @@ import ( "encoding/json" "errors" "fmt" + "log" "os" "os/exec" "strconv" From d6adb34b580999e3512ae06b1315aaa760eee948 Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Wed, 13 Jun 2018 02:17:56 +0100 Subject: [PATCH 20/60] #450: put dotmesh-server-inner in pid=host so that it can get into the host mount namespace in order to repair broken mounts; also add better debugging around failed-recovery-mount --- cmd/dotmesh-server/pkg/main/statemachines.go | 5 ++++- cmd/dotmesh-server/require_zfs.sh | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 36d68e8c0..84c0bdc74 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -566,7 +566,10 @@ func (f *fsMachine) mount() (responseEvent *Event, nextState stateFn) { if rerr != nil { return &Event{ Name: "failed-recovery-unmount", - Args: &EventArgs{"err": rerr, "combined-output": string(rout)}, + Args: &EventArgs{ + "original-err": err, "original-combined-output": string(out), + "recovery-err": rerr, "recovery-combined-output": string(rout), + }, }, backoffState } // recurse diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index cc84edc05..9409b6e4e 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -337,7 +337,7 @@ trap 'shutdown SIGKILL' SIGKILL set +e -docker run -i $rm_opt --privileged --name=$DOTMESH_INNER_SERVER_NAME \ +docker run -i $rm_opt --pid=host --privileged --name=$DOTMESH_INNER_SERVER_NAME \ -v /var/run/docker.sock:/var/run/docker.sock \ -v /run/docker/plugins:/run/docker/plugins \ -v $OUTER_DIR:$OUTER_DIR:rshared \ From c3ea9d255530b976ede0cb71068257ecf719e670 Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Wed, 13 Jun 2018 02:46:52 +0100 Subject: [PATCH 21/60] #450: search for and unmount a stuck filesystem from any pid namespace where it's mounted (hope that 'unmount' binary exists there). --- .gitlab-ci.yml | 2 +- cmd/dotmesh-server/pkg/main/statemachines.go | 61 +++++++++++++++++--- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3870158d5..3f96dea65 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -17,8 +17,8 @@ variables: stages: - build - - test - deploy + - test - manual_deploy - notify diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 84c0bdc74..60075414a 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -9,6 +9,7 @@ import ( "net/http" "os" "os/exec" + "path/filepath" "strconv" "strings" "sync" @@ -548,19 +549,58 @@ func (f *fsMachine) mount() (responseEvent *Event, nextState stateFn) { if !mounted { out, err := exec.Command("mount.zfs", "-o", "noatime", zfsPath, mountPath).CombinedOutput() - // if there is an error - it means we could not mount so don't - // update the filesystem with mounted = true if err != nil { log.Printf("[mount:%s] %v while trying to mount %s", f.filesystemId, err, zfsPath) if strings.Contains(string(out), "already mounted") { - // This can happen when the filesystem is mounted in another - // namespace. Try unmounting it from the root... + // This can happen when the filesystem is mounted in some other + // namespace for some reason. Try searching for it in all + // processes' mount namespaces, and recursively unmounting it + // from one namespace at a time until becomes free... + firstPidNSToUnmount, rerr := func() (string, error) { + mountTables, err := filepath.Glob("/proc/*/mounts") + if err != nil { + return "", err + } + if mountTables == nil { + return "", fmt.Errorf("no mount tables in /proc/*/mounts") + } + for _, mountTable := range mountTables { + mounts, err := ioutil.ReadFile(mountTable) + if err != nil { + // pids can disappear between globbing and reading + log.Printf( + "[mount:%s] ignoring error reading pid mount table %v: %v", + mountTable, err, + ) + continue + } + // return the first namespace found, as we'll unmount + // in there and then try again (recursively) + for _, line := range strings.Split(string(mounts), "\n") { + if strings.Contains(line, f.filesystemId) { + shrapnel := strings.Split(mountTable, "/") + // e.g. (0)/(1)proc/(2)X/(3)mounts + return shrapnel[2], nil + } + } + } + return "", fmt.Errorf("unable to find %s in any /proc/*/mounts", f.filesystemId) + }() + if rerr != nil { + return &Event{ + Name: "failed-finding-namespace-to-unmount", + Args: &EventArgs{ + "original-err": err, "original-combined-output": string(out), + "recovery-err": rerr, + }, + }, backoffState + } log.Printf( - "[mount:%s] attempting recovery-unmount in ns 1 after %v/%v", - f.filesystemId, err, string(out), + "[mount:%s] attempting recovery-unmount in ns %s after %v/%v", + f.filesystemId, firstPidNSToUnmount, err, string(out), ) rout, rerr := exec.Command( - "nsenter", "-t", "1", "-m", "-u", "-n", "-i", + "nsenter", "-t", firstPidNSToUnmount, "-m", "-u", "-n", "-i", "umount", mountPath, ).CombinedOutput() if rerr != nil { @@ -572,11 +612,14 @@ func (f *fsMachine) mount() (responseEvent *Event, nextState stateFn) { }, }, backoffState } - // recurse + // recurse, maybe we've made enough progress to be able to + // mount this time? + // // TODO limit recursion depth return f.mount() } - log.Printf("%v while trying to mount %s", err, zfsPath) + // if there is an error - it means we could not mount so don't + // update the filesystem with mounted = true return &Event{ Name: "failed-mount", Args: &EventArgs{"err": err, "combined-output": string(out)}, From 5e968d6554f17c0e79504176db742ed2fa67d3a8 Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Wed, 13 Jun 2018 03:56:27 +0100 Subject: [PATCH 22/60] FIX: At least we can stop reporting stale data in 'dm dot show' and the web UI and such. --- cmd/dotmesh-server/pkg/main/statemachines.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 60075414a..90731b99e 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -344,6 +344,12 @@ func (f *fsMachine) transitionedTo(state string, status string) { log.Printf("error updating etcd: %s", update, err) return } + // we don't hear our own echo, so set it locally too. + f.state.globalStateCacheLock.Lock() + defer f.state.globalStateCacheLock.Unlock() + // fake an etcd version for anyone expecting a version field + update["version"] = "0" + (*f.state.globalStateCache)[f.state.myNodeId][f.filesystemId] = update } // state functions From 64195deee7e185c7f91857027060bbd9f70b6bde Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Wed, 13 Jun 2018 04:44:54 +0100 Subject: [PATCH 23/60] FIX: Create the map if it doesn't exist. --- cmd/dotmesh-server/pkg/main/statemachines.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 90731b99e..fe73f1028 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -349,6 +349,9 @@ func (f *fsMachine) transitionedTo(state string, status string) { defer f.state.globalStateCacheLock.Unlock() // fake an etcd version for anyone expecting a version field update["version"] = "0" + if _, ok := (*f.state.globalStateCache)[f.state.myNodeId]; !ok { + (*f.state.globalStateCache)[f.state.myNodeId] = map[string]map[string]string{} + } (*f.state.globalStateCache)[f.state.myNodeId][f.filesystemId] = update } From 97dddb987cf2306c0b2c4db79c767e39ad1f1885 Mon Sep 17 00:00:00 2001 From: Luke Marsden Date: Wed, 13 Jun 2018 05:05:04 +0100 Subject: [PATCH 24/60] #450: experiment - do the unmount and export from an appropriately rshared container, rather than at the require_zfs.sh level. --- cmd/dotmesh-server/require_zfs.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 9409b6e4e..79df74220 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -309,11 +309,16 @@ cleanup() { return fi - # Release the ZFS pool + # Release the ZFS pool. Do so in a mount namespace which has $OUTER_DIR + # rshared, otherwise zpool export's unmounts can be mighty confusing. echo "`date`: Unmounting $MOUNTPOINT:" >> $POOL_LOGFILE - umount "$MOUNTPOINT" >> $POOL_LOGFILE 2>&1 || true + docker run -i --rm --name=require-zfs-unmounter-$$ --pid=host --privileged \ + -v $OUTER_DIR:$OUTER_DIR:rshared $DOTMESH_DOCKER_IMAGE \ + umount "$MOUNTPOINT" >> $POOL_LOGFILE 2>&1 || true echo "`date`: zpool exporting $POOL:" >> $POOL_LOGFILE - zpool export -f "$POOL" >> $POOL_LOGFILE 2>&1 + docker run -i --rm --name=require-zfs-exporter-$$ --pid=host --privileged \ + -v $OUTER_DIR:$OUTER_DIR:rshared $DOTMESH_DOCKER_IMAGE \ + zpool export -f "$POOL" >> $POOL_LOGFILE 2>&1 echo "`date`: Finished cleanup: zpool export returned $?" >> $POOL_LOGFILE } From a937d1528b028d661f8ef85b263af2fb7a1ba143 Mon Sep 17 00:00:00 2001 From: Priya Samuel Date: Wed, 13 Jun 2018 09:55:28 +0000 Subject: [PATCH 25/60] NFC: test and then deploy, to have some sanity checks --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3f96dea65..3870158d5 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -17,8 +17,8 @@ variables: stages: - build - - deploy - test + - deploy - manual_deploy - notify From 0e35915c1c8d9465a6a4d72869284c86540a0e53 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Wed, 13 Jun 2018 14:01:23 +0100 Subject: [PATCH 26/60] NFC: A very useful comment about what namespace -v mounts come from, as I keep forgetting. --- cmd/dotmesh-server/require_zfs.sh | 33 ++++++++++++++++++------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 79df74220..7d08089b6 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -281,20 +281,6 @@ done # a pod but a container run from /var/run/docker.sock (while true; do docker logs -f dotmesh-server-inner || true; sleep 1; done) & -# In order of the -v options below: - -# 1. Mount the docker socket so that we can stop and start containers around -# e.g. dm reset. - -# 2. Be able to install the docker plugin. - -# 3. Be able to mount zfs filesystems from inside the container in -# such a way that they propogate up to the host, and be able to -# create some symlinks that we hand to the docker volume plugin. - -# 4. Be able to install a Kubernetes FlexVolume driver (we make symlinks -# where it tells us to). - TERMINATING=no cleanup() { @@ -342,6 +328,25 @@ trap 'shutdown SIGKILL' SIGKILL set +e +# In order of the -v options below: + +# 1. Mount the docker socket so that we can stop and start containers around +# e.g. dm reset. + +# 2. Be able to install the docker plugin. + +# 3. Be able to mount zfs filesystems from inside the container in +# such a way that they propogate up to the host, and be able to +# create some symlinks that we hand to the docker volume plugin. + +# 4. Be able to install a Kubernetes FlexVolume driver (we make symlinks +# where it tells us to). + +# NOTE: The *source* path in the -v flags IS AS SEEN BY THE HOST, +# *not* as seen by this container running require_zfs.sh, because +# we're talking to the host docker. That's why we must map +# $OUTER_DIR:$OUTER_DIR and not $DIR:$OUTER_DIR... + docker run -i $rm_opt --pid=host --privileged --name=$DOTMESH_INNER_SERVER_NAME \ -v /var/run/docker.sock:/var/run/docker.sock \ -v /run/docker/plugins:/run/docker/plugins \ From a60662031510e2235b5374f96719a5fbdf756da1 Mon Sep 17 00:00:00 2001 From: Priya Samuel Date: Wed, 13 Jun 2018 14:52:05 +0000 Subject: [PATCH 27/60] NFC: Remove stray println command on client --- cmd/dm/pkg/commands/main.go | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/dm/pkg/commands/main.go b/cmd/dm/pkg/commands/main.go index e43b5ce04..31795b5e1 100644 --- a/cmd/dm/pkg/commands/main.go +++ b/cmd/dm/pkg/commands/main.go @@ -165,7 +165,6 @@ func NewCmdRemote(out io.Writer) *cobra.Command { if err != nil { return err } - fmt.Printf("dm client %#v\n", dm) // allow this to be used be a script apiKey := os.Getenv("DOTMESH_PASSWORD") if apiKey == "" { From 988f7cf2d4ae42e519f8d9bb91708517e949aaf3 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Thu, 14 Jun 2018 10:58:14 +0100 Subject: [PATCH 28/60] #428: bump citools up to the latest version incl port work --- tests/Gopkg.lock | 2 +- .../dotmesh-io/citools/testtools.go | 47 +++++++++++++------ 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/tests/Gopkg.lock b/tests/Gopkg.lock index 93319bd5f..f2dd3f5c5 100644 --- a/tests/Gopkg.lock +++ b/tests/Gopkg.lock @@ -5,7 +5,7 @@ branch = "master" name = "github.com/dotmesh-io/citools" packages = ["."] - revision = "368e185f95705174d6d064dbf7e430f2d9eeedea" + revision = "930915871cd64abbed410f1767e2b3c6a470f674" [[projects]] branch = "master" diff --git a/tests/vendor/github.com/dotmesh-io/citools/testtools.go b/tests/vendor/github.com/dotmesh-io/citools/testtools.go index de637c1cd..7ee491af8 100644 --- a/tests/vendor/github.com/dotmesh-io/citools/testtools.go +++ b/tests/vendor/github.com/dotmesh-io/citools/testtools.go @@ -640,16 +640,15 @@ func TeardownFinishedTestRuns() { } func docker(node string, cmd string, env map[string]string) (string, error) { - envString := "" - c := exec.Command("docker", "exec", "-i", node, "bash", "-c", cmd) + args := []string{"exec"} if env != nil { for name, value := range env { - envString += name + "=" + value + " " - } - if len(envString) != 0 { - c = exec.Command("docker", "exec", "-e", envString, "-i", node, "bash", "-c", cmd) + args = append(args, []string{"-e", fmt.Sprintf("%s=%s", name, value)}...) } } + args = append(args, []string{"-i", node, "bash", "-c", cmd}...) + c := exec.Command("docker", args...) + var b bytes.Buffer var o, e io.Writer if _, ok := env["DEBUG_MODE"]; ok { @@ -853,10 +852,12 @@ type Node struct { IP string ApiKey string Password string + Port int } type Cluster struct { DesiredNodeCount int + Port int Env map[string]string ClusterArgs string Nodes []Node @@ -875,19 +876,24 @@ type Pair struct { RemoteName string } +func NewClusterOnPort(port, desiredNodeCount int) *Cluster { + emptyEnv := make(map[string]string) + return NewClusterWithArgs(desiredNodeCount, port, emptyEnv, "") +} + func NewCluster(desiredNodeCount int) *Cluster { emptyEnv := make(map[string]string) - return NewClusterWithArgs(desiredNodeCount, emptyEnv, "") + return NewClusterWithArgs(desiredNodeCount, 0, emptyEnv, "") } func NewClusterWithEnv(desiredNodeCount int, env map[string]string) *Cluster { - return NewClusterWithArgs(desiredNodeCount, env, "") + return NewClusterWithArgs(desiredNodeCount, 0, env, "") } // custom arguments that are passed through to `dm cluster {init,join}` -func NewClusterWithArgs(desiredNodeCount int, env map[string]string, args string) *Cluster { +func NewClusterWithArgs(desiredNodeCount, port int, env map[string]string, args string) *Cluster { env["DOTMESH_UPGRADES_URL"] = "" //set default test env vars - return &Cluster{DesiredNodeCount: desiredNodeCount, Env: env, ClusterArgs: args} + return &Cluster{DesiredNodeCount: desiredNodeCount, Port: port, Env: env, ClusterArgs: args} } func NewKubernetes(desiredNodeCount int, storageMode string, dindStorage bool) *Kubernetes { @@ -923,15 +929,22 @@ func NodeFromNodeName(t *testing.T, now int64, i, j int, clusterName string) Nod nil, ) var apiKey string + var port int if err != nil { fmt.Printf("no dm config found, proceeding without recording apiKey\n") } else { fmt.Printf("dm config on %s: %s\n", nodeName(now, i, j), dotmeshConfig) m := struct { - Remotes struct{ Local struct{ ApiKey string } } + Remotes struct { + Local struct { + ApiKey string + Port int + } + } }{} json.Unmarshal([]byte(dotmeshConfig), &m) apiKey = m.Remotes.Local.ApiKey + port = m.Remotes.Local.Port } // /root/.dotmesh/admin-password.txt is created on docker @@ -952,6 +965,7 @@ func NodeFromNodeName(t *testing.T, now int64, i, j int, clusterName string) Nod IP: nodeIP, ApiKey: apiKey, Password: password, + Port: port, } } @@ -1025,10 +1039,11 @@ SEARCHABLE HEADER: STARTING CLUSTER _, err := docker( pair.From.Container, fmt.Sprintf( - "echo %s |dm remote add %s admin@%s", + "echo %s |dm remote add %s admin@%s:%d", pair.To.ApiKey, pair.RemoteName, pair.To.IP, + pair.To.Port, ), nil, ) @@ -1601,12 +1616,16 @@ func (c *Cluster) Start(t *testing.T, now int64, i int) error { " --use-pool-dir /dotmesh-test-pools/" + poolId(now, i, 0) + " --use-pool-name " + poolId(now, i, 0) + " --dotmesh-upgrades-url ''" + + " --port " + strconv.Itoa(c.Port) + c.ClusterArgs fmt.Printf("running dm cluster init with following command: %s\n", dmInitCommand) + env := c.Env + env["DEBUG_MODE"] = "1" + st, err := docker( - nodeName(now, i, 0), dmInitCommand, c.Env) + nodeName(now, i, 0), dmInitCommand, env) if err != nil { return err @@ -1639,7 +1658,7 @@ func (c *Cluster) Start(t *testing.T, now int64, i int) error { localImageArgs()+" --use-pool-dir /dotmesh-test-pools/"+poolId(now, i, j), joinUrl, " --use-pool-name "+poolId(now, i, j)+c.ClusterArgs, - ), c.Env) + ), env) if err != nil { return err } From db54441b1153dce8c176950d1fc643a2b17c1607 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Thu, 14 Jun 2018 11:00:07 +0100 Subject: [PATCH 29/60] #428: add port to TransferRequest definitions and usage --- cmd/dm/pkg/remotes/api.go | 7 +- cmd/dotmesh-server/pkg/main/statemachines.go | 82 +++++++++++++------- cmd/dotmesh-server/pkg/main/types.go | 1 + 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/cmd/dm/pkg/remotes/api.go b/cmd/dm/pkg/remotes/api.go index 62d0b35c4..04528a2fc 100644 --- a/cmd/dm/pkg/remotes/api.go +++ b/cmd/dm/pkg/remotes/api.go @@ -2,8 +2,6 @@ package remotes import ( "fmt" - "golang.org/x/net/context" - "gopkg.in/cheggaaa/pb.v1" "io" "log" "os" @@ -12,6 +10,9 @@ import ( "sort" "strings" "time" + + "golang.org/x/net/context" + "gopkg.in/cheggaaa/pb.v1" ) const DEFAULT_BRANCH string = "master" @@ -843,6 +844,7 @@ push type TransferRequest struct { Peer string User string + Port int ApiKey string Direction string LocalNamespace string @@ -986,6 +988,7 @@ func (dm *DotmeshAPI) RequestTransfer( "DotmeshRPC.Transfer", TransferRequest{ Peer: remote.Hostname, User: remote.User, + Port: remote.Port, ApiKey: remote.ApiKey, Direction: direction, LocalNamespace: localNamespace, diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 933fcdc39..a8bbb0078 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -1147,10 +1147,17 @@ func transferRequestify(in interface{}) (TransferRequest, error) { "Unable to cast %s to map[string]interface{}", in, ) } + var port int + if typed["Port"] == nil { + port = 0 + } else { + port = int(typed["Port"].(float64)) + } return TransferRequest{ Peer: typed["Peer"].(string), User: typed["User"].(string), ApiKey: typed["ApiKey"].(string), + Port: port, Direction: typed["Direction"].(string), LocalNamespace: typed["LocalNamespace"].(string), LocalName: typed["LocalName"].(string), @@ -1305,6 +1312,12 @@ func missingState(f *fsMachine) stateFn { f.innerResponses <- responseEvent return nextState } + } else if e.Name == "mount" { + f.innerResponses <- &Event{ + Name: "nothing-to-mount", + Args: &EventArgs{}, + } + return missingState } else { f.innerResponses <- &Event{ Name: "unhandled", @@ -1748,6 +1761,7 @@ func pushInitiatorState(f *fsMachine) stateFn { transferRequest.User, transferRequest.Peer, transferRequest.ApiKey, + transferRequest.Port, ) // TODO should we wait for the remote to ack that it's gone into the right state? @@ -2047,20 +2061,25 @@ func (f *fsMachine) pull( // 2. Perform GET, as receivingState does. Update as we go, similar to how // push does it. - url, err := deduceUrl( - context.Background(), - []string{transferRequest.Peer}, - // pulls are between clusters, so use external address where - // appropriate - "external", - transferRequest.User, - transferRequest.ApiKey, - ) - if err != nil { - return &Event{ - Name: "push-initiator-cant-deduce-url", - Args: &EventArgs{"err": err}, - }, backoffState + var url string + if transferRequest.Port == 0 { + url, err = deduceUrl( + context.Background(), + []string{transferRequest.Peer}, + // pulls are between clusters, so use external address where + // appropriate + "external", + transferRequest.User, + transferRequest.ApiKey, + ) + if err != nil { + return &Event{ + Name: "push-initiator-cant-deduce-url", + Args: &EventArgs{"err": err}, + }, backoffState + } + } else { + url = fmt.Sprintf("http://%s:%d", transferRequest.Peer, transferRequest.Port) } url = fmt.Sprintf( @@ -2321,22 +2340,26 @@ func (f *fsMachine) push( defer postWriter.Close() defer postReader.Close() - url, err := deduceUrl( - ctx, - []string{transferRequest.Peer}, - // pushes are between clusters, so use external address where - // appropriate - "external", - transferRequest.User, - transferRequest.ApiKey, - ) - if err != nil { - return &Event{ - Name: "push-initiator-cant-deduce-url", - Args: &EventArgs{"err": err}, - }, backoffState + var url string + if transferRequest.Port == 0 { + url, err = deduceUrl( + ctx, + []string{transferRequest.Peer}, + // pushes are between clusters, so use external address where + // appropriate + "external", + transferRequest.User, + transferRequest.ApiKey, + ) + if err != nil { + return &Event{ + Name: "push-initiator-cant-deduce-url", + Args: &EventArgs{"err": err}, + }, backoffState + } + } else { + url = fmt.Sprintf("http://%s:%d", transferRequest.Peer, transferRequest.Port) } - url = fmt.Sprintf( "%s/filesystems/%s/%s/%s", url, @@ -2833,6 +2856,7 @@ func pullInitiatorState(f *fsMachine) stateFn { transferRequest.User, transferRequest.Peer, transferRequest.ApiKey, + transferRequest.Port, ) var path PathToTopLevelFilesystem diff --git a/cmd/dotmesh-server/pkg/main/types.go b/cmd/dotmesh-server/pkg/main/types.go index d6d23abd5..d7519f65a 100644 --- a/cmd/dotmesh-server/pkg/main/types.go +++ b/cmd/dotmesh-server/pkg/main/types.go @@ -251,6 +251,7 @@ type fsMachine struct { type TransferRequest struct { Peer string // hostname User string + Port int ApiKey string //protected value in toString Direction string // "push" or "pull" LocalNamespace string From f4cfcbf71700fa6b7ff38ddd29c2a06ad0431500 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Thu, 14 Jun 2018 11:00:44 +0100 Subject: [PATCH 30/60] #428: add port to JsonRPC client + usage --- cmd/dotmesh-server/pkg/main/client.go | 16 ++++++++++++---- cmd/dotmesh-server/pkg/main/rpc.go | 2 +- cmd/dotmesh-server/pkg/main/utils.go | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/client.go b/cmd/dotmesh-server/pkg/main/client.go index 213caa001..9a80d1818 100644 --- a/cmd/dotmesh-server/pkg/main/client.go +++ b/cmd/dotmesh-server/pkg/main/client.go @@ -19,13 +19,15 @@ type JsonRpcClient struct { User string Hostname string ApiKey string + Port int } -func NewJsonRpcClient(user, hostname, apiKey string) *JsonRpcClient { +func NewJsonRpcClient(user, hostname, apiKey string, port int) *JsonRpcClient { return &JsonRpcClient{ User: user, Hostname: hostname, ApiKey: apiKey, + Port: port, } } @@ -35,9 +37,15 @@ func (j *JsonRpcClient) CallRemote( ctx context.Context, method string, args interface{}, result interface{}, ) error { // RPCs are always between clusters, so "external" - url, err := deduceUrl(ctx, []string{j.Hostname}, "external", j.User, j.ApiKey) - if err != nil { - return err + var url string + var err error + if j.Port == 0 { + url, err = deduceUrl(ctx, []string{j.Hostname}, "external", j.User, j.ApiKey) + if err != nil { + return err + } + } else { + url = fmt.Sprintf("http://%s:%d", j.Hostname, j.Port) } url = fmt.Sprintf("%s/rpc", url) return j.reallyCallRemote(ctx, method, args, result, url) diff --git a/cmd/dotmesh-server/pkg/main/rpc.go b/cmd/dotmesh-server/pkg/main/rpc.go index 9f2f628c3..377389438 100644 --- a/cmd/dotmesh-server/pkg/main/rpc.go +++ b/cmd/dotmesh-server/pkg/main/rpc.go @@ -1106,7 +1106,7 @@ func (d *DotmeshRPC) Transfer( args *TransferRequest, result *string, ) error { - client := NewJsonRpcClient(args.User, args.Peer, args.ApiKey) + client := NewJsonRpcClient(args.User, args.Peer, args.ApiKey, args.Port) log.Printf("[Transfer] starting with %+v", safeArgs(*args)) diff --git a/cmd/dotmesh-server/pkg/main/utils.go b/cmd/dotmesh-server/pkg/main/utils.go index 50cc36259..1c939c3af 100644 --- a/cmd/dotmesh-server/pkg/main/utils.go +++ b/cmd/dotmesh-server/pkg/main/utils.go @@ -43,7 +43,7 @@ func deduceUrl(ctx context.Context, hostnames []string, mode, user, apiKey strin for _, urlToTry := range urlsToTry { // hostname (2nd arg) doesn't matter because we're just calling // reallyCallRemote which doesn't use it. - j := NewJsonRpcClient(user, "", apiKey) + j := NewJsonRpcClient(user, "", apiKey, 0) var result bool err := j.reallyCallRemote(ctx, "DotmeshRPC.Ping", nil, &result, urlToTry+"/rpc") if err == nil { From 91e5171bc1e48ca524d8f4b91576ca8b7a447a3f Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Thu, 14 Jun 2018 11:41:09 +0100 Subject: [PATCH 31/60] #401: Attempt to perform ALL zfs operations in a cleaned-up namespace. --- cmd/dotmesh-server/require_zfs.sh | 52 ++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 7d08089b6..4debfc654 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -84,7 +84,7 @@ if [ -n "$CONTAINER_POOL_MNT" ]; then # Make paths involving $OUTER_DIR work in OUR namespace, by binding $DIR to $OUTER_DIR mkdir -p $OUTER_DIR mount --make-rshared / - mount --bind $DIR $OUTER_DIR + mount --rbind $DIR $OUTER_DIR mount --make-rshared $OUTER_DIR echo "Here's the contents of $OUTER_DIR in the require_zfs.sh container:" ls -l $OUTER_DIR @@ -135,25 +135,34 @@ fi POOL_LOGFILE=$DIR/dotmesh_pool.log +run_in_zfs_container() { + NAME=$1 + shift + docker run -i --rm --pid=host --privileged --name=dotmesh-$NAME-$$ \ + -v $OUTER_DIR:$OUTER_DIR:rshared \ + $DOTMESH_DOCKER_IMAGE \ + "$@" +} + set -ex if [ ! -e /dev/zfs ]; then mknod -m 660 /dev/zfs c $(cat /sys/class/misc/zfs/dev |sed 's/:/ /g') fi -if ! zpool status $POOL; then +if ! run_in_zfs_container zpool-status zpool status $POOL; then # TODO: make case where truncate previously succeeded but zpool create # failed or never run recoverable. if [ ! -f $FILE ]; then truncate -s $POOL_SIZE $FILE - zpool create -m $MOUNTPOINT $POOL "$OUTER_DIR/dotmesh_data" + run_in_zfs_container zpool-create zpool create -m $MOUNTPOINT $POOL "$OUTER_DIR/dotmesh_data" echo "This directory contains dotmesh data files, please leave them alone unless you know what you're doing. See github.com/dotmesh-io/dotmesh for more information." > $DIR/README - zpool get -H guid $POOL |cut -f 3 > $DIR/dotmesh_pool_id + run_in_zfs_container zpool-get zpool get -H guid $POOL |cut -f 3 > $DIR/dotmesh_pool_id if [ -n "$CONTAINER_POOL_PVC_NAME" ]; then echo "$CONTAINER_POOL_PVC_NAME" > $DIR/dotmesh_pvc_name fi else - zpool import -f -d $OUTER_DIR $POOL + run_in_zfs_container zpool-import zpool import -f -d $OUTER_DIR $POOL fi echo "`date`: Pool '$POOL' mounted from host mountpoint '$OUTER_DIR', zfs mountpoint '$MOUNTPOINT'" >> $POOL_LOGFILE else @@ -281,6 +290,8 @@ done # a pod but a container run from /var/run/docker.sock (while true; do docker logs -f dotmesh-server-inner || true; sleep 1; done) & +# Prepare cleanup logic + TERMINATING=no cleanup() { @@ -295,16 +306,35 @@ cleanup() { return fi + if false + then + # Log mounts + + # '| egrep "$DIR|$OUTER_DIR"' might make this less verbose, but + # also might miss out useful information about parent + # mountpoints. For instaince, in dind mode in the test suite, the + # relevent mountpoint in the host is `/dotmesh-test-pools` rather + # than the full $OUTER_DIR. + + echo "`date`: DEBUG mounts on host:" >> $POOL_LOGFILE + nsenter -t 1 -m -u -n -i cat /proc/self/mountinfo | sed 's/^/HOST: /' >> $POOL_LOGFILE || true + echo "`date`: DEBUG mounts in require_zfs.sh container:" >> $POOL_LOGFILE + cat /proc/self/mountinfo | sed 's/^/OUTER: /' >> $POOL_LOGFILE || true + echo "`date`: DEBUG mounts in an inner container:" >> $POOL_LOGFILE + run_in_zfs_container inspect-namespace /bin/cat /proc/self/mountinfo | sed 's/^/INNER: /' >> $POOL_LOGFILE || true + echo "`date`: End of mount tables." >> $POOL_LOGFILE + fi + # Release the ZFS pool. Do so in a mount namespace which has $OUTER_DIR # rshared, otherwise zpool export's unmounts can be mighty confusing. + + # Step 1: Unmount the ZFS mountpoint, and any dots still inside echo "`date`: Unmounting $MOUNTPOINT:" >> $POOL_LOGFILE - docker run -i --rm --name=require-zfs-unmounter-$$ --pid=host --privileged \ - -v $OUTER_DIR:$OUTER_DIR:rshared $DOTMESH_DOCKER_IMAGE \ - umount "$MOUNTPOINT" >> $POOL_LOGFILE 2>&1 || true + run_in_zfs_container zpool-unmount umount --force --recursive "$MOUNTPOINT" >> $POOL_LOGFILE 2>&1 || true + + # Step 2: Shut down the pool. echo "`date`: zpool exporting $POOL:" >> $POOL_LOGFILE - docker run -i --rm --name=require-zfs-exporter-$$ --pid=host --privileged \ - -v $OUTER_DIR:$OUTER_DIR:rshared $DOTMESH_DOCKER_IMAGE \ - zpool export -f "$POOL" >> $POOL_LOGFILE 2>&1 + run_in_zfs_container zpool-export zpool export -f "$POOL" >> $POOL_LOGFILE 2>&1 echo "`date`: Finished cleanup: zpool export returned $?" >> $POOL_LOGFILE } From d99881d0a21fa50e4b4b59102a22eaf1182352de Mon Sep 17 00:00:00 2001 From: Priya Samuel Date: Fri, 15 Jun 2018 10:46:18 +0000 Subject: [PATCH 32/60] NFC: Allow failure of flaky tests until we can address them in flakes week --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3870158d5..877712725 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -314,6 +314,7 @@ linux_kubernetes_operator_PVC: when: always except: - schedules + allow_failure: true linux_deletion_simple: stage: test From 9e62f98551a1a64081bc8867d3ce90542cd7f4ac Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Fri, 15 Jun 2018 14:58:17 +0100 Subject: [PATCH 33/60] NFC: start samba server so file sharing with host is easier --- scripts/prepare_vagrant.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/prepare_vagrant.sh b/scripts/prepare_vagrant.sh index 1ccb718fa..21d578eeb 100644 --- a/scripts/prepare_vagrant.sh +++ b/scripts/prepare_vagrant.sh @@ -54,3 +54,6 @@ cd $HOME/$DISCOVERY_REPO cd $GOPATH/src/$GITHUB_HOST/$GITHUB_ORG/$GITHUB_REPO ./prime.sh + +# start samba for sharing between host and vm +docker run -d -p 139:139 -p 445:445 --name samba --restart always -v $GOPATH/src/$GITHUB_HOST/$GITHUB_ORG:/mount dperson/samba -u "admin;password" -s "vagrantshare;/mount;yes;no;no;admin;admin;admin;comment" \ No newline at end of file From 574d3d585872e28a9490b65d1285a5eb5bfb3634 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Fri, 15 Jun 2018 15:02:14 +0100 Subject: [PATCH 34/60] NFC: add docs on samba->vagrant usage --- docs/dev-commands.md | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/docs/dev-commands.md b/docs/dev-commands.md index 52fc735cc..667a3e0a9 100644 --- a/docs/dev-commands.md +++ b/docs/dev-commands.md @@ -196,28 +196,13 @@ vagrant ssh bash /vagrant/scripts/reset_vagrant.sh ``` -#### symlink code +#### Working from your host **Optional**: If you would like to work on code from your local machine (i.e not inside a VM), follow this section. Alternatively, you can just change code inside the vagrant VM and commit/push from inside it. - -There can be issues with Vagrant shared folders hence this being a manual step. - -```bash -vagrant ssh -cd $GOPATH/src/github.com/dotmesh-io -# might as well keep this - backs up the version vagrant checked out for you -mv dotmesh dotmesh2 -ln -s /vagrant dotmesh -# now $GOPATH/src/github.com/dotmesh-io/dotmesh -> /vagrant -> this repo on your host ``` - -**NOTE:** using the symlink can drastically slow down docker builds. - -You can use this script which copies the latest git hash from your host: - -```bash -cd /vagrant -make vagrant.sync +open smb://admin:password@172.17.1.178/vagrantshare ``` +This should open a shared network drive which allows you to edit files in the dotmesh-io folder of your vagrant machine go path from your host machine. + ## generic setup instructions From 10b3f839641ea166d2a16584e15ffe103db207b2 Mon Sep 17 00:00:00 2001 From: Kai Davenport Date: Fri, 15 Jun 2018 15:42:06 +0100 Subject: [PATCH 35/60] FIX: let's try TerminationGracePeriodSeconds to give the require_zfs.sh a chance to do it's thing --- cmd/dotmesh-server/pkg/operator/main.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cmd/dotmesh-server/pkg/operator/main.go b/cmd/dotmesh-server/pkg/operator/main.go index a2aaa1cfa..1fecf5f62 100644 --- a/cmd/dotmesh-server/pkg/operator/main.go +++ b/cmd/dotmesh-server/pkg/operator/main.go @@ -985,9 +985,10 @@ nodeLoop: }, }, }, - RestartPolicy: v1.RestartPolicyNever, - ServiceAccountName: "dotmesh", - Volumes: volumes, + RestartPolicy: v1.RestartPolicyNever, + TerminationGracePeriodSeconds: int64(500), + ServiceAccountName: "dotmesh", + Volumes: volumes, }, } From 4f7afff5f81d2ed34ddddc24ec65343668fde012 Mon Sep 17 00:00:00 2001 From: Kai Davenport Date: Fri, 15 Jun 2018 15:50:57 +0100 Subject: [PATCH 36/60] FIX: kai learns a bit about pointers --- cmd/dotmesh-server/pkg/operator/main.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/dotmesh-server/pkg/operator/main.go b/cmd/dotmesh-server/pkg/operator/main.go index 1fecf5f62..f65738ec1 100644 --- a/cmd/dotmesh-server/pkg/operator/main.go +++ b/cmd/dotmesh-server/pkg/operator/main.go @@ -905,6 +905,7 @@ nodeLoop: // Create a dotmesh pod (with local storage for now) assigned to this node privileged := true + terminationGracePeriodSeconds := int64(500) newDotmesh := v1.Pod{ ObjectMeta: meta_v1.ObjectMeta{ @@ -986,7 +987,7 @@ nodeLoop: }, }, RestartPolicy: v1.RestartPolicyNever, - TerminationGracePeriodSeconds: int64(500), + TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, ServiceAccountName: "dotmesh", Volumes: volumes, }, From 2d84bde6117eb64e6f7554ec9e521a766386b266 Mon Sep 17 00:00:00 2001 From: Kai Davenport Date: Fri, 15 Jun 2018 16:36:33 +0100 Subject: [PATCH 37/60] FIX: only trigger end-to-end tests for master --- .gitlab-ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 877712725..80474a602 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -480,9 +480,8 @@ trigger_e2e_test: script: - echo - curl -X POST -F "token=$E2E_TRIGGER_TOKEN" -F "ref=master" -F "variables[DOTMESH_CI_COMMIT_SHA]=$CI_COMMIT_SHA" -F "variables[DOTMESH_CI_BUILD_REF_NAME]=$CI_BUILD_REF_NAME" http://gitlab.dotmesh.io:9999/api/v4/projects/31/trigger/pipeline - curl -X POST -F "token=$E2E_TRIGGER_TOKEN" -F "ref=master" -F "variables[DOTMESH_CI_COMMIT_SHA]=$CI_COMMIT_SHA" -F "variables[DOTMESH_CI_BUILD_REF_NAME]=$CI_BUILD_REF_NAME" http://gitlab.dotmesh.io:9999/api/v4/projects/31/trigger/pipeline - except: - - /^release-.*$/ - - schedules + only: + - master deploy_unstable_build: stage: deploy From 113a69da90f3ded61d23fc76dc5fc579863f8ba3 Mon Sep 17 00:00:00 2001 From: Kai Davenport Date: Fri, 15 Jun 2018 17:37:55 +0100 Subject: [PATCH 38/60] FIX: don't exit until the shutdown stack has unwound --- cmd/dotmesh-server/require_zfs.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 4debfc654..75b2b13e5 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -346,8 +346,6 @@ shutdown() { trap - $SIGNAL cleanup "signal $SIGNAL" - - exit 0 } trap 'shutdown SIGTERM' SIGTERM From 725f530ea2af39672e82d2976a371cb3f83d32c6 Mon Sep 17 00:00:00 2001 From: Kai Davenport Date: Fri, 15 Jun 2018 17:46:37 +0100 Subject: [PATCH 39/60] FIX: don't auto-deploy for schedules (smoke tests) --- .gitlab-ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 80474a602..b3dcbf6af 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -482,6 +482,8 @@ trigger_e2e_test: - curl -X POST -F "token=$E2E_TRIGGER_TOKEN" -F "ref=master" -F "variables[DOTMESH_CI_COMMIT_SHA]=$CI_COMMIT_SHA" -F "variables[DOTMESH_CI_BUILD_REF_NAME]=$CI_BUILD_REF_NAME" http://gitlab.dotmesh.io:9999/api/v4/projects/31/trigger/pipeline only: - master + except: + - schedules deploy_unstable_build: stage: deploy From ec103d390db9469e515abd38cbb3ac8d5f27fd0f Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Mon, 18 Jun 2018 09:31:30 +0100 Subject: [PATCH 40/60] #401: Tidy up logging --- cmd/dotmesh-server/require_zfs.sh | 37 ++++++++++++++++++------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 82456656a..4652e57d6 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -299,14 +299,14 @@ cleanup() { if [ $TERMINATING = no ] then - echo "`date`: Shutting down due to $REASON" >> $POOL_LOGFILE + echo "Shutting down due to $REASON" TERMINATING=yes else - echo "`date`: Ignoring $REASON as we're already shutting down" >> $POOL_LOGFILE + echo "Ignoring $REASON as we're already shutting down" return fi - if false + if true then # Log mounts @@ -316,27 +316,32 @@ cleanup() { # relevent mountpoint in the host is `/dotmesh-test-pools` rather # than the full $OUTER_DIR. - echo "`date`: DEBUG mounts on host:" >> $POOL_LOGFILE - nsenter -t 1 -m -u -n -i cat /proc/self/mountinfo | sed 's/^/HOST: /' >> $POOL_LOGFILE || true - echo "`date`: DEBUG mounts in require_zfs.sh container:" >> $POOL_LOGFILE - cat /proc/self/mountinfo | sed 's/^/OUTER: /' >> $POOL_LOGFILE || true - echo "`date`: DEBUG mounts in an inner container:" >> $POOL_LOGFILE - run_in_zfs_container inspect-namespace /bin/cat /proc/self/mountinfo | sed 's/^/INNER: /' >> $POOL_LOGFILE || true - echo "`date`: End of mount tables." >> $POOL_LOGFILE + echo "DEBUG mounts on host:" + nsenter -t 1 -m -u -n -i cat /proc/self/mountinfo | sed 's/^/HOST: /' || true + echo "DEBUG mounts in require_zfs.sh container:" + cat /proc/self/mountinfo | sed 's/^/OUTER: /' || true + echo "DEBUG mounts in an inner container:" + run_in_zfs_container inspect-namespace /bin/cat /proc/self/mountinfo | sed 's/^/INNER: /' || true + echo "DEBUG End of mount tables." fi # Release the ZFS pool. Do so in a mount namespace which has $OUTER_DIR # rshared, otherwise zpool export's unmounts can be mighty confusing. # Step 1: Unmount the ZFS mountpoint, and any dots still inside - echo "`date`: Unmounting $MOUNTPOINT:" >> $POOL_LOGFILE - run_in_zfs_container zpool-unmount umount --force --recursive "$MOUNTPOINT" >> $POOL_LOGFILE 2>&1 || true + echo "Unmounting $MOUNTPOINT:" + run_in_zfs_container zpool-unmount umount --force --recursive "$MOUNTPOINT"|| true # Step 2: Shut down the pool. - echo "`date`: zpool exporting $POOL:" >> $POOL_LOGFILE - run_in_zfs_container zpool-export zpool export -f "$POOL" >> $POOL_LOGFILE 2>&1 - - echo "`date`: Finished cleanup: zpool export returned $?" >> $POOL_LOGFILE + echo "zpool exporting $POOL:" + if run_in_zfs_container zpool-export zpool export -f "$POOL" + then + echo "`date`: Exported pool OK" >> $POOL_LOGFILE + echo "Exported pool OK." + else + echo "`date`: ERROR: Failed exporting pool!" >> $POOL_LOGFILE + echo "ERROR: Failed exporting pool!." + fi } shutdown() { From eaa543b39d1e35607cd0409481767e47370ac628 Mon Sep 17 00:00:00 2001 From: Priya Samuel Date: Mon, 18 Jun 2018 11:19:44 +0000 Subject: [PATCH 41/60] #401: Manually unmount all /mnt/dmfs/ on shutdown --- cmd/dotmesh-server/require_zfs.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 4652e57d6..87c36b8da 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -328,7 +328,9 @@ cleanup() { # Release the ZFS pool. Do so in a mount namespace which has $OUTER_DIR # rshared, otherwise zpool export's unmounts can be mighty confusing. - # Step 1: Unmount the ZFS mountpoint, and any dots still inside + # Step 1: Unmount any dots and the ZFS mountpoint (if it's there) + echo "Unmount dots in $MOUNTPOINT/mnt/dmfs:" + run_in_zfs_container zpool-unmount-dots sh -c "cd \"$MOUNTPOINT/mnt/dmfs\"; for i in *; do umount --force $i; done" || true echo "Unmounting $MOUNTPOINT:" run_in_zfs_container zpool-unmount umount --force --recursive "$MOUNTPOINT"|| true From 1a9e100da03d3d1112238ef2e91ba10542412fbf Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Mon, 18 Jun 2018 12:19:53 +0100 Subject: [PATCH 42/60] NFC: strip out some debug lines --- cmd/dm/pkg/commands/cluster.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/dm/pkg/commands/cluster.go b/cmd/dm/pkg/commands/cluster.go index 8a61a2351..ab6664d3a 100644 --- a/cmd/dm/pkg/commands/cluster.go +++ b/cmd/dm/pkg/commands/cluster.go @@ -932,12 +932,10 @@ func clusterCommonSetup(clusterUrl, adminPassword, adminKey, pkiPath string) err var response bool response, err = dm.PingLocal() if err != nil { - fmt.Printf("Errored pinging, %#v", dm.Configuration.Remotes["local"]) e() return false } if !response { - fmt.Printf("Response failure...") e() } fmt.Printf("\n") From da28b785549da84837c7b27fe78f7d1fa316e611 Mon Sep 17 00:00:00 2001 From: Charlotte Godley Date: Mon, 18 Jun 2018 12:22:44 +0100 Subject: [PATCH 43/60] NFC: strip out some debug lines --- cmd/dm/pkg/commands/cluster.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/cmd/dm/pkg/commands/cluster.go b/cmd/dm/pkg/commands/cluster.go index ab6664d3a..b44077742 100644 --- a/cmd/dm/pkg/commands/cluster.go +++ b/cmd/dm/pkg/commands/cluster.go @@ -888,8 +888,6 @@ func clusterCommonSetup(clusterUrl, adminPassword, adminKey, pkiPath string) err } } err = config.AddRemote("local", "admin", getHostFromEnv(), port, adminKey) - fmt.Printf("Port: %d", port) - fmt.Printf("Local remote: %#v", config.Remotes["local"]) if err != nil { return err } From 7de0116c00b06976be3da25b56c990a8d58e3a9a Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Mon, 18 Jun 2018 11:52:50 +0100 Subject: [PATCH 44/60] #401: Remove $DIR->$OUTER_DIR bind mount in `require_zfs.sh` container, as we no longer run ZFS commands here. --- cmd/dotmesh-server/require_zfs.sh | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 87c36b8da..caf1c5dca 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -78,20 +78,6 @@ if [ -n "$CONTAINER_POOL_MNT" ]; then echo "$DIR seems to be mounted from $BLOCK_DEVICE" OUTER_DIR=`nsenter -t 1 -m -u -n -i /bin/sh -c 'mount' | grep $BLOCK_DEVICE | cut -f 3 -d ' ' | head -n 1` echo "$BLOCK_DEVICE seems to be mounted on $OUTER_DIR in the host" - - if [ $OUTER_DIR != $DIR ] - then - # Make paths involving $OUTER_DIR work in OUR namespace, by binding $DIR to $OUTER_DIR - mkdir -p $OUTER_DIR - mount --make-rshared / - mount --rbind $DIR $OUTER_DIR - mount --make-rshared $OUTER_DIR - echo "Here's the contents of $OUTER_DIR in the require_zfs.sh container:" - ls -l $OUTER_DIR - echo "Here's the contents of $DIR in the require_zfs.sh container:" - ls -l $DIR - echo "They should be the same!" - fi else OUTER_DIR="$DIR" fi From 2844387bfed1d110bfdf6ddde2e866e7e89ca4c7 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Mon, 18 Jun 2018 11:53:18 +0100 Subject: [PATCH 45/60] #401: Make log file name consistent with other stuff in $DIR, and log the hostname as it's handy for finding lost mounts. --- cmd/dotmesh-server/require_zfs.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index caf1c5dca..53bf14126 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -119,7 +119,7 @@ else fi fi -POOL_LOGFILE=$DIR/dotmesh_pool.log +POOL_LOGFILE=$DIR/dotmesh_pool_log run_in_zfs_container() { NAME=$1 @@ -150,9 +150,9 @@ if ! run_in_zfs_container zpool-status zpool status $POOL; then else run_in_zfs_container zpool-import zpool import -f -d $OUTER_DIR $POOL fi - echo "`date`: Pool '$POOL' mounted from host mountpoint '$OUTER_DIR', zfs mountpoint '$MOUNTPOINT'" >> $POOL_LOGFILE + echo "`date`: Pool '$POOL' mounted from host mountpoint '$OUTER_DIR', zfs mountpoint '$MOUNTPOINT' on `hostname`" >> $POOL_LOGFILE else - echo "`date`: Pool '$POOL' already exists, adopted by new dotmesh server" >> $POOL_LOGFILE + echo "`date`: Pool '$POOL' already exists, adopted by new dotmesh server on `hostname`" >> $POOL_LOGFILE fi # Clear away stale socket if existing From 812de9976f16fa5cb3fafa23e50ffa22ef08ead7 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Mon, 18 Jun 2018 12:18:40 +0100 Subject: [PATCH 46/60] NFC: Comments explaining the shared mounting. Future-me will thank me. --- cmd/dotmesh-server/require_zfs.sh | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 53bf14126..f11a380b1 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -86,7 +86,25 @@ fi MOUNTPOINT=${MOUNTPOINT:-$OUTER_DIR/mnt} CONTAINER_MOUNT_PREFIX=${CONTAINER_MOUNT_PREFIX:-$OUTER_DIR/container_mnt} -# Set up mounts that are needed +# Set the shared flag on the working directory on the host. This is +# essential; it, combined with the presence of the shared flag on the +# bind-mount of this into the container namespace when we run the +# dotmesh server container, means that mounts created by the dotmesh +# server will be propogated back up to the host. + +# The bind mount of the outer dir onto itself is a trick to make sure +# it's actually a mount - if it's just a directory on the host (eg, +# /var/lib/dotmesh), then we can't set the shared flag on it as it's +# part of some larger mount. Creating a bind mount means we have a +# distinct mount point at the local. However, it's not necessary if it +# really IS a mountpoint already (eg, a k8s pv). + +# Confused? Here's some further reading, in order: + +# https://lwn.net/Articles/689856/ +# https://lwn.net/Articles/690679/ +# https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt + nsenter -t 1 -m -u -n -i /bin/sh -c \ "set -xe $EXTRA_HOST_COMMANDS From 6496fd2d349c09e348efb3f32bb9cebe2c24e227 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Tue, 19 Jun 2018 10:06:48 +0100 Subject: [PATCH 47/60] #401: Log BLOCK_DEVICE when available --- cmd/dotmesh-server/require_zfs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index f11a380b1..b428bc617 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -79,6 +79,7 @@ if [ -n "$CONTAINER_POOL_MNT" ]; then OUTER_DIR=`nsenter -t 1 -m -u -n -i /bin/sh -c 'mount' | grep $BLOCK_DEVICE | cut -f 3 -d ' ' | head -n 1` echo "$BLOCK_DEVICE seems to be mounted on $OUTER_DIR in the host" else + BLOCK_DEVICE="n/a" OUTER_DIR="$DIR" fi @@ -168,7 +169,7 @@ if ! run_in_zfs_container zpool-status zpool status $POOL; then else run_in_zfs_container zpool-import zpool import -f -d $OUTER_DIR $POOL fi - echo "`date`: Pool '$POOL' mounted from host mountpoint '$OUTER_DIR', zfs mountpoint '$MOUNTPOINT' on `hostname`" >> $POOL_LOGFILE + echo "`date`: Pool '$POOL' mounted from host mountpoint '$OUTER_DIR' from device '$BLOCK_DEVICE', zfs mountpoint '$MOUNTPOINT' on `hostname`" >> $POOL_LOGFILE else echo "`date`: Pool '$POOL' already exists, adopted by new dotmesh server on `hostname`" >> $POOL_LOGFILE fi From 21ca5b3a118c9995705eb14b489ec627a3c0c0a3 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Tue, 19 Jun 2018 10:15:37 +0100 Subject: [PATCH 48/60] #401: Improved pool setup logging --- cmd/dotmesh-server/require_zfs.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index b428bc617..56ad3c1ad 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -154,6 +154,9 @@ set -ex if [ ! -e /dev/zfs ]; then mknod -m 660 /dev/zfs c $(cat /sys/class/misc/zfs/dev |sed 's/:/ /g') fi + +echo "`date`: Working directory on host '`hostname`' = '$OUTER_DIR', device = '$BLOCK_DEVICE', zfs mountpoint = '$MOUNTPOINT', pool = '$POOL'" + if ! run_in_zfs_container zpool-status zpool status $POOL; then # TODO: make case where truncate previously succeeded but zpool create @@ -166,12 +169,13 @@ if ! run_in_zfs_container zpool-status zpool status $POOL; then if [ -n "$CONTAINER_POOL_PVC_NAME" ]; then echo "$CONTAINER_POOL_PVC_NAME" > $DIR/dotmesh_pvc_name fi + echo "`date`: Pool created" >> $POOL_LOGFILE else run_in_zfs_container zpool-import zpool import -f -d $OUTER_DIR $POOL + echo "`date`: Pool imported" >> $POOL_LOGFILE fi - echo "`date`: Pool '$POOL' mounted from host mountpoint '$OUTER_DIR' from device '$BLOCK_DEVICE', zfs mountpoint '$MOUNTPOINT' on `hostname`" >> $POOL_LOGFILE else - echo "`date`: Pool '$POOL' already exists, adopted by new dotmesh server on `hostname`" >> $POOL_LOGFILE + echo "`date`: Pool already exists" >> $POOL_LOGFILE fi # Clear away stale socket if existing From badfb4653d7813d6b5bbf9852e624d451401cf0d Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Tue, 19 Jun 2018 11:29:51 +0100 Subject: [PATCH 49/60] #401: Get hostname from the host, not the container --- cmd/dotmesh-server/require_zfs.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 56ad3c1ad..13d4232ff 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -34,14 +34,17 @@ function fetch_zfs { echo "Successfully loaded downloaded ZFS for $KERN :)" } +# Find the hostname from the actual host, rather than the container. +HOSTNAME="`nsenter -t 1 -m -u -n -i hostname`" + # Put the data file inside /var/lib so that we end up on the big # partition if we're in a LinuxKit env. POOL_SIZE=${POOL_SIZE:-10G} DIR=${USE_POOL_DIR:-/var/lib/dotmesh} -DIR=$(echo $DIR |sed s/\#HOSTNAME\#/$(hostname)/) +DIR=$(echo $DIR |sed s/\#HOSTNAME\#/$HOSTNAME/) FILE=${DIR}/dotmesh_data POOL=${USE_POOL_NAME:-pool} -POOL=$(echo $POOL |sed s/\#HOSTNAME\#/$(hostname)/) +POOL=$(echo $POOL |sed s/\#HOSTNAME\#/$HOSTNAME/) DOTMESH_INNER_SERVER_NAME=${DOTMESH_INNER_SERVER_NAME:-dotmesh-server-inner} FLEXVOLUME_DRIVER_DIR=${FLEXVOLUME_DRIVER_DIR:-/usr/libexec/kubernetes/kubelet-plugins/volume/exec} INHERIT_ENVIRONMENT_NAMES=( "FILESYSTEM_METADATA_TIMEOUT" "DOTMESH_UPGRADES_URL" "DOTMESH_UPGRADES_INTERVAL_SECONDS") @@ -155,7 +158,7 @@ if [ ! -e /dev/zfs ]; then mknod -m 660 /dev/zfs c $(cat /sys/class/misc/zfs/dev |sed 's/:/ /g') fi -echo "`date`: Working directory on host '`hostname`' = '$OUTER_DIR', device = '$BLOCK_DEVICE', zfs mountpoint = '$MOUNTPOINT', pool = '$POOL'" +echo "`date`: On host '$HOSTNAME', working directory = '$OUTER_DIR', device = '$BLOCK_DEVICE', zfs mountpoint = '$MOUNTPOINT', pool = '$POOL'" if ! run_in_zfs_container zpool-status zpool status $POOL; then From 568c2655ce218f0e3596df5b89649d67d902c093 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Tue, 19 Jun 2018 13:55:03 +0100 Subject: [PATCH 50/60] #457: Passed in the actual dot name after parsing, rather than the original one with `admin/` still attached --- cmd/dm/pkg/commands/dot.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/dm/pkg/commands/dot.go b/cmd/dm/pkg/commands/dot.go index 21dcffe0b..ff2cd95ce 100644 --- a/cmd/dm/pkg/commands/dot.go +++ b/cmd/dm/pkg/commands/dot.go @@ -238,17 +238,17 @@ func dotShow(cmd *cobra.Command, args []string, out io.Writer) error { return err } - var localDot string + var qualifiedDotName string if len(args) == 1 { - localDot = args[0] + qualifiedDotName = args[0] } else { - localDot, err = dm.CurrentVolume() + qualifiedDotName, err = dm.CurrentVolume() if err != nil { return err } } - namespace, dot, err := remotes.ParseNamespacedVolume(localDot) + namespace, dot, err := remotes.ParseNamespacedVolume(qualifiedDotName) if err != nil { return err } @@ -307,12 +307,12 @@ func dotShow(cmd *cobra.Command, args []string, out io.Writer) error { } } - currentBranch, err := dm.CurrentBranch(localDot) + currentBranch, err := dm.CurrentBranch(qualifiedDotName) if err != nil { return err } - bs, err := dm.AllBranches(localDot) + bs, err := dm.AllBranches(qualifiedDotName) if err != nil { return err } @@ -329,7 +329,7 @@ func dotShow(cmd *cobra.Command, args []string, out io.Writer) error { if branch == "master" { branchDot = masterDot } else { - branchDot, err = dm.BranchInfo(namespace, localDot, branch) + branchDot, err = dm.BranchInfo(namespace, dot, branch) if err != nil { return err } @@ -371,7 +371,7 @@ func dotShow(cmd *cobra.Command, args []string, out io.Writer) error { branchInternalName = "" } - latency, err := dm.GetReplicationLatencyForBranch(localDot, branchInternalName) + latency, err := dm.GetReplicationLatencyForBranch(qualifiedDotName, branchInternalName) if err != nil { fmt.Fprintf(out, "unable to fetch replication status (%s), proceeding...\n", err) } else { From 410ee9d5f26b67285af92b3a4dd9ec249ea5a175 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Tue, 19 Jun 2018 17:02:03 +0100 Subject: [PATCH 51/60] #401: Log Dotmesh server image (which carries the version hash) in the working directory --- cmd/dotmesh-server/require_zfs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/dotmesh-server/require_zfs.sh b/cmd/dotmesh-server/require_zfs.sh index 13d4232ff..29f668697 100755 --- a/cmd/dotmesh-server/require_zfs.sh +++ b/cmd/dotmesh-server/require_zfs.sh @@ -158,7 +158,7 @@ if [ ! -e /dev/zfs ]; then mknod -m 660 /dev/zfs c $(cat /sys/class/misc/zfs/dev |sed 's/:/ /g') fi -echo "`date`: On host '$HOSTNAME', working directory = '$OUTER_DIR', device = '$BLOCK_DEVICE', zfs mountpoint = '$MOUNTPOINT', pool = '$POOL'" +echo "`date`: On host '$HOSTNAME', working directory = '$OUTER_DIR', device = '$BLOCK_DEVICE', zfs mountpoint = '$MOUNTPOINT', pool = '$POOL', Dotmesh image = '$DOTMESH_DOCKER_IMAGE'" if ! run_in_zfs_container zpool-status zpool status $POOL; then From ab810a3ce9dd766b1661d7ad8d33d38604d69008 Mon Sep 17 00:00:00 2001 From: Priya Samuel Date: Wed, 20 Jun 2018 10:00:06 +0100 Subject: [PATCH 52/60] #462: Update progress bar with total size when we get it from the TransferPoll request --- cmd/dm/pkg/remotes/api.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/dm/pkg/remotes/api.go b/cmd/dm/pkg/remotes/api.go index 04528a2fc..79be72fa9 100644 --- a/cmd/dm/pkg/remotes/api.go +++ b/cmd/dm/pkg/remotes/api.go @@ -3,7 +3,6 @@ package remotes import ( "fmt" "io" - "log" "os" "reflect" "regexp" @@ -762,9 +761,6 @@ func (dm *DotmeshAPI) PollTransfer(transferId string, out io.Writer) error { } } - if debugMode { - log.Printf("\nGot DotmeshRPC.GetTransfer response: %+v", result) - } if !started { bar = pb.New64(result.Size) bar.ShowFinalTime = false @@ -773,6 +769,10 @@ func (dm *DotmeshAPI) PollTransfer(transferId string, out io.Writer) error { bar.Start() started = true } + + if result.Size != 0 { + bar.Total = result.Size + } // Numbers reported by data transferred thru dotmesh versus size // of stream reported by 'zfs send -nP' are off by a few kilobytes, // fudge it (maybe no one will notice). From fcc6d51cb5cc9529b503e7873046be9b39b9a40e Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Wed, 20 Jun 2018 14:26:00 +0100 Subject: [PATCH 53/60] #422: Fix goroutine leak in fsMachine.push --- cmd/dotmesh-server/pkg/main/statemachines.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index b0800d91c..50a7685b0 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -2587,6 +2587,10 @@ func (f *fsMachine) push( resp, err := postClient.Do(req) if err != nil { log.Printf("[actualPush:%s] error in postClient.Do: %s", filesystemId, err) + + go func() { + _ = <-errch + }() _ = <-finished return &Event{ Name: "error-from-post-when-pushing", @@ -2603,6 +2607,10 @@ func (f *fsMachine) push( filesystemId, string(responseBody), err, ) + + go func() { + _ = <-errch + }() _ = <-finished return &Event{ Name: "error-reading-push-response-body", @@ -2614,6 +2622,10 @@ func (f *fsMachine) push( if resp.StatusCode != 200 { log.Printf("ABS TEST: Aborting!") + + go func() { + _ = <-errch + }() _ = <-finished return &Event{ Name: "error-pushing-posting", From f6cd3c1282b99969f74936195e6e738f79d04772 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Wed, 20 Jun 2018 14:26:36 +0100 Subject: [PATCH 54/60] #422: Observer.Publish will log long waits for slow subscribers --- cmd/dotmesh-server/pkg/main/controller.go | 4 ++-- cmd/dotmesh-server/pkg/main/observer.go | 17 +++++++++++++++-- cmd/dotmesh-server/pkg/main/statemachines.go | 4 ++-- cmd/dotmesh-server/pkg/main/utils.go | 2 +- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/controller.go b/cmd/dotmesh-server/pkg/main/controller.go index 502f08c7e..d078197d2 100644 --- a/cmd/dotmesh-server/pkg/main/controller.go +++ b/cmd/dotmesh-server/pkg/main/controller.go @@ -50,8 +50,8 @@ func NewInMemoryState(localPoolId string, config Config) *InMemoryState { // a sort of global event bus for filesystems getting new snapshots on // their masters, keyed on filesystem name, which interested parties // such as slaves for that filesystem may subscribe to - newSnapsOnMaster: NewObserver(), - localReceiveProgress: NewObserver(), + newSnapsOnMaster: NewObserver("newSnapsOnMaster"), + localReceiveProgress: NewObserver("localReceiveProgress"), // containers that are running with dotmesh volumes by filesystem id containers: d, containersLock: &sync.Mutex{}, diff --git a/cmd/dotmesh-server/pkg/main/observer.go b/cmd/dotmesh-server/pkg/main/observer.go index c5286d22c..4cb8d072b 100644 --- a/cmd/dotmesh-server/pkg/main/observer.go +++ b/cmd/dotmesh-server/pkg/main/observer.go @@ -5,18 +5,21 @@ package main import ( "fmt" + "log" "strings" "sync" "time" ) type Observer struct { + name string events map[string][]chan interface{} rwMutex sync.RWMutex } -func NewObserver() *Observer { +func NewObserver(name string) *Observer { return &Observer{ + name: name, rwMutex: sync.RWMutex{}, events: map[string][]chan interface{}{}, } @@ -118,7 +121,17 @@ func (o *Observer) Publish(event string, data interface{}) error { return } }() - outputChan <- data + + select { + case outputChan <- data: + // Sent OK without timing out + break + case <-time.After(600 * time.Second): + // Took more than 10 minutes + log.Printf("[Observer.Publish] LONG WAIT to send %#v on %s:%s", data, o.name, event) + outputChan <- data + log.Printf("[Observer.Publish] Finally sent %#v on %s:%s after a long wait", data, o.name, event) + } }(outputChan) } diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 50a7685b0..81a157511 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -38,11 +38,11 @@ func newFilesystemMachine(filesystemId string, s *InMemoryState) *fsMachine { snapshotsModified: make(chan bool), state: s, snapshotsLock: &sync.Mutex{}, - newSnapsOnServers: NewObserver(), + newSnapsOnServers: NewObserver(fmt.Sprintf("newSnapsOnServers:%s", filesystemId)), currentState: "discovering", status: "", lastTransitionTimestamp: time.Now().UnixNano(), - transitionObserver: NewObserver(), + transitionObserver: NewObserver(fmt.Sprintf("transitionObserver:%s", filesystemId)), lastTransferRequest: TransferRequest{}, // In the case where we're receiving a push (pushPeerState), it's the // POST handler on our http server which handles the receiving of the diff --git a/cmd/dotmesh-server/pkg/main/utils.go b/cmd/dotmesh-server/pkg/main/utils.go index 1c939c3af..1076bf102 100644 --- a/cmd/dotmesh-server/pkg/main/utils.go +++ b/cmd/dotmesh-server/pkg/main/utils.go @@ -281,7 +281,7 @@ func runForever(f func() error, label string, errorBackoff, successBackoff time. } } -var deathObserver *Observer = NewObserver() +var deathObserver *Observer = NewObserver("deathObserver") // run while filesystem lives func runWhileFilesystemLives(f func() error, label string, filesystemId string, errorBackoff, successBackoff time.Duration) { From 915489aedd27c704034a0f8c3795ad6fd181ead7 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Wed, 20 Jun 2018 17:33:03 +0100 Subject: [PATCH 55/60] #422: Fix goroutine leak for deleted filesystems. When a filesystem is deleted, `updateEtcdAboutSnapshots` will block forever on the `snapshotsModified` channel, which will never send anything as it's been deleted. However, `updateEtcdAboutSnapshots` runs inside `runWhileFilesystemLives` which has subscribed to the filesystem death observer, so an `Observer.Publish` goroutine will block forever trying to notify it that the filesystem is dead. Result: Two goroutines leaked per deleted filesystem. --- cmd/dotmesh-server/pkg/main/statemachines.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 81a157511..9479a815e 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -283,9 +283,13 @@ func (f *fsMachine) updateEtcdAboutSnapshots() error { ) // wait until the state machine notifies us that it's changed the - // snapshots - _ = <-f.snapshotsModified - log.Printf("[updateEtcdAboutSnapshots] going 'round the loop") + // snapshots, but have a timeout in case this filesystem is deleted so we don't block forever + select { + case _ = <-f.snapshotsModified: + log.Printf("[updateEtcdAboutSnapshots] going 'round the loop") + case <-time.After(60 * time.Second): + } + return nil } From a69cbc9bf397b4c273cd1439f9aba20528e25fd4 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Thu, 21 Jun 2018 12:22:58 +0100 Subject: [PATCH 56/60] #422: Re-use the `EtcdKeysAPI`. This shouldn't make a huge difference, as it's a very lightweight wrapper around the etcd client, and we already re-used a single instance of that. But it was a bit wasteful. --- cmd/dotmesh-server/pkg/main/etcd.go | 80 +++++++++++++---------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/etcd.go b/cmd/dotmesh-server/pkg/main/etcd.go index 1a461da94..957184688 100644 --- a/cmd/dotmesh-server/pkg/main/etcd.go +++ b/cmd/dotmesh-server/pkg/main/etcd.go @@ -21,48 +21,11 @@ import ( // etcd related pieces, including the parts of InMemoryState which interact with etcd -func getEtcdKeysApi() (client.KeysAPI, error) { - c, err := getEtcd() - if err != nil { - return nil, err - } - return client.NewKeysAPI(c), nil -} - -var etcdClient client.Client -var once Once -var onceAgain Once - -func transportFromTLS(certFile, keyFile, caFile string) (*http.Transport, error) { - // Load client cert - cert, err := tls.LoadX509KeyPair(certFile, keyFile) - if err != nil { - return nil, err - } - - // Load CA cert - caCert, err := ioutil.ReadFile(caFile) - if err != nil { - return nil, err - } - caCertPool := x509.NewCertPool() - caCertPool.AppendCertsFromPEM(caCert) - - // Setup HTTPS client - tlsConfig := &tls.Config{ - Certificates: []tls.Certificate{cert}, - RootCAs: caCertPool, - } - tlsConfig.BuildNameToCertificate() - transport := &http.Transport{ - TLSClientConfig: tlsConfig, - } - return transport, nil -} +var etcdKeysAPI client.KeysAPI +var etcdConnectionOnce Once -// TODO maybe connection pooling -func getEtcd() (client.Client, error) { - once.Do(func() { +func getEtcdKeysApi() (client.KeysAPI, error) { + etcdConnectionOnce.Do(func() { var err error endpoint := os.Getenv("DOTMESH_ETCD_ENDPOINT") if endpoint == "" { @@ -92,14 +55,43 @@ func getEtcd() (client.Client, error) { // unavailable HeaderTimeoutPerRequest: time.Second * 10, } - etcdClient, err = client.New(cfg) + etcdClient, err := client.New(cfg) if err != nil { // maybe retry, instead of ending it all panic(err) } + etcdKeysAPI = client.NewKeysAPI(etcdClient) }) - // TODO: change signature, never errors - return etcdClient, nil + return etcdKeysAPI, nil +} + +var onceAgain Once + +func transportFromTLS(certFile, keyFile, caFile string) (*http.Transport, error) { + // Load client cert + cert, err := tls.LoadX509KeyPair(certFile, keyFile) + if err != nil { + return nil, err + } + + // Load CA cert + caCert, err := ioutil.ReadFile(caFile) + if err != nil { + return nil, err + } + caCertPool := x509.NewCertPool() + caCertPool.AppendCertsFromPEM(caCert) + + // Setup HTTPS client + tlsConfig := &tls.Config{ + Certificates: []tls.Certificate{cert}, + RootCAs: caCertPool, + } + tlsConfig.BuildNameToCertificate() + transport := &http.Transport{ + TLSClientConfig: tlsConfig, + } + return transport, nil } func guessIPv4Addresses() ([]string, error) { From 14d13a6490ae7147fedcc695eaa305b8ba1405d8 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Thu, 21 Jun 2018 12:23:33 +0100 Subject: [PATCH 57/60] NFC: Utility function for debugging/tracing, which returns its stack trace as a string. This is handy for checking who's calling a widely-used utility function, dynamically. --- cmd/dotmesh-server/pkg/main/utils.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cmd/dotmesh-server/pkg/main/utils.go b/cmd/dotmesh-server/pkg/main/utils.go index 1076bf102..d0bc1a2d5 100644 --- a/cmd/dotmesh-server/pkg/main/utils.go +++ b/cmd/dotmesh-server/pkg/main/utils.go @@ -12,6 +12,7 @@ import ( "net/http" "os" "os/exec" + "runtime" "strings" "sync" "sync/atomic" @@ -671,3 +672,16 @@ func quietLogger(logMessage string) { log.Printf(logMessage) } } + +func getMyStack() string { + len := 1024 + for { + buf := make([]byte, len) + used := runtime.Stack(buf, false) + if used < len { + return string(buf[:used]) + } else { + len = len * 2 + } + } +} From 345e4b7b13b38f550982b09242faf03ee4343c1c Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Thu, 21 Jun 2018 16:11:52 +0100 Subject: [PATCH 58/60] ops#77: Logging of the reason for entering `backoffState` from `receiving` (can be extended to other states easily) --- cmd/dotmesh-server/pkg/main/statemachines.go | 61 ++++++++++---------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 9479a815e..5ed954c2a 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -250,6 +250,7 @@ func (f *fsMachine) updateEtcdAboutSnapshots() error { if err != nil { return err } + // as soon as we're connected, eagerly: if we know about some // snapshots, **or the absence of them**, set this in etcd. serialized, err := func() ([]byte, error) { @@ -1254,6 +1255,7 @@ func missingState(f *fsMachine) stateFn { f.transitionedTo("missing", "waiting for snapshots or requests") select { case _ = <-newSnapsOnMaster: + f.transitionedTo("missing", "new snapshots found on master") return receivingState case e := <-f.innerRequests: f.transitionedTo("missing", fmt.Sprintf("handling %s", e.Name)) @@ -1384,6 +1386,19 @@ func missingState(f *fsMachine) stateFn { return backoffState } +func backoffStateWithReason(reason string) func(f *fsMachine) stateFn { + return func(f *fsMachine) stateFn { + f.transitionedTo("backoff", fmt.Sprintf("pausing due to %s", reason)) + log.Printf("entering backoff state for %s", f.filesystemId) + // TODO if we know that we're supposed to be mounted or unmounted, based on + // etcd state, actually put us back into the required state rather than + // just passively going back into discovering... or maybe, do that in + // discoveringState? + time.Sleep(time.Second) + return discoveringState + } +} + func backoffState(f *fsMachine) stateFn { f.transitionedTo("backoff", "pausing") log.Printf("entering backoff state for %s", f.filesystemId) @@ -1531,19 +1546,17 @@ func receivingState(f *fsMachine) stateFn { if err != nil { switch err := err.(type) { case *ToSnapsUpToDate: - log.Printf("receivingState: ToSnapsUpToDate %s got %s", f.filesystemId, err) // this is fine, we're up-to-date - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: ToSnapsUpToDate %s got %s", f.filesystemId, err)) case *NoFromSnaps: - log.Printf("receivingState: NoFromSnaps %s got %s", f.filesystemId, err) // this is fine, no snaps; can't replicate yet, but will - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: NoFromSnaps %s got %s", f.filesystemId, err)) case *ToSnapsAhead: log.Printf("receivingState: ToSnapsAhead %s got %s", f.filesystemId, err) // erk, slave is ahead of master errx := f.recoverFromDivergence(err.latestCommonSnapshot) if errx != nil { - log.Printf("receivingState(%s): Unable to recover from divergence: %+v", f.filesystemId, errx) + return backoffStateWithReason(fmt.Sprintf("receivingState(%s): Unable to recover from divergence: %+v", f.filesystemId, errx)) } // Go to discovering state, to update the world with our recent ZFS actions. return backoffState @@ -1551,19 +1564,17 @@ func receivingState(f *fsMachine) stateFn { log.Printf("receivingState: ToSnapsDiverged %s got %s", f.filesystemId, err) errx := f.recoverFromDivergence(err.latestCommonSnapshot) if errx != nil { - log.Printf("receivingState(%s): Unable to recover from divergence: %+v", f.filesystemId, errx) + return backoffStateWithReason(fmt.Sprintf("receivingState(%s): Unable to recover from divergence: %+v", f.filesystemId, errx)) } // Go to discovering state, to update the world with our recent ZFS actions. return backoffState case *NoCommonSnapshots: - log.Printf("receivingState: NoCommonSnapshots %s got %s", f.filesystemId, err) // erk, no common snapshots between master and slave // TODO: create a new local clone (branch), then delete the current // filesystem to enable replication to continue - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: NoCommonSnapshots %s got %+v", f.filesystemId, err)) default: - log.Printf("receivingState: default error handler %s got %s", f.filesystemId, err) - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: default error handler %s got %s", f.filesystemId, err)) } } @@ -1580,8 +1591,7 @@ func receivingState(f *fsMachine) stateFn { case NoSuchClone: // Normal case for non-clone filesystems, continue. default: - log.Printf("Error trying to lookup clone by id: %s", err) - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: Error trying to lookup clone by id: %+v", err)) } } else { // Found a clone, let's base our pull on it @@ -1597,19 +1607,16 @@ func receivingState(f *fsMachine) stateFn { f.state.masterFor(f.filesystemId), ) if len(addresses) == 0 { - log.Printf("No known address for current master of %s", f.filesystemId) - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: No known address for current master of %s", f.filesystemId)) } _, _, apiKey, err := getPasswords("admin") if err != nil { - log.Printf("Attempting to pull %s got %s", f.filesystemId, err) - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: Attempting to pull %s got %+v", f.filesystemId, err)) } url, err := deduceUrl(context.Background(), addresses, "internal", "admin", apiKey) if err != nil { - log.Printf("%s", err) - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: deduceUrl failed with %+v", err)) } req, err := http.NewRequest( @@ -1623,15 +1630,13 @@ func receivingState(f *fsMachine) stateFn { nil, ) if err != nil { - log.Printf("Attempting to pull %s got %s", f.filesystemId, err) - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: Attempting to pull %s got %+v", f.filesystemId, err)) } req.SetBasicAuth("admin", apiKey) client := &http.Client{} resp, err := client.Do(req) if err != nil { - log.Printf("Attempting to pull %s got %s", f.filesystemId, err) - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: Attempting to pull %s got %+v", f.filesystemId, err)) } log.Printf( "Debug: curl -u admin:[pw] %s/filesystems/%s/%s/%s", @@ -1676,7 +1681,7 @@ func receivingState(f *fsMachine) stateFn { prelude, err := consumePrelude(pipeReader) if err != nil { _ = <-finished - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: error consuming prelude: %+v", err)) } log.Printf("[pull] Got prelude %v", prelude) @@ -1688,18 +1693,16 @@ func receivingState(f *fsMachine) stateFn { f.transitionedTo("receiving", "finished pipe") if err != nil { - log.Printf( - "Got error %s when running zfs recv for %s, check zfs-recv-stderr.log", + return backoffStateWithReason(fmt.Sprintf("receivingState: Got error %+v when running zfs recv for %s, check zfs-recv-stderr.log", err, f.filesystemId, - ) - return backoffState + )) } else { log.Printf("Successfully received %s => %s for %s", fromSnap, snapRange.toSnap.Id) } log.Printf("[pull] about to start applying prelude on %v", pipeReader) err = applyPrelude(prelude, fq(f.filesystemId)) if err != nil { - return backoffState + return backoffStateWithReason(fmt.Sprintf("receivingState: Error applying prelude: %+v", err)) } return discoveringState } @@ -2625,8 +2628,6 @@ func (f *fsMachine) push( log.Printf("[actualPush:%s] Got response body while pushing: status %d, body %s", filesystemId, resp.StatusCode, string(responseBody)) if resp.StatusCode != 200 { - log.Printf("ABS TEST: Aborting!") - go func() { _ = <-errch }() From a6b31a97909eb4e9ce96dd81d5a8e293c4f0509b Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Thu, 21 Jun 2018 16:12:17 +0100 Subject: [PATCH 59/60] ops#77: Go to discoveringState rather than backoffState after healing a divergence. --- cmd/dotmesh-server/pkg/main/statemachines.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/statemachines.go b/cmd/dotmesh-server/pkg/main/statemachines.go index 5ed954c2a..109b96483 100644 --- a/cmd/dotmesh-server/pkg/main/statemachines.go +++ b/cmd/dotmesh-server/pkg/main/statemachines.go @@ -1559,7 +1559,7 @@ func receivingState(f *fsMachine) stateFn { return backoffStateWithReason(fmt.Sprintf("receivingState(%s): Unable to recover from divergence: %+v", f.filesystemId, errx)) } // Go to discovering state, to update the world with our recent ZFS actions. - return backoffState + return discoveringState case *ToSnapsDiverged: log.Printf("receivingState: ToSnapsDiverged %s got %s", f.filesystemId, err) errx := f.recoverFromDivergence(err.latestCommonSnapshot) @@ -1567,7 +1567,7 @@ func receivingState(f *fsMachine) stateFn { return backoffStateWithReason(fmt.Sprintf("receivingState(%s): Unable to recover from divergence: %+v", f.filesystemId, errx)) } // Go to discovering state, to update the world with our recent ZFS actions. - return backoffState + return discoveringState case *NoCommonSnapshots: // erk, no common snapshots between master and slave // TODO: create a new local clone (branch), then delete the current From 15bb55eb6fc494f0418265c3b971b41d06f43df5 Mon Sep 17 00:00:00 2001 From: Alaric Snell-Pym Date: Thu, 21 Jun 2018 16:13:07 +0100 Subject: [PATCH 60/60] ops#77: Don't say there's new snapshots on the master when there's NO snapshots on the master. This just caused fsMachines in `missingState` to leap into `receivingState` when there's nothing to receive, causing a backoff. --- cmd/dotmesh-server/pkg/main/etcd.go | 38 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/cmd/dotmesh-server/pkg/main/etcd.go b/cmd/dotmesh-server/pkg/main/etcd.go index 957184688..ff1a1ca65 100644 --- a/cmd/dotmesh-server/pkg/main/etcd.go +++ b/cmd/dotmesh-server/pkg/main/etcd.go @@ -778,28 +778,26 @@ func (s *InMemoryState) updateSnapshotsFromKnownState( filesystem, s.masterFor(filesystem), server, ) if s.masterFor(filesystem) == server { - // notify any interested parties that there are some new snapshots on - // the master - var latest snapshot if len(*snapshots) > 0 { - latest = (*snapshots)[len(*snapshots)-1] - } else { - latest = snapshot{} + // notify any interested parties that there are some new snapshots on + // the master + + latest := (*snapshots)[len(*snapshots)-1] + log.Printf( + "[updateSnapshots] publishing latest snapshot %s on %s", + latest, filesystem, + ) + go func() { + err := s.newSnapsOnMaster.Publish(filesystem, latest) + if err != nil { + log.Printf( + "[updateSnapshotsFromKnownState] "+ + "error publishing to newSnapsOnMaster: %s", + err, + ) + } + }() } - log.Printf( - "[updateSnapshots] publishing latest snapshot %s on %s", - latest, filesystem, - ) - go func() { - err := s.newSnapsOnMaster.Publish(filesystem, latest) - if err != nil { - log.Printf( - "[updateSnapshotsFromKnownState] "+ - "error publishing to newSnapsOnMaster: %s", - err, - ) - } - }() } // also slice it filesystem-wise, and publish to any observers // listening on a per-filesystem observer parameterized on server