cmd/swarm/swarm-smoke: sliding window test should not time out (#19152)

6 years ago · 34c85def3e
parent 81ed700157
commit 34c85def3e
2 changed files with 45 additions and 30 deletions
--- a/cmd/swarm/flags.go
+++ b/cmd/swarm/flags.go
@ -149,8 +149,9 @@ var (
 	}
 	SwarmStoreCacheCapacity = cli.UintFlag{
 		Name:   "store.cache.size",
-		Usage:  "Number of recent chunks cached in memory (default 5000)",
+		Usage:  "Number of recent chunks cached in memory",
 		EnvVar: SwarmEnvStoreCacheCapacity,
+		Value:  10000,
 	}
 	SwarmCompressedFlag = cli.BoolFlag{
 		Name:  "compressed",
--- a/cmd/swarm/swarm-smoke/sliding_window.go
+++ b/cmd/swarm/swarm-smoke/sliding_window.go
@ -42,23 +42,16 @@ func slidingWindowCmd(ctx *cli.Context, tuid string) error {
 		errc <- slidingWindow(ctx, tuid)
 	}()

-	select {
-	case err := <-errc:
+	err := <-errc
 	if err != nil {
 		metrics.GetOrRegisterCounter(fmt.Sprintf("%s.fail", commandName), nil).Inc(1)
 	}
 	return err
-	case <-time.After(time.Duration(timeout) * time.Second):
-		metrics.GetOrRegisterCounter(fmt.Sprintf("%s.timeout", commandName), nil).Inc(1)
-
-		return fmt.Errorf("timeout after %v sec", timeout)
-	}
 }

 func slidingWindow(ctx *cli.Context, tuid string) error {
 	var hashes []uploadResult //swarm hashes of the uploads
 	nodes := len(hosts)
-	const iterationTimeout = 30 * time.Second
 	log.Info("sliding window test started", "tuid", tuid, "nodes", nodes, "filesize(kb)", filesize, "timeout", timeout)
 	uploadedBytes := 0
 	networkDepth := 0
@ -66,6 +59,7 @@ func slidingWindow(ctx *cli.Context, tuid string) error {

 outer:
 	for {
+		seed = int(time.Now().UTC().UnixNano())
 		log.Info("uploading to "+httpEndpoint(hosts[0])+" and syncing", "seed", seed)

 		t1 := time.Now()
@ -79,6 +73,7 @@ outer:
 		}

 		metrics.GetOrRegisterResettingTimer("sliding-window.upload-time", nil).UpdateSince(t1)
+		metrics.GetOrRegisterGauge("sliding-window.upload-depth", nil).Update(int64(len(hashes)))

 		fhash, err := digest(bytes.NewReader(randomBytes))
 		if err != nil {
@ -90,37 +85,56 @@ outer:
 		hashes = append(hashes, uploadResult{hash: hash, digest: fhash})
 		time.Sleep(time.Duration(syncDelay) * time.Second)
 		uploadedBytes += filesize * 1000
-
+		q := make(chan struct{}, 1)
+		d := make(chan struct{})
+		defer close(q)
+		defer close(d)
 		for i, v := range hashes {
-			timeout := time.After(time.Duration(timeout) * time.Second)
+			timeoutC := time.After(time.Duration(timeout) * time.Second)
 			errored = false

-		inner:
+		task:
 			for {
 				select {
-				case <-timeout:
-					errored = true
-					log.Error("error retrieving hash. timeout", "hash idx", i, "err", err)
-					metrics.GetOrRegisterCounter("sliding-window.single.error", nil).Inc(1)
-					break inner
-				default:
+				case q <- struct{}{}:
+					go func() {
+						var start time.Time
+						done := false
+						for !done {
+							log.Info("trying to retrieve hash", "hash", v.hash)
 							idx := 1 + rand.Intn(len(hosts)-1)
 							ruid := uuid.New()[:8]
-					start := time.Now()
+							start = time.Now()
+							// fetch hangs when swarm dies out, so we have to jump through a bit more hoops to actually
+							// catch the timeout, but also allow this retry logic
 							err := fetch(v.hash, httpEndpoint(hosts[idx]), v.digest, ruid, "")
 							if err != nil {
-						continue inner
+								log.Error("error fetching hash", "err", err)
+								continue
+							}
+							done = true
 						}
 						metrics.GetOrRegisterResettingTimer("sliding-window.single.fetch-time", nil).UpdateSince(start)
-					break inner
+						d <- struct{}{}
+					}()
+				case <-d:
+					<-q
+					break task
+				case <-timeoutC:
+					errored = true
+					log.Error("error retrieving hash. timeout", "hash idx", i)
+					metrics.GetOrRegisterCounter("sliding-window.single.error", nil).Inc(1)
+					break outer
+				default:
 				}
 			}

-			if errored {
-				break outer
-			}
 			networkDepth = i
 			metrics.GetOrRegisterGauge("sliding-window.network-depth", nil).Update(int64(networkDepth))
+			log.Info("sliding window test successfully fetched file", "currentDepth", networkDepth)
+			// this test might take a long time to finish - but we'd like to see metrics while they accumulate and not just when
+			// the test finishes. therefore emit the metrics on each iteration
+			emitMetrics(ctx)
 		}
 	}