From 04c6369a09baa5267a01713663f7c1cbb08896c9 Mon Sep 17 00:00:00 2001
From: Felix Lange <fjl@twurst.com>
Date: Mon, 7 Dec 2015 12:06:49 +0100
Subject: [PATCH] p2p, p2p/discover: track bootstrap state in p2p/discover

This change simplifies the dial scheduling logic because it
no longer needs to track whether the discovery table has been
bootstrapped.
---
 p2p/dial.go           |  33 +++----------
 p2p/dial_test.go      |  28 ++++-------
 p2p/discover/node.go  |  20 +++++++-
 p2p/discover/table.go | 105 +++++++++++++++++++++++++++---------------
 p2p/discover/udp.go   |  12 ++---
 p2p/server.go         |   3 ++
 6 files changed, 110 insertions(+), 91 deletions(-)

diff --git a/p2p/dial.go b/p2p/dial.go
index 0fd3a4cf5..bdc9f852c 100644
--- a/p2p/dial.go
+++ b/p2p/dial.go
@@ -46,7 +46,6 @@ type dialstate struct {
 	ntab        discoverTable
 
 	lookupRunning bool
-	bootstrapped  bool
 
 	dialing     map[discover.NodeID]connFlag
 	lookupBuf   []*discover.Node // current discovery lookup results
@@ -58,7 +57,6 @@ type dialstate struct {
 type discoverTable interface {
 	Self() *discover.Node
 	Close()
-	Bootstrap([]*discover.Node)
 	Lookup(target discover.NodeID) []*discover.Node
 	ReadRandomNodes([]*discover.Node) int
 }
@@ -84,13 +82,9 @@ type dialTask struct {
 
 // discoverTask runs discovery table operations.
 // Only one discoverTask is active at any time.
-//
-// If bootstrap is true, the task runs Table.Bootstrap,
-// otherwise it performs a random lookup and leaves the
-// results in the task.
+// discoverTask.Do performs a random lookup.
 type discoverTask struct {
-	bootstrap bool
-	results   []*discover.Node
+	results []*discover.Node
 }
 
 // A waitExpireTask is generated if there are no other tasks
@@ -154,7 +148,7 @@ func (s *dialstate) newTasks(nRunning int, peers map[discover.NodeID]*Peer, now
 	// Use random nodes from the table for half of the necessary
 	// dynamic dials.
 	randomCandidates := needDynDials / 2
-	if randomCandidates > 0 && s.bootstrapped {
+	if randomCandidates > 0 {
 		n := s.ntab.ReadRandomNodes(s.randomNodes)
 		for i := 0; i < randomCandidates && i < n; i++ {
 			if addDial(dynDialedConn, s.randomNodes[i]) {
@@ -171,12 +165,10 @@ func (s *dialstate) newTasks(nRunning int, peers map[discover.NodeID]*Peer, now
 		}
 	}
 	s.lookupBuf = s.lookupBuf[:copy(s.lookupBuf, s.lookupBuf[i:])]
-	// Launch a discovery lookup if more candidates are needed. The
-	// first discoverTask bootstraps the table and won't return any
-	// results.
+	// Launch a discovery lookup if more candidates are needed.
 	if len(s.lookupBuf) < needDynDials && !s.lookupRunning {
 		s.lookupRunning = true
-		newtasks = append(newtasks, &discoverTask{bootstrap: !s.bootstrapped})
+		newtasks = append(newtasks, &discoverTask{})
 	}
 
 	// Launch a timer to wait for the next node to expire if all
@@ -196,9 +188,6 @@ func (s *dialstate) taskDone(t task, now time.Time) {
 		s.hist.add(t.dest.ID, now.Add(dialHistoryExpiration))
 		delete(s.dialing, t.dest.ID)
 	case *discoverTask:
-		if t.bootstrap {
-			s.bootstrapped = true
-		}
 		s.lookupRunning = false
 		s.lookupBuf = append(s.lookupBuf, t.results...)
 	}
@@ -221,10 +210,6 @@ func (t *dialTask) String() string {
 }
 
 func (t *discoverTask) Do(srv *Server) {
-	if t.bootstrap {
-		srv.ntab.Bootstrap(srv.BootstrapNodes)
-		return
-	}
 	// newTasks generates a lookup task whenever dynamic dials are
 	// necessary. Lookups need to take some time, otherwise the
 	// event loop spins too fast.
@@ -238,12 +223,8 @@ func (t *discoverTask) Do(srv *Server) {
 	t.results = srv.ntab.Lookup(target)
 }
 
-func (t *discoverTask) String() (s string) {
-	if t.bootstrap {
-		s = "discovery bootstrap"
-	} else {
-		s = "discovery lookup"
-	}
+func (t *discoverTask) String() string {
+	s := "discovery lookup"
 	if len(t.results) > 0 {
 		s += fmt.Sprintf(" (%d results)", len(t.results))
 	}
diff --git a/p2p/dial_test.go b/p2p/dial_test.go
index d24e03e29..0127b2d87 100644
--- a/p2p/dial_test.go
+++ b/p2p/dial_test.go
@@ -76,15 +76,10 @@ func runDialTest(t *testing.T, test dialtest) {
 
 type fakeTable []*discover.Node
 
-func (t fakeTable) Self() *discover.Node       { return new(discover.Node) }
-func (t fakeTable) Close()                     {}
-func (t fakeTable) Bootstrap([]*discover.Node) {}
-func (t fakeTable) Lookup(target discover.NodeID) []*discover.Node {
-	return nil
-}
-func (t fakeTable) ReadRandomNodes(buf []*discover.Node) int {
-	return copy(buf, t)
-}
+func (t fakeTable) Self() *discover.Node                     { return new(discover.Node) }
+func (t fakeTable) Close()                                   {}
+func (t fakeTable) Lookup(discover.NodeID) []*discover.Node  { return nil }
+func (t fakeTable) ReadRandomNodes(buf []*discover.Node) int { return copy(buf, t) }
 
 // This test checks that dynamic dials are launched from discovery results.
 func TestDialStateDynDial(t *testing.T) {
@@ -98,7 +93,7 @@ func TestDialStateDynDial(t *testing.T) {
 					{rw: &conn{flags: dynDialedConn, id: uintID(1)}},
 					{rw: &conn{flags: dynDialedConn, id: uintID(2)}},
 				},
-				new: []task{&discoverTask{bootstrap: true}},
+				new: []task{&discoverTask{}},
 			},
 			// Dynamic dials are launched when it completes.
 			{
@@ -108,7 +103,7 @@ func TestDialStateDynDial(t *testing.T) {
 					{rw: &conn{flags: dynDialedConn, id: uintID(2)}},
 				},
 				done: []task{
-					&discoverTask{bootstrap: true, results: []*discover.Node{
+					&discoverTask{results: []*discover.Node{
 						{ID: uintID(2)}, // this one is already connected and not dialed.
 						{ID: uintID(3)},
 						{ID: uintID(4)},
@@ -238,22 +233,15 @@ func TestDialStateDynDialFromTable(t *testing.T) {
 	runDialTest(t, dialtest{
 		init: newDialState(nil, table, 10),
 		rounds: []round{
-			// Discovery bootstrap is launched.
-			{
-				new: []task{&discoverTask{bootstrap: true}},
-			},
 			// 5 out of 8 of the nodes returned by ReadRandomNodes are dialed.
 			{
-				done: []task{
-					&discoverTask{bootstrap: true},
-				},
 				new: []task{
 					&dialTask{dynDialedConn, &discover.Node{ID: uintID(1)}},
 					&dialTask{dynDialedConn, &discover.Node{ID: uintID(2)}},
 					&dialTask{dynDialedConn, &discover.Node{ID: uintID(3)}},
 					&dialTask{dynDialedConn, &discover.Node{ID: uintID(4)}},
 					&dialTask{dynDialedConn, &discover.Node{ID: uintID(5)}},
-					&discoverTask{bootstrap: false},
+					&discoverTask{},
 				},
 			},
 			// Dialing nodes 1,2 succeeds. Dials from the lookup are launched.
@@ -275,7 +263,7 @@ func TestDialStateDynDialFromTable(t *testing.T) {
 					&dialTask{dynDialedConn, &discover.Node{ID: uintID(10)}},
 					&dialTask{dynDialedConn, &discover.Node{ID: uintID(11)}},
 					&dialTask{dynDialedConn, &discover.Node{ID: uintID(12)}},
-					&discoverTask{bootstrap: false},
+					&discoverTask{},
 				},
 			},
 			// Dialing nodes 3,4,5 fails. The dials from the lookup succeed.
diff --git a/p2p/discover/node.go b/p2p/discover/node.go
index fac493f01..c4a3b5011 100644
--- a/p2p/discover/node.go
+++ b/p2p/discover/node.go
@@ -80,6 +80,24 @@ func (n *Node) Incomplete() bool {
 	return n.IP == nil
 }
 
+// checks whether n is a valid complete node.
+func (n *Node) validateComplete() error {
+	if n.Incomplete() {
+		return errors.New("incomplete node")
+	}
+	if n.UDP == 0 {
+		return errors.New("missing UDP port")
+	}
+	if n.TCP == 0 {
+		return errors.New("missing TCP port")
+	}
+	if n.IP.IsMulticast() || n.IP.IsUnspecified() {
+		return errors.New("invalid IP (multicast/unspecified)")
+	}
+	_, err := n.ID.Pubkey() // validate the key (on curve, etc.)
+	return err
+}
+
 // The string representation of a Node is a URL.
 // Please see ParseNode for a description of the format.
 func (n *Node) String() string {
@@ -249,7 +267,7 @@ func (id NodeID) Pubkey() (*ecdsa.PublicKey, error) {
 	p.X.SetBytes(id[:half])
 	p.Y.SetBytes(id[half:])
 	if !p.Curve.IsOnCurve(p.X, p.Y) {
-		return nil, errors.New("not a point on the S256 curve")
+		return nil, errors.New("id is invalid secp256k1 curve point")
 	}
 	return p, nil
 }
diff --git a/p2p/discover/table.go b/p2p/discover/table.go
index efa6e8eea..abb7980f8 100644
--- a/p2p/discover/table.go
+++ b/p2p/discover/table.go
@@ -25,6 +25,7 @@ package discover
 import (
 	"crypto/rand"
 	"encoding/binary"
+	"fmt"
 	"net"
 	"sort"
 	"sync"
@@ -56,7 +57,7 @@ type Table struct {
 	nursery []*Node           // bootstrap nodes
 	db      *nodeDB           // database of known nodes
 
-	refreshReq chan struct{}
+	refreshReq chan chan struct{}
 	closeReq   chan struct{}
 	closed     chan struct{}
 
@@ -102,7 +103,7 @@ func newTable(t transport, ourID NodeID, ourAddr *net.UDPAddr, nodeDBPath string
 		self:       NewNode(ourID, ourAddr.IP, uint16(ourAddr.Port), uint16(ourAddr.Port)),
 		bonding:    make(map[NodeID]*bondproc),
 		bondslots:  make(chan struct{}, maxBondingPingPongs),
-		refreshReq: make(chan struct{}),
+		refreshReq: make(chan chan struct{}),
 		closeReq:   make(chan struct{}),
 		closed:     make(chan struct{}),
 	}
@@ -179,21 +180,27 @@ func (tab *Table) Close() {
 	}
 }
 
-// Bootstrap sets the bootstrap nodes. These nodes are used to connect
-// to the network if the table is empty. Bootstrap will also attempt to
-// fill the table by performing random lookup operations on the
-// network.
-func (tab *Table) Bootstrap(nodes []*Node) {
+// SetFallbackNodes sets the initial points of contact. These nodes
+// are used to connect to the network if the table is empty and there
+// are no known nodes in the database.
+func (tab *Table) SetFallbackNodes(nodes []*Node) error {
+	for _, n := range nodes {
+		if err := n.validateComplete(); err != nil {
+			return fmt.Errorf("bad bootstrap/fallback node %q (%v)", n, err)
+		}
+	}
 	tab.mutex.Lock()
-	// TODO: maybe filter nodes with bad fields (nil, etc.) to avoid strange crashes
 	tab.nursery = make([]*Node, 0, len(nodes))
 	for _, n := range nodes {
 		cpy := *n
+		// Recompute cpy.sha because the node might not have been
+		// created by NewNode or ParseNode.
 		cpy.sha = crypto.Sha3Hash(n.ID[:])
 		tab.nursery = append(tab.nursery, &cpy)
 	}
 	tab.mutex.Unlock()
-	tab.requestRefresh()
+	tab.refresh()
+	return nil
 }
 
 // Resolve searches for a specific node with the given ID.
@@ -224,26 +231,36 @@ func (tab *Table) Resolve(targetID NodeID) *Node {
 // The given target does not need to be an actual node
 // identifier.
 func (tab *Table) Lookup(targetID NodeID) []*Node {
+	return tab.lookup(targetID, true)
+}
+
+func (tab *Table) lookup(targetID NodeID, refreshIfEmpty bool) []*Node {
 	var (
 		target         = crypto.Sha3Hash(targetID[:])
 		asked          = make(map[NodeID]bool)
 		seen           = make(map[NodeID]bool)
 		reply          = make(chan []*Node, alpha)
 		pendingQueries = 0
+		result         *nodesByDistance
 	)
 	// don't query further if we hit ourself.
 	// unlikely to happen often in practice.
 	asked[tab.self.ID] = true
 
-	tab.mutex.Lock()
-	// generate initial result set
-	result := tab.closest(target, bucketSize)
-	tab.mutex.Unlock()
-
-	// If the result set is empty, all nodes were dropped, refresh.
-	if len(result.entries) == 0 {
-		tab.requestRefresh()
-		return nil
+	for {
+		tab.mutex.Lock()
+		// generate initial result set
+		result = tab.closest(target, bucketSize)
+		tab.mutex.Unlock()
+		if len(result.entries) > 0 || !refreshIfEmpty {
+			break
+		}
+		// The result set is empty, all nodes were dropped, refresh.
+		// We actually wait for the refresh to complete here. The very
+		// first query will hit this case and run the bootstrapping
+		// logic.
+		<-tab.refresh()
+		refreshIfEmpty = false
 	}
 
 	for {
@@ -287,24 +304,24 @@ func (tab *Table) Lookup(targetID NodeID) []*Node {
 	return result.entries
 }
 
-func (tab *Table) requestRefresh() {
+func (tab *Table) refresh() <-chan struct{} {
+	done := make(chan struct{})
 	select {
-	case tab.refreshReq <- struct{}{}:
+	case tab.refreshReq <- done:
 	case <-tab.closed:
+		close(done)
 	}
+	return done
 }
 
+// refreshLoop schedules doRefresh runs and coordinates shutdown.
 func (tab *Table) refreshLoop() {
-	defer func() {
-		tab.db.close()
-		if tab.net != nil {
-			tab.net.close()
-		}
-		close(tab.closed)
-	}()
-
-	timer := time.NewTicker(autoRefreshInterval)
-	var done chan struct{}
+	var (
+		timer   = time.NewTicker(autoRefreshInterval)
+		waiting []chan struct{} // accumulates waiting callers while doRefresh runs
+		done    chan struct{}   // where doRefresh reports completion
+	)
+loop:
 	for {
 		select {
 		case <-timer.C:
@@ -312,20 +329,34 @@ func (tab *Table) refreshLoop() {
 				done = make(chan struct{})
 				go tab.doRefresh(done)
 			}
-		case <-tab.refreshReq:
+		case req := <-tab.refreshReq:
+			waiting = append(waiting, req)
 			if done == nil {
 				done = make(chan struct{})
 				go tab.doRefresh(done)
 			}
 		case <-done:
+			for _, ch := range waiting {
+				close(ch)
+			}
+			waiting = nil
 			done = nil
 		case <-tab.closeReq:
-			if done != nil {
-				<-done
-			}
-			return
+			break loop
 		}
 	}
+
+	if tab.net != nil {
+		tab.net.close()
+	}
+	if done != nil {
+		<-done
+	}
+	for _, ch := range waiting {
+		close(ch)
+	}
+	tab.db.close()
+	close(tab.closed)
 }
 
 // doRefresh performs a lookup for a random target to keep buckets
@@ -342,7 +373,7 @@ func (tab *Table) doRefresh(done chan struct{}) {
 	// We perform a lookup with a random target instead.
 	var target NodeID
 	rand.Read(target[:])
-	result := tab.Lookup(target)
+	result := tab.lookup(target, false)
 	if len(result) > 0 {
 		return
 	}
@@ -366,7 +397,7 @@ func (tab *Table) doRefresh(done chan struct{}) {
 	tab.mutex.Unlock()
 
 	// Finally, do a self lookup to fill up the buckets.
-	tab.Lookup(tab.self.ID)
+	tab.lookup(tab.self.ID, false)
 }
 
 // closest returns the n nodes in the table that are closest to the
diff --git a/p2p/discover/udp.go b/p2p/discover/udp.go
index e93949c56..72b2a45e5 100644
--- a/p2p/discover/udp.go
+++ b/p2p/discover/udp.go
@@ -114,13 +114,11 @@ func makeEndpoint(addr *net.UDPAddr, tcpPort uint16) rpcEndpoint {
 	return rpcEndpoint{IP: ip, UDP: uint16(addr.Port), TCP: tcpPort}
 }
 
-func nodeFromRPC(rn rpcNode) (n *Node, valid bool) {
+func nodeFromRPC(rn rpcNode) (*Node, error) {
 	// TODO: don't accept localhost, LAN addresses from internet hosts
-	// TODO: check public key is on secp256k1 curve
-	if rn.IP.IsMulticast() || rn.IP.IsUnspecified() || rn.UDP == 0 {
-		return nil, false
-	}
-	return NewNode(rn.ID, rn.IP, rn.UDP, rn.TCP), true
+	n := NewNode(rn.ID, rn.IP, rn.UDP, rn.TCP)
+	err := n.validateComplete()
+	return n, err
 }
 
 func nodeToRPC(n *Node) rpcNode {
@@ -271,7 +269,7 @@ func (t *udp) findnode(toid NodeID, toaddr *net.UDPAddr, target NodeID) ([]*Node
 		reply := r.(*neighbors)
 		for _, rn := range reply.Nodes {
 			nreceived++
-			if n, valid := nodeFromRPC(rn); valid {
+			if n, err := nodeFromRPC(rn); err == nil {
 				nodes = append(nodes, n)
 			}
 		}
diff --git a/p2p/server.go b/p2p/server.go
index 7991585f1..52d1be677 100644
--- a/p2p/server.go
+++ b/p2p/server.go
@@ -334,6 +334,9 @@ func (srv *Server) Start() (err error) {
 		if err != nil {
 			return err
 		}
+		if err := ntab.SetFallbackNodes(srv.BootstrapNodes); err != nil {
+			return err
+		}
 		srv.ntab = ntab
 	}