Browse Source

封装了rpc超时,修复并发candidate时的异常

ld
augurier 6 months ago
parent
commit
053dbe107f
5 changed files with 67 additions and 37 deletions
  1. +7
    -6
      internal/client/client_node.go
  2. +41
    -2
      internal/nodes/init.go
  3. +7
    -3
      internal/nodes/replica.go
  4. +3
    -3
      internal/nodes/server_node.go
  5. +9
    -23
      internal/nodes/vote.go

+ 7
- 6
internal/client/client_node.go View File

@ -42,10 +42,11 @@ func (client *Client) FindActiveNode() *rpc.Client {
var c *rpc.Client var c *rpc.Client
for { // 直到找到一个可连接的节点(保证至少一个节点活着) for { // 直到找到一个可连接的节点(保证至少一个节点活着)
addr := getRandomAddress(client.Address) addr := getRandomAddress(client.Address)
c, err = rpc.DialHTTP("tcp", addr)
c, err = nodes.DialHTTPWithTimeout("tcp", addr)
if err != nil { if err != nil {
log.Error("dialing: ", zap.Error(err)) log.Error("dialing: ", zap.Error(err))
} else { } else {
log.Sugar().Info("client发现活跃节点地址[%s]", addr)
return c return c
} }
} }
@ -67,7 +68,7 @@ func (client *Client) Write(kvCall nodes.LogEntryCall) Status {
var err error var err error
for !reply.Isleader { // 根据存活节点的反馈,直到找到leader for !reply.Isleader { // 根据存活节点的反馈,直到找到leader
callErr := c.Call("Node.WriteKV", kvCall, &reply) // RPC
callErr := nodes.CallWithTimeout(c, "Node.WriteKV", &kvCall, &reply) // RPC
if callErr != nil { // dial和call之间可能崩溃,重新找存活节点 if callErr != nil { // dial和call之间可能崩溃,重新找存活节点
log.Error("dialing: ", zap.Error(callErr)) log.Error("dialing: ", zap.Error(callErr))
client.CloseRpcClient(c) client.CloseRpcClient(c)
@ -78,7 +79,7 @@ func (client *Client) Write(kvCall nodes.LogEntryCall) Status {
if !reply.Isleader { // 对方不是leader,根据反馈找leader if !reply.Isleader { // 对方不是leader,根据反馈找leader
addr := reply.LeaderAddress addr := reply.LeaderAddress
client.CloseRpcClient(c) client.CloseRpcClient(c)
c, err = rpc.DialHTTP("tcp", addr)
c, err = nodes.DialHTTPWithTimeout("tcp", addr)
for err != nil { // 重新找下一个存活节点 for err != nil { // 重新找下一个存活节点
c = client.FindActiveNode() c = client.FindActiveNode()
} }
@ -101,7 +102,7 @@ func (client *Client) Read(key string, value *string) Status { // 查不到value
c = client.FindActiveNode() c = client.FindActiveNode()
var reply nodes.ServerReply var reply nodes.ServerReply
callErr := c.Call("Node.ReadKey", key, &reply) // RPC
callErr := nodes.CallWithTimeout(c, "Node.ReadKey", &key, &reply) // RPC
if callErr != nil { if callErr != nil {
log.Error("dialing: ", zap.Error(callErr)) log.Error("dialing: ", zap.Error(callErr))
client.CloseRpcClient(c) client.CloseRpcClient(c)
@ -128,7 +129,7 @@ func (client *Client) FindLeader() string {
var err error var err error
for !reply.Isleader { // 根据存活节点的反馈,直到找到leader for !reply.Isleader { // 根据存活节点的反馈,直到找到leader
callErr := c.Call("Node.FindLeader", &arg, &reply) // RPC
callErr := nodes.CallWithTimeout(c, "Node.FindLeader", &arg, &reply) // RPC
if callErr != nil { // dial和call之间可能崩溃,重新找存活节点 if callErr != nil { // dial和call之间可能崩溃,重新找存活节点
log.Error("dialing: ", zap.Error(callErr)) log.Error("dialing: ", zap.Error(callErr))
client.CloseRpcClient(c) client.CloseRpcClient(c)
@ -139,7 +140,7 @@ func (client *Client) FindLeader() string {
if !reply.Isleader { // 对方不是leader,根据反馈找leader if !reply.Isleader { // 对方不是leader,根据反馈找leader
addr := client.Address[reply.LeaderId] addr := client.Address[reply.LeaderId]
client.CloseRpcClient(c) client.CloseRpcClient(c)
c, err = rpc.DialHTTP("tcp", addr)
c, err = nodes.DialHTTPWithTimeout("tcp", addr)
for err != nil { // 重新找下一个存活节点 for err != nil { // 重新找下一个存活节点
c = client.FindActiveNode() c = client.FindActiveNode()
} }

+ 41
- 2
internal/nodes/init.go View File

@ -1,6 +1,7 @@
package nodes package nodes
import ( import (
// "context"
"fmt" "fmt"
"net" "net"
"net/http" "net/http"
@ -30,7 +31,7 @@ func Init(selfId string, nodeAddr map[string]string, db *leveldb.DB, rstorage *R
// 创建节点 // 创建节点
node := &Node{ node := &Node{
selfId: selfId, selfId: selfId,
leaderId: "",
leaderId: "",
nodes: ns, nodes: ns,
maxLogId: -1, // 后来发现论文中是从1开始的(初始0),但不想改了 maxLogId: -1, // 后来发现论文中是从1开始的(初始0),但不想改了
currTerm: 1, currTerm: 1,
@ -40,7 +41,7 @@ func Init(selfId string, nodeAddr map[string]string, db *leveldb.DB, rstorage *R
nextIndex: make(map[string]int), nextIndex: make(map[string]int),
matchIndex: make(map[string]int), matchIndex: make(map[string]int),
db: db, db: db,
storage: rstorage,
storage: rstorage,
} }
node.initLeaderState() node.initLeaderState()
if isRestart { if isRestart {
@ -74,6 +75,7 @@ func Start(node *Node) {
case Leader: case Leader:
// 发送心跳 // 发送心跳
fmt.Printf("[%s] is the leader, 发送心跳...\n", node.selfId) fmt.Printf("[%s] is the leader, 发送心跳...\n", node.selfId)
node.resetElectionTimer() // leader不主动触发选举
node.BroadCastKV(Normal) node.BroadCastKV(Normal)
} }
time.Sleep(50 * time.Millisecond) time.Sleep(50 * time.Millisecond)
@ -81,7 +83,9 @@ func Start(node *Node) {
}() }()
} }
// 初始时注册rpc方法
func (node *Node) Rpc(port string) { func (node *Node) Rpc(port string) {
err := rpc.Register(node) err := rpc.Register(node)
if err != nil { if err != nil {
log.Fatal("rpc register failed", zap.Error(err)) log.Fatal("rpc register failed", zap.Error(err))
@ -99,3 +103,38 @@ func (node *Node) Rpc(port string) {
} }
}() }()
} }
// 封装有超时的dial
func DialHTTPWithTimeout(network, address string) (*rpc.Client, error) {
done := make(chan struct{})
var client *rpc.Client
var err error
go func() {
client, err = rpc.DialHTTP(network, address)
close(done)
}()
select {
case <-done:
return client, err
case <-time.After(50 * time.Millisecond):
return nil, fmt.Errorf("dial timeout: %s", address)
}
}
// 封装有超时的call
func CallWithTimeout[T1 any, T2 any](client *rpc.Client, serviceMethod string, args *T1, reply *T2) error {
done := make(chan error, 1)
go func() {
done <- client.Call(serviceMethod, args, reply)
}()
select {
case err := <-done:
return err
case <-time.After(50 * time.Millisecond):
return fmt.Errorf("call timeout: %s", serviceMethod)
}
}

+ 7
- 3
internal/nodes/replica.go View File

@ -47,7 +47,7 @@ func (node *Node) sendKV(id string, callMode CallMode) {
default: default:
} }
client, err := rpc.DialHTTP("tcp", node.nodes[id].address)
client, err := DialHTTPWithTimeout("tcp", node.nodes[id].address)
if err != nil { if err != nil {
log.Error("dialing: ", zap.Error(err)) log.Error("dialing: ", zap.Error(err))
return return
@ -82,7 +82,7 @@ func (node *Node) sendKV(id string, callMode CallMode) {
if arg.PrevLogIndex >= 0 { if arg.PrevLogIndex >= 0 {
arg.PrevLogTerm = node.log[arg.PrevLogIndex].Term arg.PrevLogTerm = node.log[arg.PrevLogIndex].Term
} }
callErr := client.Call("Node.AppendEntries", arg, &appendReply) // RPC
callErr := CallWithTimeout(client, "Node.AppendEntries", &arg, &appendReply) // RPC
if callErr != nil { if callErr != nil {
log.Error("dialing node_"+ id +"fail: ", zap.Error(callErr)) log.Error("dialing node_"+ id +"fail: ", zap.Error(callErr))
} }
@ -142,7 +142,11 @@ func (node *Node) applyCommittedLogs() {
} }
// RPC call // RPC call
func (node *Node) AppendEntries(arg AppendEntriesArg, reply *AppendEntriesReply) error {
func (node *Node) AppendEntries(arg *AppendEntriesArg, reply *AppendEntriesReply) error {
// start := time.Now()
// defer func() {
// log.Sugar().Infof("AppendEntries 处理时间: %v", time.Since(start))
// }()
node.mu.Lock() node.mu.Lock()
defer node.mu.Unlock() defer node.mu.Unlock()

+ 3
- 3
internal/nodes/server_node.go View File

@ -14,7 +14,7 @@ type ServerReply struct{
Value string Value string
} }
// RPC call // RPC call
func (node *Node) WriteKV(kvCall LogEntryCall, reply *ServerReply) error {
func (node *Node) WriteKV(kvCall *LogEntryCall, reply *ServerReply) error {
log.Sugar().Infof("[%s]收到客户端write请求", node.selfId) log.Sugar().Infof("[%s]收到客户端write请求", node.selfId)
// 自己不是leader,转交leader地址回复 // 自己不是leader,转交leader地址回复
@ -43,10 +43,10 @@ func (node *Node) WriteKV(kvCall LogEntryCall, reply *ServerReply) error {
} }
// RPC call // RPC call
func (node *Node) ReadKey(key string, reply *ServerReply) error {
func (node *Node) ReadKey(key *string, reply *ServerReply) error {
log.Sugar().Infof("[%s]收到客户端read请求", node.selfId) log.Sugar().Infof("[%s]收到客户端read请求", node.selfId)
// 先只读自己(无论自己是不是leader),也方便测试 // 先只读自己(无论自己是不是leader),也方便测试
value, err := node.db.Get([]byte(key), nil)
value, err := node.db.Get([]byte(*key), nil)
if err == leveldb.ErrNotFound { if err == leveldb.ErrNotFound {
reply.HaveValue = false reply.HaveValue = false
} else { } else {

+ 9
- 23
internal/nodes/vote.go View File

@ -56,7 +56,6 @@ func (n *Node) startElection() {
// 并行向其他节点发送请求投票 // 并行向其他节点发送请求投票
var mu sync.Mutex var mu sync.Mutex
cond := sync.NewCond(&mu)
totalNodes := len(n.nodes) totalNodes := len(n.nodes)
grantedVotes := 1 // 自己的票 grantedVotes := 1 // 自己的票
@ -86,7 +85,6 @@ func (n *Node) startElection() {
n.state = Leader n.state = Leader
log.Sugar().Infof("[%s] 当选 Leader!", n.selfId) log.Sugar().Infof("[%s] 当选 Leader!", n.selfId)
n.initLeaderState() n.initLeaderState()
cond.Broadcast()
} }
mu.Unlock() mu.Unlock()
@ -95,31 +93,19 @@ func (n *Node) startElection() {
} }
// 等待选举结果 // 等待选举结果
timeout := time.After(300 * time.Millisecond)
for {
mu.Lock()
if n.state != Candidate { // 选举成功或回退,不再等待
mu.Unlock()
return
}
select {
case <-timeout:
log.Sugar().Infof("[%s] 选举超时,重新发起选举", n.selfId)
n.state = Follower
n.resetElectionTimer()
mu.Unlock()
return
default:
cond.Wait()
}
mu.Unlock()
time.Sleep(300 * time.Millisecond)
mu.Lock()
if n.state == Candidate {
log.Sugar().Infof("[%s] 选举超时,重新发起选举", n.selfId)
// n.state = Follower 这里不修改,如果appendentries收到term合理的心跳,再变回follower
n.resetElectionTimer()
} }
mu.Unlock()
} }
func (node *Node) sendRequestVote(peerId string, args *RequestVoteArgs, reply *RequestVoteReply) bool { func (node *Node) sendRequestVote(peerId string, args *RequestVoteArgs, reply *RequestVoteReply) bool {
log.Sugar().Infof("[%s] 请求 [%s] 投票给自己", node.selfId, peerId) log.Sugar().Infof("[%s] 请求 [%s] 投票给自己", node.selfId, peerId)
client, err := rpc.DialHTTP("tcp", node.nodes[peerId].address)
client, err := DialHTTPWithTimeout("tcp", node.nodes[peerId].address)
if err != nil { if err != nil {
log.Error("dialing: ", zap.Error(err)) log.Error("dialing: ", zap.Error(err))
return false return false
@ -132,7 +118,7 @@ func (node *Node) sendRequestVote(peerId string, args *RequestVoteArgs, reply *R
} }
}(client) }(client)
callErr := client.Call("Node.RequestVote", args, reply) // RPC
callErr := CallWithTimeout(client, "Node.RequestVote", args, reply) // RPC
if callErr != nil { if callErr != nil {
log.Error("dialing node_"+peerId+"fail: ", zap.Error(callErr)) log.Error("dialing node_"+peerId+"fail: ", zap.Error(callErr))
} }

Loading…
Cancel
Save