ソースを参照

修复几个bug,节点只剩一个的时候只能get不能set

xbase 3 週間 前
コミット
11c56e1310
8 ファイル変更737 行追加21 行削除
  1. 1 0
      .gitignore
  2. 7 11
      README.md
  3. 211 0
      example/node1/main.go
  4. 210 0
      example/node2/main.go
  5. 210 0
      example/node3/main.go
  6. 4 0
      go.mod
  7. 86 1
      raft.go
  8. 8 9
      server.go

+ 1 - 0
.gitignore

@@ -24,3 +24,4 @@ _testmain.go
 *.test
 *.prof
 
+data/

+ 7 - 11
README.md

@@ -10,10 +10,10 @@
 - ✅ **日志压缩 (Snapshot)** - 自动触发,支持分块传输,动态阈值防止压缩风暴
 - ✅ **线性一致性读 (ReadIndex)** - 保证读取最新已提交数据
 - ✅ **Leadership Transfer** - 主动转移 Leader 角色
-- ✅ **请求转发** - Follower 自动转发写请求到 Leader
+- ✅ **自动请求路由** - Follower 自动转发读写请求到 Leader,无需客户端处理
+- ✅ **Leader 租约检查 (Lease Check)** - 防止网络分区的 Leader 接受无法提交的写请求
 - ✅ **持久化存储** - 日志和状态持久化到磁盘
 - ✅ **健康检查 & 监控指标** - 内置 Metrics 支持
-- ✅ **远程读取 (Get RPC)** - 支持从任意节点远程读取数据
 
 ## 快速开始
 
@@ -356,7 +356,7 @@ import "errors"
 // 常见错误
 var (
     raft.ErrNoLeader       // 没有 Leader
-    raft.ErrNotLeader      // 当前节点不是 Leader
+    raft.ErrNotLeader      // 当前节点不是 Leader (内部使用,会自动重试/转发)
     raft.ErrConfigInFlight // 配置变更正在进行中
     raft.ErrTimeout        // 操作超时
     raft.ErrShutdown       // 节点正在关闭
@@ -365,14 +365,9 @@ var (
 
 // 错误处理示例
 err := server.Set("key", "value")
-if errors.Is(err, raft.ErrNotLeader) {
-    // 重试或转发到 Leader
-}
-
-// RaftError 包含额外信息
-var raftErr *raft.RaftError
-if errors.As(err, &raftErr) {
-    fmt.Printf("Leader hint: %s, Retry in: %v\n", raftErr.LeaderID, raftErr.RetryIn)
+if err != nil {
+    // 客户端通常不需要手动处理 ErrNotLeader,因为 KVServer 会自动路由
+    log.Printf("Set failed: %v", err)
 }
 ```
 
@@ -614,6 +609,7 @@ ReadIndex succeeded: readIndex=19884
 | **选举限制** | 日志不完整的节点无法成为 Leader |
 | **PreVote** | 防止分区节点扰乱集群 |
 | **No-op 提交** | 新 Leader 立即提交前任 term 日志 |
+| **Lease Check** | Leader 必须保持多数派连接才能接受写请求,防止分区写入 |
 | **单配置变更** | 一次只允许一个成员变更 |
 | **动态压缩阈值** | 防止压缩风暴 |
 | **原子快照写入** | 使用 temp file + rename 保证原子性 |

+ 211 - 0
example/node1/main.go

@@ -0,0 +1,211 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"os"
+	"os/signal"
+	"strings"
+	"syscall"
+	"time"
+
+	"igit.com/xbase/raft"
+)
+
+// ANSI escape sequences used to colorize this example's terminal output.
+// ColorReset restores the terminal's default attributes.
+const (
+	ColorReset  = "\033[0m"
+	ColorDim    = "\033[90m" // Dark Gray
+	ColorRed    = "\033[31m"
+	ColorGreen  = "\033[32m"
+	ColorYellow = "\033[33m"
+	ColorBlue   = "\033[34m"
+	ColorCyan   = "\033[36m"
+)
+
+// main runs node1 of the three-node example cluster. It starts a KV server,
+// launches a state monitor, a periodic self-test and an interactive CLI, and
+// then blocks until SIGINT/SIGTERM before stopping the server.
+func main() {
+	// Configuration
+	nodeID := "node1"
+	addr := "localhost:5001"
+	dataDir := "data/node1"
+
+	// Cluster configuration
+	clusterNodes := map[string]string{
+		"node1": "localhost:5001",
+		"node2": "localhost:5002",
+		"node3": "localhost:5003",
+	}
+
+	config := raft.DefaultConfig()
+	config.NodeID = nodeID
+	config.ListenAddr = addr
+	config.DataDir = dataDir
+	config.ClusterNodes = clusterNodes
+	// Log level: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR
+	config.Logger = raft.NewConsoleLogger(nodeID, 1)
+
+	// Ensure data directory exists
+	if err := os.MkdirAll(dataDir, 0755); err != nil {
+		log.Fatalf("Failed to create data directory: %v", err)
+	}
+
+	// Create KV Server
+	server, err := raft.NewKVServer(config)
+	if err != nil {
+		log.Fatalf("Failed to create server: %v", err)
+	}
+
+	// Start server
+	if err := server.Start(); err != nil {
+		log.Fatalf("Failed to start server: %v", err)
+	}
+
+	fmt.Printf("Node %s%s%s started on %s\n", ColorGreen, nodeID, ColorReset, addr)
+	fmt.Println("Commands: set <key> <val>, get <key>, del <key>")
+
+	// State Monitor Loop (Real-time status updates): polls the server stats
+	// and prints a line whenever the state or the term changes.
+	go func() {
+		// Initial state
+		initial := server.GetStats()
+		lastState := initial.State
+		lastTerm := initial.Term
+
+		ticker := time.NewTicker(100 * time.Millisecond)
+		defer ticker.Stop()
+
+		for range ticker.C {
+			stats := server.GetStats()
+			if stats.State != lastState || stats.Term != lastTerm {
+				fmt.Printf("\n%s[State Change] %s (Term %d) -> %s (Term %d)%s\n> ",
+					ColorYellow, lastState, lastTerm, stats.State, stats.Term, ColorReset)
+				lastState = stats.State
+				lastTerm = stats.Term
+			}
+		}
+	}()
+
+	// Simple test loop: every 10 seconds write a key, read it back and print
+	// the node's role and progress counters.
+	go func() {
+		ticker := time.NewTicker(10 * time.Second)
+		defer ticker.Stop()
+		for range ticker.C {
+			fmt.Println() // Extra newline
+			if server.IsLeader() {
+				fmt.Printf("--- I am the %sLEADER%s ---\n", ColorGreen, ColorReset)
+			} else {
+				leaderID := server.GetLeaderID()
+				fmt.Printf("--- I am %sFOLLOWER%s (Leader: %s%s%s) ---\n",
+					ColorYellow, ColorReset, ColorCyan, leaderID, ColorReset)
+			}
+
+			// Always try to Set and Get and let the server handle routing.
+			// The outcome is reported on every node, not only on the leader,
+			// so a failed write issued from a follower is never dropped
+			// silently.
+			key := fmt.Sprintf("key-%d", time.Now().Unix())
+			val := fmt.Sprintf("val-%s", nodeID)
+
+			if err := server.Set(key, val); err != nil {
+				fmt.Printf("%sSet failed:%s %v\n", ColorRed, ColorReset, err)
+			} else {
+				fmt.Printf("%sSet%s %s%s%s%s%s=%s%s%s%s\n",
+					ColorGreen, ColorReset,
+					ColorCyan, key, ColorReset,
+					ColorDim, ":", ColorReset,
+					ColorYellow, val, ColorReset)
+			}
+
+			// Read it back
+			if v, ok, err := server.GetLinear(key); err != nil {
+				fmt.Printf("%sGetLinear failed:%s %v\n", ColorRed, ColorReset, err)
+			} else if ok {
+				fmt.Printf("%sGet%s %s%s%s%s%s=%s%s%s%s\n",
+					ColorGreen, ColorReset,
+					ColorCyan, key, ColorReset,
+					ColorDim, ":", ColorReset,
+					ColorYellow, v, ColorReset)
+			}
+
+			// Print stats
+			metrics := server.GetMetrics()
+			health := server.HealthCheck()
+			fmt.Printf("Term: %s%d%s (M:%d), CommitIndex: %s%d%s, Applied: %s%d%s\n",
+				ColorBlue, health.Term, ColorReset, metrics.Term,
+				ColorGreen, health.CommitIndex, ColorReset,
+				ColorGreen, health.LastApplied, ColorReset)
+			fmt.Print("> ") // Prompt
+		}
+	}()
+
+	// CLI Loop: reads whitespace-separated commands from stdin.
+	go func() {
+		scanner := bufio.NewScanner(os.Stdin)
+		fmt.Print("> ")
+		for scanner.Scan() {
+			text := strings.TrimSpace(scanner.Text())
+			if text == "" {
+				fmt.Print("> ")
+				continue
+			}
+			parts := strings.Fields(text)
+			cmd := strings.ToLower(parts[0])
+
+			switch cmd {
+			case "set":
+				if len(parts) != 3 {
+					fmt.Println("Usage: set <key> <value>")
+					break
+				}
+				key, val := parts[1], parts[2]
+				if err := server.Set(key, val); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else {
+					fmt.Printf("%sOK%s\n", ColorGreen, ColorReset)
+				}
+			case "get":
+				if len(parts) != 2 {
+					fmt.Println("Usage: get <key>")
+					break
+				}
+				key := parts[1]
+				if val, ok, err := server.GetLinear(key); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else if !ok {
+					fmt.Printf("%sNot Found%s\n", ColorYellow, ColorReset)
+				} else {
+					// Seven verbs, seven arguments.
+					fmt.Printf("%s%s%s%s%s=%s%s\n", ColorDim, key, ColorReset, ColorDim, ":", ColorReset, val)
+				}
+			case "del", "delete":
+				if len(parts) != 2 {
+					fmt.Println("Usage: del <key>")
+					break
+				}
+				key := parts[1]
+				if err := server.Del(key); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else {
+					fmt.Printf("%sDeleted%s\n", ColorGreen, ColorReset)
+				}
+			case "help":
+				fmt.Println("Commands: set <key> <val>, get <key>, del <key>")
+			default:
+				fmt.Println("Unknown command")
+			}
+			fmt.Print("> ")
+		}
+		// Scan returns false on EOF and on read errors alike; surface the
+		// latter instead of letting the CLI go quiet without explanation.
+		if err := scanner.Err(); err != nil {
+			fmt.Printf("%sInput error:%s %v\n", ColorRed, ColorReset, err)
+		}
+	}()
+
+	// Handle shutdown
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
+	<-sigCh
+
+	fmt.Println("\nShutting down...")
+	server.Stop()
+}
+

+ 210 - 0
example/node2/main.go

@@ -0,0 +1,210 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"os"
+	"os/signal"
+	"strings"
+	"syscall"
+	"time"
+
+	"igit.com/xbase/raft"
+)
+
+// ANSI escape sequences used to colorize this example's terminal output.
+// ColorReset restores the terminal's default attributes.
+const (
+	ColorReset  = "\033[0m"
+	ColorDim    = "\033[90m" // Dark Gray
+	ColorRed    = "\033[31m"
+	ColorGreen  = "\033[32m"
+	ColorYellow = "\033[33m"
+	ColorBlue   = "\033[34m"
+	ColorCyan   = "\033[36m"
+)
+
+// main runs node2 of the three-node example cluster. It starts a KV server,
+// launches a state monitor, a periodic self-test and an interactive CLI, and
+// then blocks until SIGINT/SIGTERM before stopping the server.
+func main() {
+	// Configuration
+	nodeID := "node2"
+	addr := "localhost:5002"
+	dataDir := "data/node2"
+
+	// Cluster configuration
+	clusterNodes := map[string]string{
+		"node1": "localhost:5001",
+		"node2": "localhost:5002",
+		"node3": "localhost:5003",
+	}
+
+	config := raft.DefaultConfig()
+	config.NodeID = nodeID
+	config.ListenAddr = addr
+	config.DataDir = dataDir
+	config.ClusterNodes = clusterNodes
+	// Log level: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR
+	config.Logger = raft.NewConsoleLogger(nodeID, 1)
+
+	// Ensure data directory exists
+	if err := os.MkdirAll(dataDir, 0755); err != nil {
+		log.Fatalf("Failed to create data directory: %v", err)
+	}
+
+	// Create KV Server
+	server, err := raft.NewKVServer(config)
+	if err != nil {
+		log.Fatalf("Failed to create server: %v", err)
+	}
+
+	// Start server
+	if err := server.Start(); err != nil {
+		log.Fatalf("Failed to start server: %v", err)
+	}
+
+	fmt.Printf("Node %s%s%s started on %s\n", ColorGreen, nodeID, ColorReset, addr)
+	fmt.Println("Commands: set <key> <val>, get <key>, del <key>")
+
+	// State Monitor Loop (Real-time status updates): polls the server stats
+	// and prints a line whenever the state or the term changes.
+	go func() {
+		// Initial state
+		initial := server.GetStats()
+		lastState := initial.State
+		lastTerm := initial.Term
+
+		ticker := time.NewTicker(100 * time.Millisecond)
+		defer ticker.Stop()
+
+		for range ticker.C {
+			stats := server.GetStats()
+			if stats.State != lastState || stats.Term != lastTerm {
+				fmt.Printf("\n%s[State Change] %s (Term %d) -> %s (Term %d)%s\n> ",
+					ColorYellow, lastState, lastTerm, stats.State, stats.Term, ColorReset)
+				lastState = stats.State
+				lastTerm = stats.Term
+			}
+		}
+	}()
+
+	// Simple test loop: every 10 seconds write a key, read it back and print
+	// the node's role and progress counters.
+	go func() {
+		ticker := time.NewTicker(10 * time.Second)
+		defer ticker.Stop()
+		for range ticker.C {
+			fmt.Println() // Extra newline
+			if server.IsLeader() {
+				fmt.Printf("--- I am the %sLEADER%s ---\n", ColorGreen, ColorReset)
+			} else {
+				leaderID := server.GetLeaderID()
+				fmt.Printf("--- I am %sFOLLOWER%s (Leader: %s%s%s) ---\n",
+					ColorYellow, ColorReset, ColorCyan, leaderID, ColorReset)
+			}
+
+			// Always try to Set and Get and let the server handle routing.
+			// The outcome is reported on every node, not only on the leader,
+			// so a failed write issued from a follower is never dropped
+			// silently.
+			key := fmt.Sprintf("key-%d", time.Now().Unix())
+			val := fmt.Sprintf("val-%s", nodeID)
+
+			if err := server.Set(key, val); err != nil {
+				fmt.Printf("%sSet failed:%s %v\n", ColorRed, ColorReset, err)
+			} else {
+				fmt.Printf("%sSet%s %s%s%s%s%s=%s%s%s%s\n",
+					ColorGreen, ColorReset,
+					ColorCyan, key, ColorReset,
+					ColorDim, ":", ColorReset,
+					ColorYellow, val, ColorReset)
+			}
+
+			// Read it back
+			if v, ok, err := server.GetLinear(key); err != nil {
+				fmt.Printf("%sGetLinear failed:%s %v\n", ColorRed, ColorReset, err)
+			} else if ok {
+				fmt.Printf("%sGet%s %s%s%s%s%s=%s%s%s%s\n",
+					ColorGreen, ColorReset,
+					ColorCyan, key, ColorReset,
+					ColorDim, ":", ColorReset,
+					ColorYellow, v, ColorReset)
+			}
+
+			// Print stats
+			metrics := server.GetMetrics()
+			health := server.HealthCheck()
+			fmt.Printf("Term: %s%d%s (M:%d), CommitIndex: %s%d%s, Applied: %s%d%s\n",
+				ColorBlue, health.Term, ColorReset, metrics.Term,
+				ColorGreen, health.CommitIndex, ColorReset,
+				ColorGreen, health.LastApplied, ColorReset)
+			fmt.Print("> ") // Prompt
+		}
+	}()
+
+	// CLI Loop: reads whitespace-separated commands from stdin.
+	go func() {
+		scanner := bufio.NewScanner(os.Stdin)
+		fmt.Print("> ")
+		for scanner.Scan() {
+			text := strings.TrimSpace(scanner.Text())
+			if text == "" {
+				fmt.Print("> ")
+				continue
+			}
+			parts := strings.Fields(text)
+			cmd := strings.ToLower(parts[0])
+
+			switch cmd {
+			case "set":
+				if len(parts) != 3 {
+					fmt.Println("Usage: set <key> <value>")
+					break
+				}
+				key, val := parts[1], parts[2]
+				if err := server.Set(key, val); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else {
+					fmt.Printf("%sOK%s\n", ColorGreen, ColorReset)
+				}
+			case "get":
+				if len(parts) != 2 {
+					fmt.Println("Usage: get <key>")
+					break
+				}
+				key := parts[1]
+				if val, ok, err := server.GetLinear(key); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else if !ok {
+					fmt.Printf("%sNot Found%s\n", ColorYellow, ColorReset)
+				} else {
+					// Seven verbs, seven arguments.
+					fmt.Printf("%s%s%s%s%s=%s%s\n", ColorDim, key, ColorReset, ColorDim, ":", ColorReset, val)
+				}
+			case "del", "delete":
+				if len(parts) != 2 {
+					fmt.Println("Usage: del <key>")
+					break
+				}
+				key := parts[1]
+				if err := server.Del(key); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else {
+					fmt.Printf("%sDeleted%s\n", ColorGreen, ColorReset)
+				}
+			case "help":
+				fmt.Println("Commands: set <key> <val>, get <key>, del <key>")
+			default:
+				fmt.Println("Unknown command")
+			}
+			fmt.Print("> ")
+		}
+		// Scan returns false on EOF and on read errors alike; surface the
+		// latter instead of letting the CLI go quiet without explanation.
+		if err := scanner.Err(); err != nil {
+			fmt.Printf("%sInput error:%s %v\n", ColorRed, ColorReset, err)
+		}
+	}()
+
+	// Handle shutdown
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
+	<-sigCh
+
+	fmt.Println("\nShutting down...")
+	server.Stop()
+}

+ 210 - 0
example/node3/main.go

@@ -0,0 +1,210 @@
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"os"
+	"os/signal"
+	"strings"
+	"syscall"
+	"time"
+
+	"igit.com/xbase/raft"
+)
+
+// ANSI escape sequences used to colorize this example's terminal output.
+// ColorReset restores the terminal's default attributes.
+const (
+	ColorReset  = "\033[0m"
+	ColorDim    = "\033[90m" // Dark Gray
+	ColorRed    = "\033[31m"
+	ColorGreen  = "\033[32m"
+	ColorYellow = "\033[33m"
+	ColorBlue   = "\033[34m"
+	ColorCyan   = "\033[36m"
+)
+
+// main runs node3 of the three-node example cluster. It starts a KV server,
+// launches a state monitor, a periodic self-test and an interactive CLI, and
+// then blocks until SIGINT/SIGTERM before stopping the server.
+func main() {
+	// Configuration
+	nodeID := "node3"
+	addr := "localhost:5003"
+	dataDir := "data/node3"
+
+	// Cluster configuration
+	clusterNodes := map[string]string{
+		"node1": "localhost:5001",
+		"node2": "localhost:5002",
+		"node3": "localhost:5003",
+	}
+
+	config := raft.DefaultConfig()
+	config.NodeID = nodeID
+	config.ListenAddr = addr
+	config.DataDir = dataDir
+	config.ClusterNodes = clusterNodes
+	// Log level: 0=DEBUG, 1=INFO, 2=WARN, 3=ERROR
+	config.Logger = raft.NewConsoleLogger(nodeID, 1)
+
+	// Ensure data directory exists
+	if err := os.MkdirAll(dataDir, 0755); err != nil {
+		log.Fatalf("Failed to create data directory: %v", err)
+	}
+
+	// Create KV Server
+	server, err := raft.NewKVServer(config)
+	if err != nil {
+		log.Fatalf("Failed to create server: %v", err)
+	}
+
+	// Start server
+	if err := server.Start(); err != nil {
+		log.Fatalf("Failed to start server: %v", err)
+	}
+
+	fmt.Printf("Node %s%s%s started on %s\n", ColorGreen, nodeID, ColorReset, addr)
+	fmt.Println("Commands: set <key> <val>, get <key>, del <key>")
+
+	// State Monitor Loop (Real-time status updates): polls the server stats
+	// and prints a line whenever the state or the term changes.
+	go func() {
+		// Initial state
+		initial := server.GetStats()
+		lastState := initial.State
+		lastTerm := initial.Term
+
+		ticker := time.NewTicker(100 * time.Millisecond)
+		defer ticker.Stop()
+
+		for range ticker.C {
+			stats := server.GetStats()
+			if stats.State != lastState || stats.Term != lastTerm {
+				fmt.Printf("\n%s[State Change] %s (Term %d) -> %s (Term %d)%s\n> ",
+					ColorYellow, lastState, lastTerm, stats.State, stats.Term, ColorReset)
+				lastState = stats.State
+				lastTerm = stats.Term
+			}
+		}
+	}()
+
+	// Simple test loop: every 10 seconds write a key, read it back and print
+	// the node's role and progress counters.
+	go func() {
+		ticker := time.NewTicker(10 * time.Second)
+		defer ticker.Stop()
+		for range ticker.C {
+			fmt.Println() // Extra newline
+			if server.IsLeader() {
+				fmt.Printf("--- I am the %sLEADER%s ---\n", ColorGreen, ColorReset)
+			} else {
+				leaderID := server.GetLeaderID()
+				fmt.Printf("--- I am %sFOLLOWER%s (Leader: %s%s%s) ---\n",
+					ColorYellow, ColorReset, ColorCyan, leaderID, ColorReset)
+			}
+
+			// Always try to Set and Get and let the server handle routing.
+			// The outcome is reported on every node, not only on the leader,
+			// so a failed write issued from a follower is never dropped
+			// silently.
+			key := fmt.Sprintf("key-%d", time.Now().Unix())
+			val := fmt.Sprintf("val-%s", nodeID)
+
+			if err := server.Set(key, val); err != nil {
+				fmt.Printf("%sSet failed:%s %v\n", ColorRed, ColorReset, err)
+			} else {
+				fmt.Printf("%sSet%s %s%s%s%s%s=%s%s%s%s\n",
+					ColorGreen, ColorReset,
+					ColorCyan, key, ColorReset,
+					ColorDim, ":", ColorReset,
+					ColorYellow, val, ColorReset)
+			}
+
+			// Read it back
+			if v, ok, err := server.GetLinear(key); err != nil {
+				fmt.Printf("%sGetLinear failed:%s %v\n", ColorRed, ColorReset, err)
+			} else if ok {
+				fmt.Printf("%sGet%s %s%s%s%s%s=%s%s%s%s\n",
+					ColorGreen, ColorReset,
+					ColorCyan, key, ColorReset,
+					ColorDim, ":", ColorReset,
+					ColorYellow, v, ColorReset)
+			}
+
+			// Print stats
+			metrics := server.GetMetrics()
+			health := server.HealthCheck()
+			fmt.Printf("Term: %s%d%s (M:%d), CommitIndex: %s%d%s, Applied: %s%d%s\n",
+				ColorBlue, health.Term, ColorReset, metrics.Term,
+				ColorGreen, health.CommitIndex, ColorReset,
+				ColorGreen, health.LastApplied, ColorReset)
+			fmt.Print("> ") // Prompt
+		}
+	}()
+
+	// CLI Loop: reads whitespace-separated commands from stdin.
+	go func() {
+		scanner := bufio.NewScanner(os.Stdin)
+		fmt.Print("> ")
+		for scanner.Scan() {
+			text := strings.TrimSpace(scanner.Text())
+			if text == "" {
+				fmt.Print("> ")
+				continue
+			}
+			parts := strings.Fields(text)
+			cmd := strings.ToLower(parts[0])
+
+			switch cmd {
+			case "set":
+				if len(parts) != 3 {
+					fmt.Println("Usage: set <key> <value>")
+					break
+				}
+				key, val := parts[1], parts[2]
+				if err := server.Set(key, val); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else {
+					fmt.Printf("%sOK%s\n", ColorGreen, ColorReset)
+				}
+			case "get":
+				if len(parts) != 2 {
+					fmt.Println("Usage: get <key>")
+					break
+				}
+				key := parts[1]
+				if val, ok, err := server.GetLinear(key); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else if !ok {
+					fmt.Printf("%sNot Found%s\n", ColorYellow, ColorReset)
+				} else {
+					// Seven verbs, seven arguments.
+					fmt.Printf("%s%s%s%s%s=%s%s\n", ColorDim, key, ColorReset, ColorDim, ":", ColorReset, val)
+				}
+			case "del", "delete":
+				if len(parts) != 2 {
+					fmt.Println("Usage: del <key>")
+					break
+				}
+				key := parts[1]
+				if err := server.Del(key); err != nil {
+					fmt.Printf("%sError:%s %v\n", ColorRed, ColorReset, err)
+				} else {
+					fmt.Printf("%sDeleted%s\n", ColorGreen, ColorReset)
+				}
+			case "help":
+				fmt.Println("Commands: set <key> <val>, get <key>, del <key>")
+			default:
+				fmt.Println("Unknown command")
+			}
+			fmt.Print("> ")
+		}
+		// Scan returns false on EOF and on read errors alike; surface the
+		// latter instead of letting the CLI go quiet without explanation.
+		if err := scanner.Err(); err != nil {
+			fmt.Printf("%sInput error:%s %v\n", ColorRed, ColorReset, err)
+		}
+	}()
+
+	// Handle shutdown
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
+	<-sigCh
+
+	fmt.Println("\nShutting down...")
+	server.Stop()
+}

+ 4 - 0
go.mod

@@ -0,0 +1,4 @@
+module igit.com/xbase/raft
+
+go 1.20
+

+ 86 - 1
raft.go

@@ -4,6 +4,8 @@ import (
 	"context"
 	"fmt"
 	"math/rand"
+	"os"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -105,6 +107,9 @@ type Raft struct {
 
 	// Snapshot receiving state (for chunked transfer)
 	pendingSnapshot *pendingSnapshotState
+
+	// Last contact time for each peer (for leader check)
+	lastContact map[string]time.Time
 }
 
 // readIndexRequest represents a pending read index request
@@ -256,8 +261,12 @@ func NewRaft(config *Config, transport Transport, applyCh chan ApplyMsg) (*Raft,
 		logger:        config.Logger,
 		readIndexCh:   make(chan *readIndexRequest, 100),
 		lastHeartbeat: time.Now(),
+		lastContact:   make(map[string]time.Time),
 	}
 
+	// Initialize metrics
+	r.metrics.Term = state.CurrentTerm
+
 	// Set RPC handler
 	transport.SetRPCHandler(r)
 
@@ -281,7 +290,12 @@ func (r *Raft) Start() error {
 	// Restore FSM from snapshot if exists
 	// This must happen before starting apply loop to ensure FSM state is restored
 	if err := r.restoreFromSnapshot(); err != nil {
-		r.logger.Warn("Failed to restore from snapshot: %v", err)
+		// Suppress warning if it's just a missing file (first start)
+		if !os.IsNotExist(err) && !strings.Contains(err.Error(), "no such file") {
+			r.logger.Warn("Failed to restore from snapshot: %v", err)
+		} else {
+			r.logger.Info("No snapshot found, starting with empty state")
+		}
 		// Continue anyway - the node can still function, just without historical state
 	}
 
@@ -409,6 +423,7 @@ func (r *Raft) runFollower() {
 				}
 				r.logger.Debug("Election timeout, becoming candidate")
 				r.state = Candidate
+				r.leaderID = "" // Clear leaderID so we can vote for self in Pre-Vote
 			}
 			r.mu.Unlock()
 			return
@@ -441,6 +456,9 @@ func (r *Raft) runCandidate() {
 	r.votedFor = r.nodeID
 	r.leaderID = ""
 	currentTerm := r.currentTerm
+	// Update metrics
+	atomic.StoreUint64(&r.metrics.Term, currentTerm)
+	
 	if err := r.persistState(); err != nil {
 		r.logger.Error("Failed to persist state during election: %v", err)
 		r.mu.Unlock()
@@ -704,6 +722,9 @@ func (r *Raft) becomeFollower(term uint64) {
 
 	r.state = Follower
 	r.currentTerm = term
+	// Update metrics
+	atomic.StoreUint64(&r.metrics.Term, term)
+
 	r.votedFor = ""
 	r.leaderID = ""
 	// Must persist before responding - use mustPersistState for critical transitions
@@ -854,6 +875,9 @@ func (r *Raft) replicateToPeer(peer string, term, leaderCommit uint64) {
 	}
 
 	if reply.Success {
+		// Update contact time
+		r.lastContact[peer] = time.Now()
+
 		// Update nextIndex and matchIndex
 		if len(entries) > 0 {
 			newMatchIndex := entries[len(entries)-1].Index
@@ -869,6 +893,9 @@ func (r *Raft) replicateToPeer(peer string, term, leaderCommit uint64) {
 			}
 		}
 	} else {
+		// Even if failed (log conflict), we contacted the peer
+		r.lastContact[peer] = time.Now()
+
 		// Decrement nextIndex and retry
 		if reply.ConflictTerm > 0 {
 			// Find the last entry of ConflictTerm in our log
@@ -1657,6 +1684,28 @@ func (r *Raft) Propose(command []byte) (uint64, uint64, bool) {
 		return 0, 0, false
 	}
 
+	// Check connectivity (Lease Check)
+	// If we haven't heard from a majority of peers within ElectionTimeout,
+	// we shouldn't accept new commands because we might be partitioned.
+	if len(r.clusterNodes) > 1 {
+		activePeers := 1 // Self
+		now := time.Now()
+		timeout := r.config.ElectionTimeoutMax
+
+		for _, peer := range r.peers {
+			if last, ok := r.lastContact[peer]; ok && now.Sub(last) < timeout {
+				activePeers++
+			}
+		}
+
+		// Check majority
+		needed := len(r.clusterNodes)/2 + 1
+		if activePeers < needed {
+			r.logger.Warn("Rejecting Propose: lost contact with majority (active: %d, needed: %d)", activePeers, needed)
+			return 0, 0, false
+		}
+	}
+
 	index, err := r.log.AppendCommand(r.currentTerm, command)
 	if err != nil {
 		r.logger.Error("Failed to append command: %v", err)
@@ -1761,6 +1810,42 @@ func (r *Raft) ProposeWithForward(command []byte) (index uint64, term uint64, er
 	return reply.Index, reply.Term, nil
 }
 
+// ForwardGet returns the value stored under key. When this node is the
+// leader it answers from the configured GetHandler; otherwise it forwards the
+// request to the known leader through the transport, bounded by
+// config.RPCTimeout.
+//
+// It returns ErrNoLeader when no leader is known, and a descriptive error
+// when the leader's address is missing, the GetHandler is not configured, or
+// the forwarding RPC fails.
+func (r *Raft) ForwardGet(key string) (string, bool, error) {
+	// Snapshot the shared fields under the read lock. r.state is written
+	// while holding r.mu (see runFollower), so reading it unlocked is a data
+	// race; taking one snapshot also keeps state and leaderID consistent.
+	r.mu.RLock()
+	isLeader := r.state == Leader
+	leaderID := r.leaderID
+	leaderAddr := r.clusterNodes[leaderID]
+	r.mu.RUnlock()
+
+	// Leader: serve the read locally. The handler runs outside the lock.
+	if isLeader {
+		if r.config.GetHandler == nil {
+			return "", false, fmt.Errorf("get handler not configured")
+		}
+		val, found := r.config.GetHandler(key)
+		return val, found, nil
+	}
+
+	if leaderID == "" {
+		return "", false, ErrNoLeader
+	}
+	if leaderAddr == "" {
+		return "", false, fmt.Errorf("leader %s address not found", leaderID)
+	}
+
+	// Forward to leader
+	ctx, cancel := context.WithTimeout(context.Background(), r.config.RPCTimeout)
+	defer cancel()
+
+	reply, err := r.transport.ForwardGet(ctx, leaderAddr, &GetArgs{Key: key})
+	if err != nil {
+		return "", false, fmt.Errorf("forward failed: %w", err)
+	}
+
+	return reply.Value, reply.Found, nil
+}
+
 // HandlePropose handles forwarded propose requests
 func (r *Raft) HandlePropose(args *ProposeArgs) *ProposeReply {
 	index, term, isLeader := r.Propose(args.Command)

+ 8 - 9
server.go

@@ -2,6 +2,7 @@ package raft
 
 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"sort"
 	"strings"
@@ -131,7 +132,7 @@ func (s *KVServer) GetLinear(key string) (string, bool, error) {
 	_, err := s.Raft.ReadIndex()
 	if err != nil {
 		// If we're not leader, try forwarding
-		if err == ErrNotLeader {
+		if errors.Is(err, ErrNotLeader) {
 			return s.forwardGet(key)
 		}
 		return "", false, err
@@ -143,14 +144,7 @@ func (s *KVServer) GetLinear(key string) (string, bool, error) {
 
 // forwardGet forwards a get request to the leader
 func (s *KVServer) forwardGet(key string) (string, bool, error) {
-	leaderID := s.Raft.GetLeaderID()
-	if leaderID == "" {
-		return "", false, ErrNoLeader
-	}
-
-	// For now, return an error asking client to retry on leader
-	// A full implementation would forward the request
-	return "", false, NewRaftError(ErrNotLeader, leaderID, 100*time.Millisecond)
+	return s.Raft.ForwardGet(key)
 }
 
 // Join joins an existing cluster
@@ -181,6 +175,11 @@ func (s *KVServer) HealthCheck() HealthStatus {
 	return s.Raft.HealthCheck()
 }
 
+// GetStats returns the runtime statistics reported by the underlying Raft
+// node; it is a thin delegate to Raft.GetStats.
+func (s *KVServer) GetStats() Stats {
+	return s.Raft.GetStats()
+}
+
 // GetMetrics returns runtime metrics
 func (s *KVServer) GetMetrics() Metrics {
 	return s.Raft.GetMetrics()