腾讯真题:Redis集群扩容方案
面试重要程度:⭐⭐⭐⭐⭐
真题来源:腾讯2024社招技术面试
考察重点:Redis集群架构、数据迁移、高可用设计
预计阅读时间:45分钟
真题背景
面试官: "我们的Redis集群目前有6个节点(3主3从),存储了约500GB数据,QPS达到10万。由于业务快速增长,需要扩容到12个节点(6主6从)。请详细设计扩容方案,包括数据迁移策略、服务可用性保证、回滚预案等。另外,如果在扩容过程中发现某个节点出现故障,应该如何处理?"
考察意图:
- Redis集群架构的深度理解
- 大规模数据迁移的工程实践能力
- 高可用系统设计思维
- 故障处理和应急响应能力
- 生产环境运维经验
🎯 现状分析与扩容规划
当前集群状态分析
集群拓扑:
# 当前集群状态 redis-cli --cluster info 127.0.0.1:7000 # 节点分布 Master1 (7000): slots 0-5460 (5461 slots) Master2 (7001): slots 5461-10922 (5462 slots) Master3 (7002): slots 10923-16383 (5461 slots) Slave1 (7003): replicates Master1 Slave2 (7004): replicates Master2 Slave3 (7005): replicates Master3
性能指标分析:
/** * 集群性能监控 */ @Component public class ClusterMonitor { @Autowired private RedisClusterConnection clusterConnection; /** * 获取集群性能指标 */ public ClusterMetrics getClusterMetrics() { ClusterMetrics metrics = new ClusterMetrics(); // 获取所有主节点 Iterable<RedisClusterNode> masters = clusterConnection.clusterGetNodes() .stream() .filter(RedisClusterNode::isMaster) .collect(Collectors.toList()); for (RedisClusterNode master : masters) { NodeMetrics nodeMetrics = getNodeMetrics(master); metrics.addNodeMetrics(nodeMetrics); } return metrics; } private NodeMetrics getNodeMetrics(RedisClusterNode node) { Properties info = clusterConnection.info(node); NodeMetrics metrics = new NodeMetrics(); metrics.setNodeId(node.getId()); metrics.setHost(node.getHost()); metrics.setPort(node.getPort()); // 内存使用情况 metrics.setUsedMemory(Long.parseLong(info.getProperty("used_memory", "0"))); metrics.setMaxMemory(Long.parseLong(info.getProperty("maxmemory", "0"))); // QPS统计 metrics.setCommandsProcessed(Long.parseLong(info.getProperty("total_commands_processed", "0"))); metrics.setConnectedClients(Integer.parseInt(info.getProperty("connected_clients", "0"))); // 槽位信息 Set<SlotRange> slotRanges = node.getSlotRange(); metrics.setSlotCount(slotRanges.stream() .mapToInt(range -> range.getEnd() - range.getStart() + 1) .sum()); return metrics; } } @Data public class ClusterMetrics { private List<NodeMetrics> nodeMetrics = new ArrayList<>(); private long totalMemoryUsed; private long totalQPS; private int totalSlots = 16384; public void addNodeMetrics(NodeMetrics nodeMetrics) { this.nodeMetrics.add(nodeMetrics); this.totalMemoryUsed += nodeMetrics.getUsedMemory(); this.totalQPS += nodeMetrics.getQps(); } /** * 分析是否需要扩容 */ public boolean needsExpansion() { // 内存使用率超过70% boolean memoryPressure = nodeMetrics.stream() .anyMatch(node -> node.getMemoryUsageRatio() > 0.7); // 单节点QPS超过3万 boolean qpsPressure = nodeMetrics.stream() .anyMatch(node -> node.getQps() > 30000); // 连接数超过5000 boolean connectionPressure = 
nodeMetrics.stream() .anyMatch(node -> node.getConnectedClients() > 5000); return memoryPressure || qpsPressure || connectionPressure; } }
扩容目标规划
扩容后集群架构:
/** * 扩容规划 */ @Component public class ExpansionPlanner { /** * 制定扩容计划 */ public ExpansionPlan createExpansionPlan(ClusterMetrics currentMetrics) { ExpansionPlan plan = new ExpansionPlan(); // 目标:6主6从架构 plan.setTargetMasterCount(6); plan.setTargetSlaveCount(6); // 新增节点规划 List<NodeConfig> newNodes = Arrays.asList( new NodeConfig("192.168.1.10", 7006, NodeType.MASTER), new NodeConfig("192.168.1.11", 7007, NodeType.MASTER), new NodeConfig("192.168.1.12", 7008, NodeType.MASTER), new NodeConfig("192.168.1.13", 7009, NodeType.SLAVE), new NodeConfig("192.168.1.14", 7010, NodeType.SLAVE), new NodeConfig("192.168.1.15", 7011, NodeType.SLAVE) ); plan.setNewNodes(newNodes); // 槽位重新分配计划 plan.setSlotReallocation(calculateSlotReallocation()); // 预估迁移时间 plan.setEstimatedMigrationTime(estimateMigrationTime(currentMetrics)); return plan; } private Map<String, SlotRange> calculateSlotReallocation() { Map<String, SlotRange> allocation = new HashMap<>(); // 6个主节点,每个节点约2731个槽位 int slotsPerMaster = 16384 / 6; int remainder = 16384 % 6; for (int i = 0; i < 6; i++) { int startSlot = i * slotsPerMaster; int endSlot = (i + 1) * slotsPerMaster - 1; // 前remainder个节点多分配1个槽位 if (i < remainder) { endSlot++; } allocation.put("master" + i, new SlotRange(startSlot, endSlot)); } return allocation; } private Duration estimateMigrationTime(ClusterMetrics metrics) { // 根据数据量和网络带宽估算 long totalDataSize = metrics.getTotalMemoryUsed(); long networkBandwidth = 1000 * 1024 * 1024; // 1GB/s long migrationBandwidth = networkBandwidth / 4; // 预留75%带宽给业务 long estimatedSeconds = totalDataSize / migrationBandwidth; return Duration.ofSeconds(estimatedSeconds); } }
🚀 扩容实施方案
阶段一:环境准备
新节点部署:
#!/bin/bash
# Deployment script for the new Redis nodes: writes one config per port,
# starts a redis-server for each, then verifies that every node answers PING.

# Abort on command failure, unset variables and failed pipeline stages, so a broken
# config step never leads to starting redis-server without its configuration.
set -euo pipefail

# Ports of the new nodes; shared by the start and verify steps.
readonly NEW_PORTS=(7006 7007 7008 7009 7010 7011)

# How many times (one second apart) a node is pinged before it is reported as failed.
readonly PING_ATTEMPTS=5

# 1. Create the configuration file for a new node.
#    $1 - port the node listens on
create_node_config() {
    local port=$1
    local node_dir="/opt/redis/node-${port}"

    mkdir -p "${node_dir}"

    cat > "${node_dir}/redis.conf" << EOF
port ${port}
cluster-enabled yes
cluster-config-file nodes-${port}.conf
cluster-node-timeout 5000
appendonly yes
appendfilename "appendonly-${port}.aof"
dir ${node_dir}
logfile ${node_dir}/redis-${port}.log
pidfile /var/run/redis_${port}.pid
# 内存配置
maxmemory 8gb
maxmemory-policy allkeys-lru
# 网络配置
tcp-keepalive 300
timeout 0
# 持久化配置
save 900 1
save 300 10
save 60 10000
EOF
}

# 2. Start the new nodes, one background redis-server per port.
start_new_nodes() {
    local port
    for port in "${NEW_PORTS[@]}"; do
        echo "Starting Redis node on port ${port}..."
        create_node_config "${port}"
        redis-server "/opt/redis/node-${port}/redis.conf" &
        sleep 2
    done
}

# Returns 0 as soon as the node on port $1 answers PING with PONG,
# 1 if it has not answered after PING_ATTEMPTS tries.
wait_for_pong() {
    local port=$1
    local attempt
    for (( attempt = 1; attempt <= PING_ATTEMPTS; attempt++ )); do
        if redis-cli -p "${port}" ping 2>/dev/null | grep -q PONG; then
            return 0
        fi
        sleep 1
    done
    return 1
}

# 3. Verify node status; exits with status 1 on the first node that does not respond.
verify_nodes() {
    local port
    for port in "${NEW_PORTS[@]}"; do
        if wait_for_pong "${port}"; then
            echo "Node ${port}: OK"
        else
            echo "Node ${port}: FAILED"
            exit 1
        fi
    done
}

start_new_nodes
verify_nodes
环境检查清单:
/** * 扩容前环境检查 */ @Component public class PreExpansionChecker { /** * 执行扩容前检查 */ public CheckResult performPreExpansionCheck() { CheckResult result = new CheckResult(); // 1. 集群健康检查 result.addCheck("cluster_health", checkClusterHealth()); // 2. 节点资源检查 result.addCheck("node_resources", checkNodeResources()); // 3. 网络连通性检查 result.addCheck("network_connectivity", checkNetworkConnectivity()); // 4. 备份验证 result.addCheck("backup_verification", checkBackupStatus()); // 5. 监控系统检查 result.addCheck("monitoring_system", checkMonitoringSystem()); return result; } private boolean checkClusterHealth() { try { // 检查所有节点状态 Iterable<RedisClusterNode> nodes = clusterConnection.clusterGetNodes(); for (RedisClusterNode node : nodes) { if (node.getFlags().contains(RedisClusterNode.Flag.FAIL)) { log.error("Node {} is in FAIL state", node.getId()); return false; } } // 检查槽位分配 Properties clusterInfo = clusterConnection.clusterGetClusterInfo(); String clusterState = clusterInfo.getProperty("cluster_state"); return "ok".equals(clusterState); } catch (Exception e) { log.error("Cluster health check failed", e); return false; } } private boolean checkNodeResources() { // 检查CPU、内存、磁盘空间 return true; // 简化实现 } private boolean checkNetworkConnectivity() { // 检查新旧节点间网络连通性 return true; // 简化实现 } private boolean checkBackupStatus() { // 验证最近的备份是否可用 return true; // 简化实现 } private boolean checkMonitoringSystem() { // 确保监控系统正常工作 return true; // 简化实现 } }
阶段二:节点加入集群
添加新主节点:
/** * 集群扩容执行器 */ @Component public class ClusterExpansionExecutor { @Autowired private RedisClusterConnection clusterConnection; /** * 添加新主节点到集群 */ public void addMasterNodes(List<NodeConfig> masterNodes) { for (NodeConfig nodeConfig : masterNodes) { try { log.info("Adding master node: {}:{}", nodeConfig.getHost(), nodeConfig.getPort()); // 1. 将新节点加入集群 clusterConnection.clusterMeet(nodeConfig.getHost(), nodeConfig.getPort()); // 2. 等待节点握手完成 waitForNodeHandshake(nodeConfig); // 3. 验证节点状态 verifyNodeStatus(nodeConfig); log.info("Master node {}:{} added successfully", nodeConfig.getHost(), nodeConfig.getPort()); } catch (Exception e) { log.error("Failed to add master node {}:{}", nodeConfig.getHost(), nodeConfig.getPort(), e); throw new ExpansionException("Failed to add master node", e); } } } /** * 添加新从节点 */ public void addSlaveNodes(List<NodeConfig> slaveNodes, Map<String, String> masterSlaveMapping) { for (NodeConfig slaveConfig : slaveNodes) { try { log.info("Adding slave node: {}:{}", slaveConfig.getHost(), slaveConfig.getPort()); // 1. 将从节点加入集群 clusterConnection.clusterMeet(slaveConfig.getHost(), slaveConfig.getPort()); // 2. 等待握手完成 waitForNodeHandshake(slaveConfig); // 3. 设置主从关系 String masterId = masterSlaveMapping.get(slaveConfig.getNodeId()); clusterConnection.clusterReplicate(slaveConfig.getNodeId(), masterId); // 4. 
验证主从关系 verifyReplicationStatus(slaveConfig, masterId); log.info("Slave node {}:{} added successfully", slaveConfig.getHost(), slaveConfig.getPort()); } catch (Exception e) { log.error("Failed to add slave node {}:{}", slaveConfig.getHost(), slaveConfig.getPort(), e); throw new ExpansionException("Failed to add slave node", e); } } } private void waitForNodeHandshake(NodeConfig nodeConfig) throws InterruptedException { int maxRetries = 30; int retryCount = 0; while (retryCount < maxRetries) { try { RedisClusterNode node = findNodeById(nodeConfig.getNodeId()); if (node != null && !node.getFlags().contains(RedisClusterNode.Flag.HANDSHAKE)) { return; // 握手完成 } } catch (Exception e) { // 忽略异常,继续重试 } Thread.sleep(1000); retryCount++; } throw new ExpansionException("Node handshake timeout: " + nodeConfig.getNodeId()); } }
阶段三:槽位迁移
槽位迁移策略:
/** * 槽位迁移管理器 */ @Component public class SlotMigrationManager { @Autowired private RedisClusterConnection clusterConnection; /** * 执行槽位迁移 */ public void migrateSlots(Map<String, SlotRange> reallocationPlan) { // 按批次迁移,避免对业务造成太大影响 int batchSize = 100; // 每批迁移100个槽位 for (Map.Entry<String, SlotRange> entry : reallocationPlan.entrySet()) { String targetNodeId = entry.getKey(); SlotRange slotRange = entry.getValue(); migrateSlotRange(targetNodeId, slotRange, batchSize); } } private void migrateSlotRange(String targetNodeId, SlotRange slotRange, int batchSize) { int startSlot = slotRange.getStart(); int endSlot = slotRange.getEn
剩余60%内容,订阅专栏后可继续查看/也可单篇购买
Java面试圣经 文章被收录于专栏
Java面试圣经,带你练透java圣经