腾讯真题:Redis集群扩容方案

面试重要程度:⭐⭐⭐⭐⭐

真题来源:腾讯2024社招技术面试

考察重点:Redis集群架构、数据迁移、高可用设计

预计阅读时间:45分钟

真题背景

面试官: "我们的Redis集群目前有6个节点(3主3从),存储了约500GB数据,QPS达到10万。由于业务快速增长,需要扩容到12个节点(6主6从)。请详细设计扩容方案,包括数据迁移策略、服务可用性保证、回滚预案等。另外,如果在扩容过程中发现某个节点出现故障,应该如何处理?"

考察意图:

  • Redis集群架构的深度理解
  • 大规模数据迁移的工程实践能力
  • 高可用系统设计思维
  • 故障处理和应急响应能力
  • 生产环境运维经验

🎯 现状分析与扩容规划

当前集群状态分析

集群拓扑:

# 当前集群状态
redis-cli --cluster info 127.0.0.1:7000

# 节点分布
Master1 (7000): slots 0-5460     (5461 slots) 
Master2 (7001): slots 5461-10922 (5462 slots)
Master3 (7002): slots 10923-16383 (5461 slots)

Slave1  (7003): replicates Master1
Slave2  (7004): replicates Master2  
Slave3  (7005): replicates Master3

性能指标分析:

/**
 * Collects per-node performance metrics from every master node of the Redis cluster.
 */
@Component
public class ClusterMonitor {
    
    @Autowired
    private RedisClusterConnection clusterConnection;
    
    /**
     * Builds a cluster-wide snapshot by gathering metrics from each master node.
     *
     * @return aggregated metrics holding one {@code NodeMetrics} entry per master
     */
    public ClusterMetrics getClusterMetrics() {
        ClusterMetrics metrics = new ClusterMetrics();
        
        // clusterGetNodes() yields an Iterable, which has no stream() method,
        // so the master nodes are selected with a plain loop instead.
        for (RedisClusterNode node : clusterConnection.clusterGetNodes()) {
            if (node.isMaster()) {
                metrics.addNodeMetrics(getNodeMetrics(node));
            }
        }
        
        return metrics;
    }
    
    /**
     * Reads the INFO output of a single node and converts it into {@code NodeMetrics}.
     *
     * @param node the cluster node to query
     * @return identity, memory, command, client and slot figures for the node
     * @throws NumberFormatException if a numeric INFO field is present but not parseable
     */
    private NodeMetrics getNodeMetrics(RedisClusterNode node) {
        Properties info = clusterConnection.info(node);
        
        NodeMetrics metrics = new NodeMetrics();
        metrics.setNodeId(node.getId());
        metrics.setHost(node.getHost());
        metrics.setPort(node.getPort());
        
        // Memory usage; a missing field falls back to 0.
        metrics.setUsedMemory(Long.parseLong(info.getProperty("used_memory", "0")));
        metrics.setMaxMemory(Long.parseLong(info.getProperty("maxmemory", "0")));
        
        // Command and client counters.
        // NOTE(review): total_commands_processed is a cumulative counter, not a rate, and
        // ClusterMetrics reads getQps(), which is never set here — confirm where QPS is derived.
        metrics.setCommandsProcessed(Long.parseLong(info.getProperty("total_commands_processed", "0")));
        metrics.setConnectedClients(Integer.parseInt(info.getProperty("connected_clients", "0")));
        
        // Slot ownership: getSlotRange() returns a single SlotRange whose getSlots()
        // lists every slot served by the node, so its size is the slot count.
        metrics.setSlotCount(node.getSlotRange().getSlots().size());
        
        return metrics;
    }
}

@Data
public class ClusterMetrics {
    private List<NodeMetrics> nodeMetrics = new ArrayList<>();
    private long totalMemoryUsed;
    private long totalQPS;
    private int totalSlots = 16384;
    
    /**
     * Records one node's metrics and folds its memory and QPS into the cluster totals.
     *
     * @param nodeMetrics metrics of a single node
     */
    public void addNodeMetrics(NodeMetrics nodeMetrics) {
        this.totalQPS += nodeMetrics.getQps();
        this.totalMemoryUsed += nodeMetrics.getUsedMemory();
        this.nodeMetrics.add(nodeMetrics);
    }
    
    /**
     * Decides whether the cluster should be expanded.
     *
     * @return {@code true} if any node exceeds the memory, QPS or connection threshold
     */
    public boolean needsExpansion() {
        // Any node using more than 70% of its memory
        boolean memoryPressure = false;
        for (NodeMetrics candidate : nodeMetrics) {
            if (candidate.getMemoryUsageRatio() > 0.7) {
                memoryPressure = true;
                break;
            }
        }
        
        // Any node serving more than 30,000 QPS
        boolean qpsPressure = false;
        for (NodeMetrics candidate : nodeMetrics) {
            if (candidate.getQps() > 30000) {
                qpsPressure = true;
                break;
            }
        }
        
        // Any node holding more than 5,000 client connections
        boolean connectionPressure = false;
        for (NodeMetrics candidate : nodeMetrics) {
            if (candidate.getConnectedClients() > 5000) {
                connectionPressure = true;
                break;
            }
        }
        
        return memoryPressure || qpsPressure || connectionPressure;
    }
}

扩容目标规划

扩容后集群架构:

/**
 * Produces the plan for growing the cluster to a 6-master / 6-replica layout.
 */
@Component
public class ExpansionPlanner {
    
    /** Number of hash slots the planner distributes. */
    private static final int TOTAL_SLOTS = 16384;
    
    /** Number of masters after the expansion. */
    private static final int TARGET_MASTER_COUNT = 6;
    
    /** Number of replicas after the expansion. */
    private static final int TARGET_SLAVE_COUNT = 6;
    
    /**
     * Creates the expansion plan: target topology, new nodes, slot layout and time estimate.
     *
     * @param currentMetrics current cluster metrics; its total used memory drives the time estimate
     * @return the populated expansion plan
     */
    public ExpansionPlan createExpansionPlan(ClusterMetrics currentMetrics) {
        ExpansionPlan plan = new ExpansionPlan();
        
        // Target: 6 masters and 6 replicas
        plan.setTargetMasterCount(TARGET_MASTER_COUNT);
        plan.setTargetSlaveCount(TARGET_SLAVE_COUNT);
        
        // Nodes to be added
        List<NodeConfig> newNodes = Arrays.asList(
            new NodeConfig("192.168.1.10", 7006, NodeType.MASTER),
            new NodeConfig("192.168.1.11", 7007, NodeType.MASTER),
            new NodeConfig("192.168.1.12", 7008, NodeType.MASTER),
            new NodeConfig("192.168.1.13", 7009, NodeType.SLAVE),
            new NodeConfig("192.168.1.14", 7010, NodeType.SLAVE),
            new NodeConfig("192.168.1.15", 7011, NodeType.SLAVE)
        );
        plan.setNewNodes(newNodes);
        
        // Slot layout after the expansion
        plan.setSlotReallocation(calculateSlotReallocation());
        
        // Estimated migration duration
        plan.setEstimatedMigrationTime(estimateMigrationTime(currentMetrics));
        
        return plan;
    }
    
    /**
     * Splits all slots into contiguous, non-overlapping ranges, one per target master.
     *
     * <p>16384 is not divisible by 6, so the first {@code TOTAL_SLOTS % TARGET_MASTER_COUNT}
     * masters receive one extra slot. Each range starts right after the previous one ends,
     * so no slot is assigned twice and the last range ends at slot 16383.
     *
     * @return map from the placeholder labels {@code "master0"}..{@code "master5"} to slot ranges
     */
    private Map<String, SlotRange> calculateSlotReallocation() {
        Map<String, SlotRange> allocation = new HashMap<>();
        
        int slotsPerMaster = TOTAL_SLOTS / TARGET_MASTER_COUNT;
        int remainder = TOTAL_SLOTS % TARGET_MASTER_COUNT;
        
        // Running start offset: deriving it from i * slotsPerMaster would ignore the extra
        // slots already handed out, producing overlaps and leaving the tail unassigned.
        int startSlot = 0;
        for (int i = 0; i < TARGET_MASTER_COUNT; i++) {
            int rangeSize = slotsPerMaster + (i < remainder ? 1 : 0);
            int endSlot = startSlot + rangeSize - 1;
            
            allocation.put("master" + i, new SlotRange(startSlot, endSlot));
            startSlot = endSlot + 1;
        }
        
        return allocation;
    }
    
    /**
     * Estimates migration time from the total used memory and an assumed bandwidth budget.
     *
     * @param metrics cluster metrics supplying the total used memory in bytes
     * @return total used memory divided by the bandwidth reserved for migration, in whole seconds
     */
    private Duration estimateMigrationTime(ClusterMetrics metrics) {
        long totalDataSize = metrics.getTotalMemoryUsed();
        long networkBandwidth = 1000L * 1024 * 1024; // assumed link speed, roughly 1 GB/s
        long migrationBandwidth = networkBandwidth / 4; // keep 75% of the bandwidth for business traffic
        
        long estimatedSeconds = totalDataSize / migrationBandwidth;
        return Duration.ofSeconds(estimatedSeconds);
    }
}

🚀 扩容实施方案

阶段一:环境准备

新节点部署:

#!/bin/bash
# 新节点部署脚本

# 1. Write the redis.conf for one new cluster node.
# Arguments:
#   $1 - port the node listens on; also used to name its directory and files
# Returns:
#   non-zero if the port is missing, the directory cannot be created,
#   or the config file cannot be written
create_node_config() {
    local port="$1"
    local node_dir="/opt/redis/node-${port}"
    
    if [ -z "${port}" ]; then
        echo "create_node_config: port argument is required" >&2
        return 1
    fi
    
    # Stop here if the directory is unavailable instead of letting the redirect below fail.
    mkdir -p "${node_dir}" || return 1
    
    cat > "${node_dir}/redis.conf" << EOF
port ${port}
cluster-enabled yes
cluster-config-file nodes-${port}.conf
cluster-node-timeout 5000
appendonly yes
appendfilename "appendonly-${port}.aof"
dir ${node_dir}
logfile ${node_dir}/redis-${port}.log
pidfile /var/run/redis_${port}.pid

# 内存配置
maxmemory 8gb
maxmemory-policy allkeys-lru

# 网络配置
tcp-keepalive 300
timeout 0

# 持久化配置
save 900 1
save 300 10
save 60 10000
EOF
}

# 2. Generate the config for each new node and launch it in the background.
# Returns non-zero as soon as a config cannot be generated, so that redis-server
# is never started against a missing or partial config file.
start_new_nodes() {
    local port
    for port in 7006 7007 7008 7009 7010 7011; do
        echo "Starting Redis node on port ${port}..."
        if ! create_node_config "${port}"; then
            echo "Failed to create config for port ${port}" >&2
            return 1
        fi
        redis-server "/opt/redis/node-${port}/redis.conf" &
        # Short pause before launching the next node.
        sleep 2
    done
}

# 3. Ping every new node; exit the script with status 1 on the first node
#    that does not answer PONG.
verify_nodes() {
    for port in 7006 7007 7008 7009 7010 7011
    do
        if ! redis-cli -p ${port} ping | grep -q PONG
        then
            echo "Node ${port}: FAILED"
            exit 1
        fi
        echo "Node ${port}: OK"
    done
}

# Entry point: launch the new nodes, then check that each one responds.
start_new_nodes; verify_nodes

环境检查清单:

/**
 * Runs the environment checks that must pass before the cluster is expanded.
 *
 * <p>NOTE(review): {@code clusterConnection} and {@code log} are used below but are not
 * declared in this class as shown — confirm they are injected/provided in the real source.
 */
@Component
public class PreExpansionChecker {
    
    /**
     * Executes every pre-expansion check and records each outcome under its own key.
     *
     * @return the result object holding one entry per check
     */
    public CheckResult performPreExpansionCheck() {
        CheckResult result = new CheckResult();
        
        // 1. Cluster health
        result.addCheck("cluster_health", checkClusterHealth());
        
        // 2. Node resources
        result.addCheck("node_resources", checkNodeResources());
        
        // 3. Network connectivity
        result.addCheck("network_connectivity", checkNetworkConnectivity());
        
        // 4. Backup verification
        result.addCheck("backup_verification", checkBackupStatus());
        
        // 5. Monitoring system
        result.addCheck("monitoring_system", checkMonitoringSystem());
        
        return result;
    }
    
    /**
     * Verifies that no node is flagged FAIL and that the cluster reports state "ok".
     *
     * @return {@code true} only when both conditions hold; {@code false} on any failure,
     *         including an exception while querying the cluster
     */
    private boolean checkClusterHealth() {
        try {
            // Reject the cluster if any node is marked as failed.
            Iterable<RedisClusterNode> nodes = clusterConnection.clusterGetNodes();
            
            for (RedisClusterNode node : nodes) {
                if (node.getFlags().contains(RedisClusterNode.Flag.FAIL)) {
                    log.error("Node {} is in FAIL state", node.getId());
                    return false;
                }
            }
            
            // clusterGetClusterInfo() returns a ClusterInfo rather than java.util.Properties;
            // getState() exposes its cluster_state field.
            String clusterState = clusterConnection.clusterGetClusterInfo().getState();
            
            return "ok".equals(clusterState);
            
        } catch (Exception e) {
            log.error("Cluster health check failed", e);
            return false;
        }
    }
    
    /** Placeholder for the CPU, memory and disk space check; currently always passes. */
    private boolean checkNodeResources() {
        return true; // simplified implementation
    }
    
    /** Placeholder for the connectivity check between old and new nodes; currently always passes. */
    private boolean checkNetworkConnectivity() {
        return true; // simplified implementation
    }
    
    /** Placeholder for verifying that the latest backup is usable; currently always passes. */
    private boolean checkBackupStatus() {
        return true; // simplified implementation
    }
    
    /** Placeholder for verifying that monitoring is working; currently always passes. */
    private boolean checkMonitoringSystem() {
        return true; // simplified implementation
    }
}

阶段二:节点加入集群

添加新主节点:

/**
 * Joins new master and replica nodes to the running cluster.
 *
 * <p>NOTE(review): {@code log}, {@code findNodeById}, {@code verifyNodeStatus} and
 * {@code verifyReplicationStatus} are used below but not declared in this class as shown —
 * confirm they exist in the real source.
 */
@Component
public class ClusterExpansionExecutor {
    
    @Autowired
    private RedisClusterConnection clusterConnection;
    
    /**
     * Adds each given node to the cluster as a new master.
     *
     * @param masterNodes nodes to join, processed in list order
     * @throws ExpansionException if joining, handshaking or verifying any node fails;
     *         processing stops at the first failing node
     */
    public void addMasterNodes(List<NodeConfig> masterNodes) {
        for (NodeConfig nodeConfig : masterNodes) {
            try {
                log.info("Adding master node: {}:{}", nodeConfig.getHost(), nodeConfig.getPort());
                
                // 1. Introduce the new node to the cluster (clusterMeet takes a RedisClusterNode)
                clusterConnection.clusterMeet(
                    new RedisClusterNode(nodeConfig.getHost(), nodeConfig.getPort()));
                
                // 2. Wait for the handshake to finish
                waitForNodeHandshake(nodeConfig);
                
                // 3. Verify the node state
                verifyNodeStatus(nodeConfig);
                
                log.info("Master node {}:{} added successfully", 
                    nodeConfig.getHost(), nodeConfig.getPort());
                
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers can still observe the interruption.
                Thread.currentThread().interrupt();
                log.error("Failed to add master node {}:{}", 
                    nodeConfig.getHost(), nodeConfig.getPort(), e);
                throw new ExpansionException("Failed to add master node", e);
            } catch (Exception e) {
                log.error("Failed to add master node {}:{}", 
                    nodeConfig.getHost(), nodeConfig.getPort(), e);
                throw new ExpansionException("Failed to add master node", e);
            }
        }
    }
    
    /**
     * Adds each given node to the cluster and attaches it to its master as a replica.
     *
     * @param slaveNodes nodes to join as replicas, processed in list order
     * @param masterSlaveMapping maps a replica's node id to the node id of its master
     * @throws ExpansionException if any step fails for a node, including a missing
     *         master mapping; processing stops at the first failing node
     */
    public void addSlaveNodes(List<NodeConfig> slaveNodes, Map<String, String> masterSlaveMapping) {
        for (NodeConfig slaveConfig : slaveNodes) {
            try {
                log.info("Adding slave node: {}:{}", slaveConfig.getHost(), slaveConfig.getPort());
                
                // 1. Introduce the replica to the cluster
                clusterConnection.clusterMeet(
                    new RedisClusterNode(slaveConfig.getHost(), slaveConfig.getPort()));
                
                // 2. Wait for the handshake to finish
                waitForNodeHandshake(slaveConfig);
                
                // 3. Set up replication; fail with a clear message rather than passing a null master id on
                String masterId = masterSlaveMapping.get(slaveConfig.getNodeId());
                if (masterId == null) {
                    throw new ExpansionException(
                        "No master mapped for slave node: " + slaveConfig.getNodeId());
                }
                // clusterReplicate expects (master, replica) as RedisClusterNode instances.
                clusterConnection.clusterReplicate(
                    new RedisClusterNode(masterId),
                    new RedisClusterNode(slaveConfig.getNodeId()));
                
                // 4. Verify the replication relationship
                verifyReplicationStatus(slaveConfig, masterId);
                
                log.info("Slave node {}:{} added successfully", 
                    slaveConfig.getHost(), slaveConfig.getPort());
                
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers can still observe the interruption.
                Thread.currentThread().interrupt();
                log.error("Failed to add slave node {}:{}", 
                    slaveConfig.getHost(), slaveConfig.getPort(), e);
                throw new ExpansionException("Failed to add slave node", e);
            } catch (Exception e) {
                log.error("Failed to add slave node {}:{}", 
                    slaveConfig.getHost(), slaveConfig.getPort(), e);
                throw new ExpansionException("Failed to add slave node", e);
            }
        }
    }
    
    /**
     * Polls once per second, up to 30 times, until the node is known and no longer
     * carries the HANDSHAKE flag.
     *
     * <p>NOTE(review): the lookup relies on {@code nodeConfig.getNodeId()} — confirm the id
     * is populated before the node has joined the cluster.
     *
     * @param nodeConfig the node whose handshake is awaited
     * @throws InterruptedException if the thread is interrupted while sleeping between polls
     * @throws ExpansionException if the handshake has not completed after all attempts
     */
    private void waitForNodeHandshake(NodeConfig nodeConfig) throws InterruptedException {
        int maxRetries = 30;
        
        for (int attempt = 0; attempt < maxRetries; attempt++) {
            try {
                RedisClusterNode node = findNodeById(nodeConfig.getNodeId());
                if (node != null && !node.getFlags().contains(RedisClusterNode.Flag.HANDSHAKE)) {
                    return; // handshake finished
                }
            } catch (Exception e) {
                // A failed lookup is treated as "not ready yet"; record it and poll again.
                log.debug("Handshake poll failed for node {}, retrying", nodeConfig.getNodeId(), e);
            }
            
            Thread.sleep(1000);
        }
        
        throw new ExpansionException("Node handshake timeout: " + nodeConfig.getNodeId());
    }
}

阶段三:槽位迁移

槽位迁移策略:

/**
 * 槽位迁移管理器
 */
@Component
public class SlotMigrationManager {
    
    @Autowired
    private RedisClusterConnection clusterConnection;
    
    /**
     * Migrates the slots of every entry in the reallocation plan.
     *
     * @param reallocationPlan maps a target node id to the slot range it should receive
     */
    public void migrateSlots(Map<String, SlotRange> reallocationPlan) {
        // Slots move in batches of 100 to limit the impact on business traffic.
        int batchSize = 100;
        
        reallocationPlan.entrySet().forEach(planEntry ->
            migrateSlotRange(planEntry.getKey(), planEntry.getValue(), batchSize));
    }
    
    private void migrateSlotRange(String targetNodeId, SlotRange slotRange, int batchSize) {
        int startSlot = slotRange.getStart();
        int endSlot = slotRange.getEn

剩余60%内容,订阅专栏后可继续查看/也可单篇购买

Java面试圣经 文章被收录于专栏

Java面试圣经,带你练透java圣经

全部评论
欢迎讨论
点赞 回复 分享
发布于 09-06 11:27 江西

相关推荐

评论
点赞
1
分享

创作者周榜

更多
牛客网
牛客网在线编程
牛客网题解
牛客企业服务