martin 2 years ago
parent
commit
281f35d145

+ 7 - 14
simulation-resource-scheduler/src/main/java/com/css/simulation/resource/scheduler/manager/TaskManager.java

@@ -81,18 +81,20 @@ public class TaskManager {
     @Resource
     ApiClient apiClient;
 
+    public static final String SIMULATION_NAMESPACE = "default";
+
     @SneakyThrows
     @Transactional
     public boolean isProjectCompleted(PrefixTO redisPrefix, String projectId, String taskId, String state, String podName) {
-        if ("Running".equals(state)) {
+        if ("Running".equals(state)) {  // 运行中的 pod 无需删除
             // 将运行中的任务的 pod 名称放入 redis
             stringRedisTemplate.opsForValue().set(redisPrefix.getTaskPodKey(), podName);
             taskTick(taskId); // 刷新一下心跳
             log.info("TaskManager--state 修改任务 " + taskId + " 的状态为 " + state + ",pod 名称为:" + podName);
             taskMapper.updateStateWithStartTime(taskId, state, TimeUtil.getNowForMysql());
             return false;
-        } else {
-//            String podDeleteCommand = "kubectl delete pod " + podName;
+        } else { // 出错和结束的 pod 都直接删除
+            KubernetesUtil.deletePod(apiClient, SIMULATION_NAMESPACE, podName);
             log.info("TaskManager--state 修改任务 " + taskId + "的状态为 " + state + ",pod 名称为:" + podName + ",并删除 pod。");
             if ("Aborted".equals(state)) {
                 if (retry(projectId, taskId, redisPrefix.getTaskRetryKey(), redisPrefix.getTaskMessageKey())) {
@@ -128,15 +130,7 @@ public class TaskManager {
                 taskMapper.updateFailStateWithStopTime(taskId, state, TimeUtil.getNowForMysql(), DictConstants.TASK_ERROR_REASON_3);
             } else if ("PendingAnalysis".equals(state)) {
                 taskMapper.updateSuccessStateWithStopTime(taskId, state, TimeUtil.getNowForMysql());
-            } else {
-                if (retry(projectId, taskId, redisPrefix.getTaskRetryKey(), redisPrefix.getTaskMessageKey())) {
-                    taskMapper.updateStateById(DictConstants.TASK_RUNNING, taskId);
-                    return false;
-                }
-                taskMapper.updateFailStateWithStopTime(taskId, state, TimeUtil.getNowForMysql(), DictConstants.TASK_ERROR_REASON_4);
             }
-//            SshUtil.execute(session, podDeleteCommand);
-            KubernetesUtil.deletePod(apiClient, "default", podName);
         }
         int taskNum = taskMapper.selectTaskNumByProjectId(projectId);
         int endTaskNum = taskMapper.selectEndTaskNumByProjectId(projectId);    // 查询已结束的任务 'Aborted', 'PendingAnalysis', 'Terminated'
@@ -144,7 +138,6 @@ public class TaskManager {
         log.info("TaskManager--isProjectCompleted 项目 " + projectId + " 完成进度为:" + endTaskNum + "/" + taskNum);
         // 已结束任务数等于所有任务数量,才会准备打分;否则退出。
         return taskNum == endTaskNum;
-
     }
 
     public boolean retry(String projectId, String taskId, String taskRetryKey, String taskMessageKey) {
@@ -153,8 +146,8 @@ public class TaskManager {
             //1 首先查看任务是否重试过 3 次
             String retryString = stringRedisTemplate.opsForValue().get(taskRetryKey);
             int retry = Integer.parseInt(Objects.requireNonNull(retryString));
-            //2 如果重试次数没有超过 3 次,则重试
-            if (retry > 3) {
+            //2 如果重试次数超过 3 次,则不再重试
+            if (retry >= 3) {
                 return false;
             }
             String taskJson = stringRedisTemplate.opsForValue().get(taskMessageKey);