martin 3 年之前
父节点
当前提交
9850417e59

+ 2 - 3
simulation-resource-scheduler/src/main/java/com/css/simulation/resource/scheduler/configuration/kubernetes/KubernetesConfiguration.java

@@ -5,7 +5,6 @@ import io.kubernetes.client.util.ClientBuilder;
 import io.kubernetes.client.util.KubeConfig;
 import org.springframework.context.annotation.Bean;
 import org.springframework.context.annotation.Configuration;
-import org.springframework.util.ResourceUtils;
 
 import java.io.File;
 import java.io.FileReader;
@@ -16,9 +15,9 @@ public class KubernetesConfiguration {
 
     @Bean
     public ApiClient apiClient() throws IOException {
-        File config = ResourceUtils.getFile("classpath:kubernetes/config");  // 开发环境可用,生产环境不行,无法从jar 包读取
+//        File config = ResourceUtils.getFile("classpath:kubernetes/config");  // 开发环境可用,生产环境不行,无法从jar 包读取
 //        File config = new File("D:\\idea-project\\simulation-cloud\\simulation-resource-scheduler\\src\\main\\resources\\kubernetes\\config");  //windows
-//        File config = new File("/root/.kube/config");   //linux
+        File config = new File("/root/.kube/config");   //linux
 //
 //        ClassPathResource classPathResource = new ClassPathResource("kubernetes/config");
 //        InputStream inputStream = classPathResource.getInputStream();

+ 1 - 1
simulation-resource-scheduler/src/main/java/com/css/simulation/resource/scheduler/mapper/TaskMapper.java

@@ -80,7 +80,7 @@ public interface TaskMapper {
     @Update("update simulation_manual_project_task\n" +
             "set run_state = #{runState}," +
             "    run_end_time = #{runStopTime}," +
-            "    run_result='Success'," +
+            "    run_result = 'Success'," +
             "    return_scene_id = #{task.returnSceneId},\n" +
             "    score           = #{task.score},\n" +
             "    target_evaluate = #{task.targetEvaluate},\n" +

+ 8 - 1
simulation-resource-scheduler/src/main/java/com/css/simulation/resource/scheduler/scheduler/TickScheduler.java

@@ -59,8 +59,15 @@ public class TickScheduler {
     KafkaTemplate<String, String> kafkaTemplate;
 
 
+    /**
+     * 解决 pod 莫名奇妙关闭的问题
+     * @throws ApiException 异常
+     */
     @Scheduled(fixedDelay = 60 * 1000)
     public void retry() throws ApiException {
+
+
+        log.info("------- TickScheduler--retry 检查 pod 是否需要重试!");
         //1 从 redis 获取 手动运行项目的 key 列表
         Set<String> keys = redisTemplate.keys("manualProject:*");
 
@@ -144,7 +151,6 @@ public class TickScheduler {
                         SshUtil.execute(session, podDeleteCommand);
 //            taskManager.updateFailStateWithStopTime(taskId, state, TimeUtil.getNowForMysql()); // 如果任务 abort 代表项目失败
                         taskMapper.updateFailStateWithStopTime(taskId, DictConstants.TASK_ABORTED, TimeUtil.getNowForMysql(), DictConstants.TASK_ERROR_REASON_1);
-                        redisTemplate.delete("podName:" + taskId);
                     }
                 }
             }
@@ -155,6 +161,7 @@ public class TickScheduler {
 
 
     /**
+     * 解决 pod 莫名全部关闭但是 job 还在的问题
      * 检查如果有 job 在运行但是 pod 全部关闭的情况,此时需要重启一下 job
      */
     @Scheduled(fixedDelay = 30 * 1000)

+ 12 - 11
simulation-resource-scheduler/src/main/java/com/css/simulation/resource/scheduler/service/TaskService.java

@@ -97,13 +97,6 @@ public class TaskService {
     @Value("${simulation-cloud.evaluation-level-uri}")
     String evaluationLevelUri;
 
-    public void taskTick(String taskId) {
-        log.info("TaskService--taskTick 任务 " + taskId + "心跳!");
-        // 刷新 redis 心跳时间
-        ProjectPO projectPO = manualProjectMapper.selectById(taskId);
-        String projectId = projectPO.getId();
-        redisTemplate.opsForValue().set(manualProjectTopic + ":" + projectId + ":" + taskId, TimeUtil.getNowString());
-    }
 
     @SneakyThrows
     public void taskState(String taskId, String state, String podName) {
@@ -121,7 +114,6 @@ public class TaskService {
                     + ",并执行删除 pod 命令:" + podDeleteCommand);
             SshUtil.execute(session, podDeleteCommand);
 //            taskManager.updateFailStateWithStopTime(taskId, state, TimeUtil.getNowForMysql()); // 如果任务 abort 代表项目失败
-            redisTemplate.delete("podName:" + taskId);
             //result-path-minio: /project/manual-project/
             String minioPathOfErrorLog = resultPathMinio + projectId + "/" + taskId + "error.log";
             boolean objectExist = MinioUtil.isObjectExist(minioClient, bucketName, minioPathOfErrorLog);
@@ -152,17 +144,14 @@ public class TaskService {
                     + ",并执行删除 pod 命令:" + podDeleteCommand);
             SshUtil.execute(session, podDeleteCommand);
             taskMapper.updateFailStateWithStopTime(taskId, state, TimeUtil.getNowForMysql(), DictConstants.TASK_ERROR_REASON_3);
-            redisTemplate.delete("podName:" + taskId);
         } else if ("PendingAnalysis".equals(state)) {
             log.info("TaskService--state 修改任务 " + taskId + "的状态为 PendingAnalysis,pod 名称为:" + podName
                     + ",并执行删除 pod 命令:" + podDeleteCommand);
             SshUtil.execute(session, podDeleteCommand);
             taskMapper.updateSuccessStateWithStopTime(taskId, state, TimeUtil.getNowForMysql());
-            redisTemplate.delete("podName:" + taskId);
         } else {
             log.error("TaskService--state 出现了未知状态:" + state);
             taskMapper.updateFailStateWithStopTime(taskId, state, TimeUtil.getNowForMysql(), DictConstants.TASK_ERROR_REASON_4);
-            redisTemplate.delete("podName:" + taskId);
         }
         ProjectPO projectPO = manualProjectMapper.selectById(taskId);
         if (projectPO == null) {
@@ -410,4 +399,16 @@ public class TaskService {
             computeFirst(parentIndexList, projectId);
         }
     }
+
+    public void taskTick(String taskId) {
+        log.info("TaskService--taskTick 任务 " + taskId + "心跳!");
+        // 刷新 redis 心跳时间
+        ProjectPO projectPO = manualProjectMapper.selectById(taskId);
+        String projectId = projectPO.getId();
+        redisTemplate.opsForValue().set(manualProjectTopic + ":" + projectId + ":" + taskId, TimeUtil.getNowString());
+    }
+
 }
+
+
+