|
@@ -2,7 +2,10 @@ package com.css.simulation.resource.scheduler.scheduler;
|
|
|
|
|
|
import api.common.pojo.constants.DictConstants;
|
|
import api.common.pojo.constants.DictConstants;
|
|
import api.common.pojo.dto.ProjectMessageDTO;
|
|
import api.common.pojo.dto.ProjectMessageDTO;
|
|
-import api.common.util.*;
|
|
|
|
|
|
+import api.common.util.CollectionUtil;
|
|
|
|
+import api.common.util.JsonUtil;
|
|
|
|
+import api.common.util.StringUtil;
|
|
|
|
+import api.common.util.TimeUtil;
|
|
import com.css.simulation.resource.scheduler.consumer.ProjectConsumer;
|
|
import com.css.simulation.resource.scheduler.consumer.ProjectConsumer;
|
|
import com.css.simulation.resource.scheduler.mapper.ClusterMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.ClusterMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.ManualProjectMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.ManualProjectMapper;
|
|
@@ -13,12 +16,11 @@ import com.css.simulation.resource.scheduler.pojo.po.ProjectPO;
|
|
import com.css.simulation.resource.scheduler.pojo.po.TaskPO;
|
|
import com.css.simulation.resource.scheduler.pojo.po.TaskPO;
|
|
import com.css.simulation.resource.scheduler.pojo.to.PrefixTO;
|
|
import com.css.simulation.resource.scheduler.pojo.to.PrefixTO;
|
|
import com.css.simulation.resource.scheduler.service.TaskService;
|
|
import com.css.simulation.resource.scheduler.service.TaskService;
|
|
|
|
+import com.css.simulation.resource.scheduler.util.KubernetesUtil;
|
|
import com.css.simulation.resource.scheduler.util.ProjectUtil;
|
|
import com.css.simulation.resource.scheduler.util.ProjectUtil;
|
|
import io.kubernetes.client.openapi.ApiClient;
|
|
import io.kubernetes.client.openapi.ApiClient;
|
|
import lombok.SneakyThrows;
|
|
import lombok.SneakyThrows;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import lombok.extern.slf4j.Slf4j;
|
|
-import org.apache.sshd.client.SshClient;
|
|
|
|
-import org.apache.sshd.client.session.ClientSession;
|
|
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
import org.springframework.beans.factory.annotation.Value;
|
|
import org.springframework.beans.factory.annotation.Value;
|
|
import org.springframework.data.redis.core.StringRedisTemplate;
|
|
import org.springframework.data.redis.core.StringRedisTemplate;
|
|
@@ -26,7 +28,6 @@ import org.springframework.kafka.core.KafkaTemplate;
|
|
import org.springframework.scheduling.annotation.Scheduled;
|
|
import org.springframework.scheduling.annotation.Scheduled;
|
|
import org.springframework.stereotype.Component;
|
|
import org.springframework.stereotype.Component;
|
|
|
|
|
|
-import java.io.IOException;
|
|
|
|
import java.util.List;
|
|
import java.util.List;
|
|
import java.util.Set;
|
|
import java.util.Set;
|
|
import java.util.stream.Collectors;
|
|
import java.util.stream.Collectors;
|
|
@@ -139,17 +140,13 @@ public class ProjectScheduler {
|
|
/**
|
|
/**
|
|
* 处理 pod 超时
|
|
* 处理 pod 超时
|
|
* 同时也可处理 pod 莫名关闭,因为关闭之后也会超时
|
|
* 同时也可处理 pod 莫名关闭,因为关闭之后也会超时
|
|
- *
|
|
|
|
- * @throws IOException 超时时间
|
|
|
|
*/
|
|
*/
|
|
@Scheduled(fixedDelay = 60 * 1000)
|
|
@Scheduled(fixedDelay = 60 * 1000)
|
|
- public void taskTimeout() throws IOException {
|
|
|
|
|
|
+ public void taskTimeout() {
|
|
long timeout = 2 * 60 * 1000L;
|
|
long timeout = 2 * 60 * 1000L;
|
|
- SshClient client = SshUtil.getClient();
|
|
|
|
- ClientSession session = SshUtil.getSession(client, hostname, username, password);
|
|
|
|
List<TaskPO> executingTaskList = taskMapper.selectByRunState(DictConstants.TASK_RUNNING);
|
|
List<TaskPO> executingTaskList = taskMapper.selectByRunState(DictConstants.TASK_RUNNING);
|
|
- log.info("ProjectScheduler--taskTimeout 正在运行的任务有:" + executingTaskList);
|
|
|
|
if (executingTaskList != null && executingTaskList.size() > 0) {
|
|
if (executingTaskList != null && executingTaskList.size() > 0) {
|
|
|
|
+ log.info("ProjectScheduler--taskTimeout 正在运行的任务有:" + executingTaskList);
|
|
for (TaskPO task : executingTaskList) {
|
|
for (TaskPO task : executingTaskList) {
|
|
String userId = task.getCreateUserId();
|
|
String userId = task.getCreateUserId();
|
|
String projectId = task.getPId();
|
|
String projectId = task.getPId();
|
|
@@ -169,8 +166,6 @@ public class ProjectScheduler {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- session.close();
|
|
|
|
- client.stop();
|
|
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -180,52 +175,90 @@ public class ProjectScheduler {
|
|
@Scheduled(fixedDelay = 30 * 1000)
|
|
@Scheduled(fixedDelay = 30 * 1000)
|
|
@SneakyThrows
|
|
@SneakyThrows
|
|
public void projectCheck() {
|
|
public void projectCheck() {
|
|
- SshClient client = SshUtil.getClient();
|
|
|
|
- ClientSession session = SshUtil.getSession(client, hostname, username, password);
|
|
|
|
-
|
|
|
|
//1 查询出正在运行中的 project
|
|
//1 查询出正在运行中的 project
|
|
List<ProjectPO> projectIdList = manualProjectMapper.selectByNowRunState(DictConstants.PROJECT_RUNNING);
|
|
List<ProjectPO> projectIdList = manualProjectMapper.selectByNowRunState(DictConstants.PROJECT_RUNNING);
|
|
- log.info("ProjectScheduler--projectCheck 运行中的项目有:" + projectIdList);
|
|
|
|
//2 根据 projectId 获取 pod
|
|
//2 根据 projectId 获取 pod
|
|
for (ProjectPO project : projectIdList) {
|
|
for (ProjectPO project : projectIdList) {
|
|
String projectId = project.getId();
|
|
String projectId = project.getId();
|
|
String userId = project.getCreateUserId();
|
|
String userId = project.getCreateUserId();
|
|
PrefixTO redisPrefix = projectUtil.getRedisPrefixByUserIdAndProjectId(userId, projectId);
|
|
PrefixTO redisPrefix = projectUtil.getRedisPrefixByUserIdAndProjectId(userId, projectId);
|
|
- String checkKey = redisPrefix.getProjectCheckKey();
|
|
|
|
- String lastNowString = redisTemplate.opsForValue().get(checkKey);
|
|
|
|
- String podList = SshUtil.execute(session, "kubectl get pod | grep project-" + projectId);
|
|
|
|
- log.info("ProjectScheduler--projectCheck 项目 " + projectId + " 正在运行的 pod 为:\n" + podList);
|
|
|
|
- int taskNumber = StringUtil.countSubString(podList, "project");
|
|
|
|
- if (StringUtil.isEmpty(lastNowString) && taskNumber == 0) { // 为空代表第一次,先设置时间
|
|
|
|
- redisTemplate.opsForValue().set(checkKey, TimeUtil.getNowString());
|
|
|
|
|
|
+ String lastNowString = redisTemplate.opsForValue().get(redisPrefix.getProjectCheckKey());
|
|
|
|
+ // 获取正在运行的 pod 列表
|
|
|
|
+ List<String> podList = KubernetesUtil.getPod(apiClient, "");
|
|
|
|
+ int taskNumber = podList.size();
|
|
|
|
+ // 如果没有检查过且 pod 列表为空,则正式开始检查,设置第一次检查时间
|
|
|
|
+ if (StringUtil.isEmpty(lastNowString) && taskNumber == 0) {
|
|
|
|
+ log.info("ProjectScheduler--projectCheck 开始检查项目 " + projectId);
|
|
|
|
+ redisTemplate.opsForValue().set(redisPrefix.getProjectCheckKey(), TimeUtil.getNowString());
|
|
|
|
+ return;
|
|
}
|
|
}
|
|
- if (StringUtil.isNotEmpty(lastNowString) && taskNumber == 0) { // 非空则开始检查
|
|
|
|
- // 判断两次是否超过2分钟
|
|
|
|
- //3 如果 pod 为空,则重启 job
|
|
|
|
- long lastNow = Long.parseLong(lastNowString);
|
|
|
|
- long now = Long.parseLong(TimeUtil.getNowString());
|
|
|
|
- if (now - lastNow > (long) 120 * 1000) {
|
|
|
|
- redisTemplate.opsForValue().set(checkKey, TimeUtil.getNowString());
|
|
|
|
- SshUtil.execute(session, "kubectl delete job project-" + projectId);
|
|
|
|
- Thread.sleep(15000);
|
|
|
|
- while (true) {
|
|
|
|
- log.info("ProjectScheduler--projectCheck 准备重启项目 " + projectId);
|
|
|
|
- String podList2 = SshUtil.execute(session, "kubectl get pod | grep project-" + projectId);
|
|
|
|
- log.info("ProjectScheduler--projectCheck 项目 " + projectId + " 剩余的 pod 信息为:\n" + podList2);
|
|
|
|
- int taskNumber2 = StringUtil.countSubString(podList2, "project");
|
|
|
|
- if (taskNumber2 == 0) {
|
|
|
|
- break;
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- Thread.sleep(15000);
|
|
|
|
- log.info("ProjectScheduler--projectCheck 重新执行项目" + projectId);
|
|
|
|
- String jobTemplateYamlPathTarget = jobYaml + "project-" + projectId + ".yaml";
|
|
|
|
- SshUtil.execute(session, "kubectl apply -f " + jobTemplateYamlPathTarget);
|
|
|
|
- }
|
|
|
|
|
|
+ log.info("ProjectScheduler--projectCheck 项目 " + projectId + " 正在运行的 pod 为:\n" + podList);
|
|
|
|
+ // 如果两次检查时间超过了 2 分钟,且仍然没有 pod 执行,则准备重启
|
|
|
|
+ if (StringUtil.isNotEmpty(lastNowString) && taskNumber == 0 && Long.parseLong(TimeUtil.getNowString()) - Long.parseLong(lastNowString) > (long) 120 * 1000) {
|
|
|
|
+ // 删除检查
|
|
|
|
+ redisTemplate.delete(redisPrefix.getProjectCheckKey());
|
|
|
|
+ // 删除 job
|
|
|
|
+ KubernetesUtil.deleteJob(apiClient, "default", "project-" + projectId);
|
|
|
|
+ log.info("ProjectScheduler--projectCheck 重新执行项目" + projectId);
|
|
|
|
+ KubernetesUtil.applyYaml(hostname, username, password, jobYaml + "project-" + projectId + ".yaml");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- session.close();
|
|
|
|
- client.stop();
|
|
|
|
-
|
|
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+// /**
|
|
|
|
+// * 解决 pod 莫名全部关闭但是 job 还在的问题
|
|
|
|
+// * 检查如果有 job 在运行但是 pod 全部关闭的情况,此时需要重启一下 job
|
|
|
|
+// */
|
|
|
|
+// @Scheduled(fixedDelay = 30 * 1000)
|
|
|
|
+// @SneakyThrows
|
|
|
|
+// public void projectCheck() {
|
|
|
|
+// SshClient client = SshUtil.getClient();
|
|
|
|
+// ClientSession session = SshUtil.getSession(client, hostname, username, password);
|
|
|
|
+//
|
|
|
|
+// //1 查询出正在运行中的 project
|
|
|
|
+// List<ProjectPO> projectIdList = manualProjectMapper.selectByNowRunState(DictConstants.PROJECT_RUNNING);
|
|
|
|
+// log.info("ProjectScheduler--projectCheck 运行中的项目有:" + projectIdList);
|
|
|
|
+// //2 根据 projectId 获取 pod
|
|
|
|
+// for (ProjectPO project : projectIdList) {
|
|
|
|
+// String projectId = project.getId();
|
|
|
|
+// String userId = project.getCreateUserId();
|
|
|
|
+// PrefixTO redisPrefix = projectUtil.getRedisPrefixByUserIdAndProjectId(userId, projectId);
|
|
|
|
+// String checkKey = redisPrefix.getProjectCheckKey();
|
|
|
|
+// String lastNowString = redisTemplate.opsForValue().get(checkKey);
|
|
|
|
+// String podList = SshUtil.execute(session, "kubectl get pod | grep project-" + projectId);
|
|
|
|
+// log.info("ProjectScheduler--projectCheck 项目 " + projectId + " 正在运行的 pod 为:\n" + podList);
|
|
|
|
+// int taskNumber = StringUtil.countSubString(podList, "project");
|
|
|
|
+// if (StringUtil.isEmpty(lastNowString) && taskNumber == 0) { // 为空代表第一次,先设置时间
|
|
|
|
+// redisTemplate.opsForValue().set(checkKey, TimeUtil.getNowString());
|
|
|
|
+// }
|
|
|
|
+// if (StringUtil.isNotEmpty(lastNowString) && taskNumber == 0) { // 非空则开始检查
|
|
|
|
+// // 判断两次是否超过2分钟
|
|
|
|
+// //3 如果 pod 为空,则重启 job
|
|
|
|
+// long lastNow = Long.parseLong(lastNowString);
|
|
|
|
+// long now = Long.parseLong(TimeUtil.getNowString());
|
|
|
|
+// if (now - lastNow > (long) 120 * 1000) {
|
|
|
|
+// redisTemplate.opsForValue().set(checkKey, TimeUtil.getNowString());
|
|
|
|
+// SshUtil.execute(session, "kubectl delete job project-" + projectId);
|
|
|
|
+// Thread.sleep(15000);
|
|
|
|
+// while (true) {
|
|
|
|
+// log.info("ProjectScheduler--projectCheck 准备重启项目 " + projectId);
|
|
|
|
+// String podList2 = SshUtil.execute(session, "kubectl get pod | grep project-" + projectId);
|
|
|
|
+// log.info("ProjectScheduler--projectCheck 项目 " + projectId + " 剩余的 pod 信息为:\n" + podList2);
|
|
|
|
+// int taskNumber2 = StringUtil.countSubString(podList2, "project");
|
|
|
|
+// if (taskNumber2 == 0) {
|
|
|
|
+// break;
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+// Thread.sleep(15000);
|
|
|
|
+// log.info("ProjectScheduler--projectCheck 重新执行项目" + projectId);
|
|
|
|
+// String jobTemplateYamlPathTarget = jobYaml + "project-" + projectId + ".yaml";
|
|
|
|
+// SshUtil.execute(session, "kubectl apply -f " + jobTemplateYamlPathTarget);
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+// }
|
|
|
|
+// session.close();
|
|
|
|
+// client.stop();
|
|
|
|
+//
|
|
|
|
+// }
|
|
}
|
|
}
|