|
@@ -7,13 +7,11 @@ import api.common.util.JsonUtil;
|
|
|
import api.common.util.StringUtil;
|
|
|
import api.common.util.TimeUtil;
|
|
|
import com.css.simulation.resource.scheduler.consumer.ProjectConsumer;
|
|
|
-import com.css.simulation.resource.scheduler.mapper.ClusterMapper;
|
|
|
-import com.css.simulation.resource.scheduler.mapper.ManualProjectMapper;
|
|
|
-import com.css.simulation.resource.scheduler.mapper.TaskMapper;
|
|
|
-import com.css.simulation.resource.scheduler.mapper.UserMapper;
|
|
|
+import com.css.simulation.resource.scheduler.mapper.*;
|
|
|
import com.css.simulation.resource.scheduler.pojo.po.ClusterPO;
|
|
|
import com.css.simulation.resource.scheduler.pojo.po.ProjectPO;
|
|
|
import com.css.simulation.resource.scheduler.pojo.po.TaskPO;
|
|
|
+import com.css.simulation.resource.scheduler.pojo.to.KubernetesNodeTO;
|
|
|
import com.css.simulation.resource.scheduler.pojo.to.PrefixTO;
|
|
|
import com.css.simulation.resource.scheduler.service.TaskService;
|
|
|
import com.css.simulation.resource.scheduler.util.KubernetesUtil;
|
|
@@ -21,13 +19,13 @@ import com.css.simulation.resource.scheduler.util.ProjectUtil;
|
|
|
import io.kubernetes.client.openapi.ApiClient;
|
|
|
import lombok.SneakyThrows;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
-import org.springframework.beans.factory.annotation.Autowired;
|
|
|
import org.springframework.beans.factory.annotation.Value;
|
|
|
import org.springframework.data.redis.core.StringRedisTemplate;
|
|
|
-import org.springframework.kafka.core.KafkaTemplate;
|
|
|
import org.springframework.scheduling.annotation.Scheduled;
|
|
|
import org.springframework.stereotype.Component;
|
|
|
|
|
|
+import javax.annotation.Resource;
|
|
|
+import java.util.ArrayList;
|
|
|
import java.util.List;
|
|
|
import java.util.Set;
|
|
|
|
|
@@ -44,25 +42,23 @@ public class ProjectScheduler {
|
|
|
@Value("${scheduler.linux-path.job-yaml}")
|
|
|
String jobYaml;
|
|
|
// -------------------------------- Comment --------------------------------
|
|
|
- @Autowired
|
|
|
- StringRedisTemplate redisTemplate;
|
|
|
- @Autowired
|
|
|
+ @Resource
|
|
|
+ StringRedisTemplate stringRedisTemplate;
|
|
|
+ @Resource
|
|
|
TaskService taskService;
|
|
|
- @Autowired
|
|
|
+ @Resource
|
|
|
TaskMapper taskMapper;
|
|
|
- @Autowired
|
|
|
+ @Resource
|
|
|
ClusterMapper clusterMapper;
|
|
|
- @Autowired
|
|
|
+ @Resource
|
|
|
ManualProjectMapper manualProjectMapper;
|
|
|
- @Autowired
|
|
|
+ @Resource
|
|
|
+ AutoSubProjectMapper autoSubProjectMapper;
|
|
|
+ @Resource
|
|
|
ApiClient apiClient;
|
|
|
- @Autowired
|
|
|
- KafkaTemplate<String, String> kafkaTemplate;
|
|
|
- @Autowired
|
|
|
+ @Resource
|
|
|
ProjectConsumer projectConsumer;
|
|
|
- @Autowired
|
|
|
- UserMapper userMapper;
|
|
|
- @Autowired
|
|
|
+ @Resource
|
|
|
ProjectUtil projectUtil;
|
|
|
|
|
|
|
|
@@ -73,10 +69,18 @@ public class ProjectScheduler {
|
|
|
@SneakyThrows
|
|
|
public void dispatchProject() {
|
|
|
|
|
|
- //1 查询已经排队的项目,即已经在页面上点击运行
|
|
|
- List<ProjectPO> projectList = manualProjectMapper.selectByNowRunState(DictConstants.PROJECT_RUNNING);
|
|
|
- for (ProjectPO project : projectList) {
|
|
|
+ //1 查询已经在页面上点击运行的项目(后面会判断是排队中还是已经与运行了)
|
|
|
+ List<ProjectPO> manualProjectList = manualProjectMapper.selectByNowRunState(DictConstants.PROJECT_RUNNING);
|
|
|
+ List<ProjectPO> autoSubProjectList = autoSubProjectMapper.selectByNowRunState(DictConstants.PROJECT_RUNNING);
|
|
|
+ List<ProjectPO> allProject = new ArrayList<>();
|
|
|
+ allProject.addAll(manualProjectList);
|
|
|
+ allProject.addAll(autoSubProjectList);
|
|
|
+
|
|
|
+
|
|
|
+ for (ProjectPO project : allProject) {
|
|
|
String projectId = project.getId();
|
|
|
+ String projectType = project.getProjectType();
|
|
|
+ long parallelism = Long.parseLong(project.getParallelism());
|
|
|
String userId = project.getCreateUserId();
|
|
|
ClusterPO clusterPO = clusterMapper.selectByUserId(userId);
|
|
|
if (clusterPO == null) {
|
|
@@ -85,55 +89,58 @@ public class ProjectScheduler {
|
|
|
}
|
|
|
String clusterId = clusterPO.getId();
|
|
|
PrefixTO redisPrefix = projectUtil.getRedisPrefixByClusterIdAndProjectId(clusterId, projectId);
|
|
|
- if (StringUtil.isNotEmpty(redisTemplate.opsForValue().get(redisPrefix.getProjectRunningKey()))) {
|
|
|
- continue; // 判断项目是否已经在执行,如果执行则 continue
|
|
|
+ // -------------------------------- 判断项目是否已经在执行,如果执行则 continue --------------------------------
|
|
|
+ if (StringUtil.isNotEmpty(stringRedisTemplate.opsForValue().get(redisPrefix.getProjectRunningKey()))) {
|
|
|
+ continue;
|
|
|
}
|
|
|
+ // -------------------------------- 项目没有执行说明等待中 --------------------------------
|
|
|
int simulationLicenseNumber = clusterPO.getNumSimulationLicense();
|
|
|
// 获取该用户正在运行的项目数量
|
|
|
- Set<String> clusterRunningKeySet = redisTemplate.keys(redisPrefix.getClusterRunningPrefix() + "*");
|
|
|
- List<String> runningProjectSet;
|
|
|
+ Set<String> clusterRunningKeySet = stringRedisTemplate.keys(redisPrefix.getClusterRunningPrefix() + "*");
|
|
|
+ List<String> runningProjectSet = null;
|
|
|
// cluster:${clusterId}:running:${projectId}
|
|
|
if (CollectionUtil.isNotEmpty(clusterRunningKeySet)) {
|
|
|
runningProjectSet = projectUtil.getRunningProjectList(clusterRunningKeySet);
|
|
|
if (CollectionUtil.isNotEmpty(runningProjectSet)) {
|
|
|
- log.info("ProjectScheduler--dispatchProject 运行中的项目的 key 有:"+runningProjectSet);
|
|
|
+ log.info("ProjectScheduler--dispatchProject 运行中的项目的 key 有:" + runningProjectSet);
|
|
|
long parallelismSum = 0;
|
|
|
for (String runningProjectKey : runningProjectSet) {
|
|
|
- parallelismSum += JsonUtil.jsonToBean(redisTemplate.opsForValue().get(runningProjectKey), ProjectMessageDTO.class).getParallelism();
|
|
|
+ parallelismSum += JsonUtil.jsonToBean(stringRedisTemplate.opsForValue().get(runningProjectKey), ProjectMessageDTO.class).getParallelism();
|
|
|
}
|
|
|
if (parallelismSum < simulationLicenseNumber) {
|
|
|
- Set<String> waitingProjectSet = redisTemplate.keys(redisPrefix.getClusterWaitingPrefix() + "*");
|
|
|
- if (CollectionUtil.isEmpty(waitingProjectSet)) {
|
|
|
+ if (parallelismSum + parallelism < simulationLicenseNumber) {
|
|
|
+ run(clusterId, projectId, projectType, redisPrefix.getProjectWaitingKey(), redisPrefix.getProjectRunningKey(), parallelism);
|
|
|
return;
|
|
|
}
|
|
|
- for (String waitingProjectKey : waitingProjectSet) {
|
|
|
- Long parallelism = JsonUtil.jsonToBean(redisTemplate.opsForValue().get(waitingProjectKey), ProjectMessageDTO.class).getParallelism();
|
|
|
- if (parallelismSum + parallelism < simulationLicenseNumber) {
|
|
|
- run(clusterId, projectId, redisPrefix.getProjectWaitingKey(), redisPrefix.getProjectRunningKey());
|
|
|
- return;
|
|
|
- }
|
|
|
- }
|
|
|
}
|
|
|
- } else {
|
|
|
- run(clusterId, projectId, redisPrefix.getProjectWaitingKey(), redisPrefix.getProjectRunningKey());
|
|
|
}
|
|
|
- } else {
|
|
|
- run(clusterId, projectId, redisPrefix.getProjectWaitingKey(), redisPrefix.getProjectRunningKey());
|
|
|
+ }
|
|
|
+ if ((CollectionUtil.isEmpty(clusterRunningKeySet) || CollectionUtil.isEmpty(runningProjectSet)) && parallelism < simulationLicenseNumber) {
|
|
|
+ run(clusterId, projectId, projectType, redisPrefix.getProjectWaitingKey(), redisPrefix.getProjectRunningKey(), parallelism);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- public void run(String clusterId, String projectId, String projectWaitingKey, String projectRunningKey) {
|
|
|
- String clusterPrefix = "cluster:" + clusterId;
|
|
|
- String projectJson = redisTemplate.opsForValue().get(projectWaitingKey);
|
|
|
- redisTemplate.delete(projectWaitingKey);
|
|
|
+
|
|
|
+ public void run(String clusterId, String projectId, String projectType, String projectWaitingKey, String projectRunningKey, long parallelism) {
|
|
|
+ String projectJson = stringRedisTemplate.opsForValue().get(projectWaitingKey);
|
|
|
if (StringUtil.isEmpty(projectJson)) {
|
|
|
log.error("ProjectScheduler--run 项目 " + projectId + " 的开始消息查询失败,key 为:" + projectWaitingKey);
|
|
|
return;
|
|
|
}
|
|
|
- redisTemplate.opsForValue().set(projectRunningKey, projectJson);
|
|
|
- log.info("ProjectScheduler--run 项目 " + projectId + " 从等待队列进入执行状态!");
|
|
|
- projectConsumer.parseProject(projectJson, clusterPrefix, projectRunningKey);
|
|
|
+ //1 获取一个剩余可用并行度最大的节点
|
|
|
+ KubernetesNodeTO maxParallelismPNodeTO = projectUtil.getMaxParallelismPNode();
|
|
|
+ String maxRestParallelismNode = maxParallelismPNodeTO.getName();
|
|
|
+ Long maxRestParallelism = maxParallelismPNodeTO.getMaxParallelism();
|
|
|
+
|
|
|
+ //2 判断剩余可用并行度是否大于项目并行度,否则加入扩充队列
|
|
|
+ if (maxRestParallelism > parallelism) {
|
|
|
+ log.info("ProjectConsumer--run 集群 " + clusterId + " 将项目 " + projectId + "在节点" + maxRestParallelismNode + " 执行!");
|
|
|
+ projectConsumer.parseProject(projectId, projectType, projectJson, "cluster:" + clusterId, projectRunningKey, maxRestParallelismNode, parallelism);
|
|
|
+ } else if (maxRestParallelism > 0) {
|
|
|
+ log.info("ProjectConsumer--run 集群 " + clusterId + " 将项目 " + projectId + "在节点" + maxRestParallelismNode + " 执行!");
|
|
|
+ projectConsumer.parseProject(projectId, projectType, projectJson, "cluster:" + clusterId, projectRunningKey, maxRestParallelismNode, maxRestParallelism);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
|
|
@@ -153,7 +160,7 @@ public class ProjectScheduler {
|
|
|
String taskId = task.getId();
|
|
|
PrefixTO redisPrefix = projectUtil.getRedisPrefixByUserIdAndProjectIdAndTaksId(userId, projectId, taskId);
|
|
|
// 获取心跳时间
|
|
|
- String tickTime = redisTemplate.opsForValue().get(redisPrefix.getTaskTickKey());
|
|
|
+ String tickTime = stringRedisTemplate.opsForValue().get(redisPrefix.getTaskTickKey());
|
|
|
if (StringUtil.isEmpty(tickTime)) {
|
|
|
log.error(redisPrefix.getTaskTickKey() + ",该 key 的心跳时间为空!");
|
|
|
continue;
|
|
@@ -161,7 +168,7 @@ public class ProjectScheduler {
|
|
|
long lastTickTime = Long.parseLong(tickTime);
|
|
|
// 如果心跳超时则更改任务状态为 Aborted
|
|
|
if (TimeUtil.getNow() - lastTickTime > timeout) {
|
|
|
- String podName = redisTemplate.opsForValue().get(redisPrefix.getTaskPodKey());
|
|
|
+ String podName = stringRedisTemplate.opsForValue().get(redisPrefix.getTaskPodKey());
|
|
|
taskService.taskState(taskId, DictConstants.TASK_ABORTED, podName);
|
|
|
}
|
|
|
}
|
|
@@ -182,21 +189,21 @@ public class ProjectScheduler {
|
|
|
String projectId = project.getId();
|
|
|
String userId = project.getCreateUserId();
|
|
|
PrefixTO redisPrefix = projectUtil.getRedisPrefixByUserIdAndProjectId(userId, projectId);
|
|
|
- String lastNowString = redisTemplate.opsForValue().get(redisPrefix.getProjectCheckKey());
|
|
|
- // 获取正在运行的 pod 列表
|
|
|
- List<String> podList = KubernetesUtil.getPod(apiClient, "");
|
|
|
- int taskNumber = podList.size();
|
|
|
+ String lastNowString = stringRedisTemplate.opsForValue().get(redisPrefix.getProjectCheckKey());
|
|
|
+ // 获取该项目的 job 中正在运行的 pod 数量
|
|
|
+ List<String> allPodList = KubernetesUtil.getPod(apiClient, "");
|
|
|
+ long taskNumber = allPodList.stream().filter(podName -> podName.contains(projectId)).count();
|
|
|
// 如果没有检查过且 pod 列表为空,则正式开始检查,设置第一次检查时间
|
|
|
- if (StringUtil.isEmpty(lastNowString) && taskNumber == 0) {
|
|
|
+ if (StringUtil.isEmpty(lastNowString) && taskNumber == 0L) {
|
|
|
log.info("ProjectScheduler--projectCheck 开始检查项目 " + projectId);
|
|
|
- redisTemplate.opsForValue().set(redisPrefix.getProjectCheckKey(), TimeUtil.getNowString());
|
|
|
+ stringRedisTemplate.opsForValue().set(redisPrefix.getProjectCheckKey(), TimeUtil.getNowString());
|
|
|
return;
|
|
|
}
|
|
|
- log.info("ProjectScheduler--projectCheck 项目 " + projectId + " 正在运行的 pod 为:\n" + podList);
|
|
|
+ log.info("ProjectScheduler--projectCheck kubernetes 的命名空间 default 中正在运行的 pod 有:" + allPodList + ",其中项目 " + projectId + " 的任务个数为:" + taskNumber);
|
|
|
// 如果两次检查时间超过了 2 分钟,且仍然没有 pod 执行,则准备重启
|
|
|
- if (StringUtil.isNotEmpty(lastNowString) && taskNumber == 0 && Long.parseLong(TimeUtil.getNowString()) - Long.parseLong(lastNowString) > (long) 120 * 1000) {
|
|
|
+ if (StringUtil.isNotEmpty(lastNowString) && taskNumber == 0L && Long.parseLong(TimeUtil.getNowString()) - Long.parseLong(lastNowString) > (long) 120 * 1000) {
|
|
|
// 删除检查
|
|
|
- redisTemplate.delete(redisPrefix.getProjectCheckKey());
|
|
|
+ stringRedisTemplate.delete(redisPrefix.getProjectCheckKey());
|
|
|
try {
|
|
|
// 删除 job
|
|
|
KubernetesUtil.deleteJob(apiClient, "default", "project-" + projectId);
|
|
@@ -208,61 +215,4 @@ public class ProjectScheduler {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
-
|
|
|
-// /**
|
|
|
-// * 解决 pod 莫名全部关闭但是 job 还在的问题
|
|
|
-// * 检查如果有 job 在运行但是 pod 全部关闭的情况,此时需要重启一下 job
|
|
|
-// */
|
|
|
-// @Scheduled(fixedDelay = 30 * 1000)
|
|
|
-// @SneakyThrows
|
|
|
-// public void projectCheck() {
|
|
|
-// SshClient client = SshUtil.getClient();
|
|
|
-// ClientSession session = SshUtil.getSession(client, hostname, username, password);
|
|
|
-//
|
|
|
-// //1 查询出正在运行中的 project
|
|
|
-// List<ProjectPO> projectIdList = manualProjectMapper.selectByNowRunState(DictConstants.PROJECT_RUNNING);
|
|
|
-// log.info("ProjectScheduler--projectCheck 运行中的项目有:" + projectIdList);
|
|
|
-// //2 根据 projectId 获取 pod
|
|
|
-// for (ProjectPO project : projectIdList) {
|
|
|
-// String projectId = project.getId();
|
|
|
-// String userId = project.getCreateUserId();
|
|
|
-// PrefixTO redisPrefix = projectUtil.getRedisPrefixByUserIdAndProjectId(userId, projectId);
|
|
|
-// String checkKey = redisPrefix.getProjectCheckKey();
|
|
|
-// String lastNowString = redisTemplate.opsForValue().get(checkKey);
|
|
|
-// String podList = SshUtil.execute(session, "kubectl get pod | grep project-" + projectId);
|
|
|
-// log.info("ProjectScheduler--projectCheck 项目 " + projectId + " 正在运行的 pod 为:\n" + podList);
|
|
|
-// int taskNumber = StringUtil.countSubString(podList, "project");
|
|
|
-// if (StringUtil.isEmpty(lastNowString) && taskNumber == 0) { // 为空代表第一次,先设置时间
|
|
|
-// redisTemplate.opsForValue().set(checkKey, TimeUtil.getNowString());
|
|
|
-// }
|
|
|
-// if (StringUtil.isNotEmpty(lastNowString) && taskNumber == 0) { // 非空则开始检查
|
|
|
-// // 判断两次是否超过2分钟
|
|
|
-// //3 如果 pod 为空,则重启 job
|
|
|
-// long lastNow = Long.parseLong(lastNowString);
|
|
|
-// long now = Long.parseLong(TimeUtil.getNowString());
|
|
|
-// if (now - lastNow > (long) 120 * 1000) {
|
|
|
-// redisTemplate.opsForValue().set(checkKey, TimeUtil.getNowString());
|
|
|
-// SshUtil.execute(session, "kubectl delete job project-" + projectId);
|
|
|
-// Thread.sleep(15000);
|
|
|
-// while (true) {
|
|
|
-// log.info("ProjectScheduler--projectCheck 准备重启项目 " + projectId);
|
|
|
-// String podList2 = SshUtil.execute(session, "kubectl get pod | grep project-" + projectId);
|
|
|
-// log.info("ProjectScheduler--projectCheck 项目 " + projectId + " 剩余的 pod 信息为:\n" + podList2);
|
|
|
-// int taskNumber2 = StringUtil.countSubString(podList2, "project");
|
|
|
-// if (taskNumber2 == 0) {
|
|
|
-// break;
|
|
|
-// }
|
|
|
-// }
|
|
|
-// Thread.sleep(15000);
|
|
|
-// log.info("ProjectScheduler--projectCheck 重新执行项目" + projectId);
|
|
|
-// String jobTemplateYamlPathTarget = jobYaml + "project-" + projectId + ".yaml";
|
|
|
-// SshUtil.execute(session, "kubectl apply -f " + jobTemplateYamlPathTarget);
|
|
|
-// }
|
|
|
-// }
|
|
|
-// }
|
|
|
-// session.close();
|
|
|
-// client.stop();
|
|
|
-//
|
|
|
-// }
|
|
|
}
|