|
@@ -1,6 +1,7 @@
|
|
|
package com.css.simulation.resource.scheduler.manager;
|
|
|
|
|
|
import api.common.pojo.constants.DictConstants;
|
|
|
+import api.common.pojo.dto.ProjectMessageDTO;
|
|
|
import api.common.util.*;
|
|
|
import com.css.simulation.resource.scheduler.mapper.AutoSubProjectMapper;
|
|
|
import com.css.simulation.resource.scheduler.mapper.IndexMapper;
|
|
@@ -18,6 +19,8 @@ import com.css.simulation.resource.scheduler.util.ProjectUtil;
|
|
|
import com.fasterxml.jackson.databind.JsonNode;
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
import io.kubernetes.client.openapi.ApiClient;
|
|
|
+import io.kubernetes.client.openapi.models.V1Pod;
|
|
|
+import io.kubernetes.client.util.Yaml;
|
|
|
import io.minio.MinioClient;
|
|
|
import lombok.SneakyThrows;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
@@ -40,6 +43,9 @@ import java.util.stream.Collectors;
|
|
|
@Component
|
|
|
@Slf4j
|
|
|
public class TaskManager {
|
|
|
+
|
|
|
+ @Value("${scheduler.linux-path.pod-yaml-directory}")
|
|
|
+ String podYamlDirectory;
|
|
|
@Value("${minio.bucket-name}")
|
|
|
String bucketName;
|
|
|
@Value("${scheduler.linux-path.score-py}")
|
|
@@ -81,7 +87,9 @@ public class TaskManager {
|
|
|
@Resource
|
|
|
ApiClient apiClient;
|
|
|
|
|
|
- public static final String SIMULATION_NAMESPACE = "default";
|
|
|
+ @Value("${scheduler.kubernetes.namespace}")
|
|
|
+ String kubernetesNamespace;
|
|
|
+
|
|
|
|
|
|
@SneakyThrows
|
|
|
@Transactional
|
|
@@ -94,13 +102,11 @@ public class TaskManager {
|
|
|
taskMapper.updateStateWithStartTime(taskId, state, TimeUtil.getNowForMysql());
|
|
|
return false;
|
|
|
} else {
|
|
|
- KubernetesUtil.deletePod(apiClient, SIMULATION_NAMESPACE, podName);
|
|
|
+
|
|
|
+
|
|
|
+ KubernetesUtil.deletePod(apiClient, kubernetesNamespace, podName);
|
|
|
log.info("TaskManager--state 修改任务 " + taskId + "的状态为 " + state + ",pod 名称为:" + podName + ",并删除 pod。");
|
|
|
if ("Aborted".equals(state)) {
|
|
|
- if (retry(projectId, taskId, redisPrefix.getTaskRetryKey(), redisPrefix.getTaskMessageKey())) {
|
|
|
- taskMapper.updateStateById(DictConstants.TASK_RUNNING, taskId);
|
|
|
- return false;
|
|
|
- }
|
|
|
String minioPathOfErrorLog = resultPathMinio + projectId + "/" + taskId + "error.log";
|
|
|
boolean objectExist = MinioUtil.isObjectExist(minioClient, bucketName, minioPathOfErrorLog);
|
|
|
String targetEvaluate;
|
|
@@ -123,44 +129,41 @@ public class TaskManager {
|
|
|
}
|
|
|
taskMapper.updateFailStateWithStopTime(taskId, state, TimeUtil.getNowForMysql(), targetEvaluate);
|
|
|
} else if ("Terminated".equals(state)) {
|
|
|
- if (retry(projectId, taskId, redisPrefix.getTaskRetryKey(), redisPrefix.getTaskMessageKey())) {
|
|
|
- taskMapper.updateStateById(DictConstants.TASK_RUNNING, taskId);
|
|
|
- return false;
|
|
|
- }
|
|
|
taskMapper.updateFailStateWithStopTime(taskId, state, TimeUtil.getNowForMysql(), DictConstants.TASK_ERROR_REASON_3);
|
|
|
} else if ("PendingAnalysis".equals(state)) {
|
|
|
taskMapper.updateSuccessStateWithStopTime(taskId, state, TimeUtil.getNowForMysql());
|
|
|
}
|
|
|
- }
|
|
|
- int taskNum = taskMapper.selectTaskNumByProjectId(projectId);
|
|
|
- int endTaskNum = taskMapper.selectEndTaskNumByProjectId(projectId);
|
|
|
- manualProjectMapper.updateTaskCompleted(projectId, endTaskNum);
|
|
|
- log.info("TaskManager--isProjectCompleted 项目 " + projectId + " 完成进度为:" + endTaskNum + "/" + taskNum);
|
|
|
-
|
|
|
- return taskNum == endTaskNum;
|
|
|
- }
|
|
|
-
|
|
|
- public boolean retry(String projectId, String taskId, String taskRetryKey, String taskMessageKey) {
|
|
|
- try {
|
|
|
- log.info("TaskManager--retry 重试操作收到的参数为:projectId=" + projectId + ",taskId=" + taskId);
|
|
|
-
|
|
|
- String retryString = stringRedisTemplate.opsForValue().get(taskRetryKey);
|
|
|
- int retry = Integer.parseInt(Objects.requireNonNull(retryString));
|
|
|
-
|
|
|
- if (retry >= 3) {
|
|
|
+
|
|
|
+        // Load the project's running message from Redis; it carries the total and completed task counters.
+        ProjectMessageDTO projectMessageDTO = JsonUtil.jsonToBean(stringRedisTemplate.opsForValue().get(redisPrefix.getProjectRunningKey()), ProjectMessageDTO.class);
+        int taskTotal = projectMessageDTO.getTaskTotal();
+        int taskCompleted = projectMessageDTO.getTaskCompleted();
+        // Progress is logged as "completed (counting the current task) / total"; the denominator is taskTotal.
+        log.info("TaskManager--isProjectCompleted 项目 " + projectId + " 完成进度为:" + (taskCompleted + 1) + "/" + taskTotal);
+        if (taskCompleted + 1 == taskTotal) {
+            // The current task is the last one: report the project as completed.
+            return true;
+        } else {
+            // Not finished yet: write the incremented completed counter back to Redis.
+            projectMessageDTO.setTaskCompleted(taskCompleted + 1);
+            stringRedisTemplate.opsForValue().set(redisPrefix.getProjectRunningKey(), JsonUtil.beanToJson(projectMessageDTO));
                return false;
            }
|
|
|
- String taskJson = stringRedisTemplate.opsForValue().get(taskMessageKey);
|
|
|
- log.info("TaskManager--retry 重试项目 " + projectId + " 的任务 " + taskId + ",重试次数为:" + retry + ",重新发送的消息为:" + taskJson);
|
|
|
- retry++;
|
|
|
- stringRedisTemplate.opsForValue().set(taskRetryKey, retry + "");
|
|
|
- kafkaTemplate.send(projectId, taskJson);
|
|
|
- return true;
|
|
|
- } catch (Exception e) {
|
|
|
- log.error("TaskManager--retry 重试操作报错:", e);
|
|
|
- return false;
|
|
|
}
|
|
|
+ }
|
|
|
|
|
|
+    /**
+     * Starts another pod for a project by re-using the YAML of a previous pod under a new name.
+     * <p>
+     * Reads {@code podYamlDirectory + podName + ".yaml"}, generates a new pod name of the form
+     * {@code project-<projectId>-<random UUID>}, substitutes it for the {@code pod-name}
+     * placeholder, writes the result to {@code podYamlDirectory + <new name> + ".yaml"} and
+     * creates the pod in {@code kubernetesNamespace}.
+     * <p>
+     * Because of {@code @SneakyThrows}, checked exceptions raised while reading, writing,
+     * parsing or creating the pod propagate to the caller undeclared.
+     *
+     * @param projectId id of the project the new pod belongs to; becomes part of the new pod name
+     * @param podName   name of the previous pod whose YAML file is used as the source
+     */
+    @SneakyThrows
+    public void createNextPod(String projectId, String podName) {
+        String lastPodString = FileUtil.read(podYamlDirectory + podName + ".yaml");
+        String nextPodName = "project-" + projectId + "-" + StringUtil.getRandomUUID();
+        String nextPodFileName = nextPodName + ".yaml";
+        // Substitute the NEW name: using the old podName here would recreate the pod under the
+        // previous name while the YAML file is stored under the new one.
+        // NOTE(review): assumes the source YAML still contains the literal "pod-name" placeholder — confirm.
+        String nextPodString = lastPodString.replace("pod-name", nextPodName);
+        FileUtil.writeStringToLocalFile(nextPodString, podYamlDirectory + nextPodFileName);
+        V1Pod v1Pod = (V1Pod) Yaml.load(nextPodString);
+
+        KubernetesUtil.createPod(apiClient, kubernetesNamespace, v1Pod);
    }
|
|
|
|
|
|
public void prepareScore(String projectRunningKey) {
|
|
@@ -416,16 +419,14 @@ public class TaskManager {
|
|
|
|
|
|
|
|
|
@SneakyThrows
|
|
|
- public void done(PrefixTO redisPrefix, SshClient sshClient, ClientSession clientSession, String projectId) {
|
|
|
+ public void done(PrefixTO redisPrefix, SshClient sshClient, ClientSession clientSession, String projectId, String podName) {
|
|
|
|
|
|
- manualProjectMapper.updateProjectState(projectId, DictConstants.PROJECT_COMPLETED, TimeUtil.getNowForMysql());
|
|
|
+
|
|
|
+ clientSession.close();
|
|
|
+ sshClient.stop();
|
|
|
|
|
|
- Set<String> keys = stringRedisTemplate.keys(redisPrefix.getProjectRunningKey() + "*");
|
|
|
- if (CollectionUtil.isNotEmpty(keys)) {
|
|
|
- keys.forEach(key -> stringRedisTemplate.delete(key));
|
|
|
- } else {
|
|
|
- log.error("TaskService--taskState 前缀为 " + redisPrefix.getProjectRunningKey() + " 的 key 为空!");
|
|
|
- }
|
|
|
+
|
|
|
+ manualProjectMapper.updateProjectState(projectId, DictConstants.PROJECT_COMPLETED, TimeUtil.getNowForMysql());
|
|
|
|
|
|
|
|
|
|
|
@@ -436,19 +437,32 @@ public class TaskManager {
|
|
|
|
|
|
|
|
|
|
|
|
-
|
|
|
- SshUtil.execute(clientSession, "kubectl delete job project-" + projectId);
|
|
|
- clientSession.close();
|
|
|
- sshClient.stop();
|
|
|
+ Map<String, Integer> nodeMap = projectUtil.getNodeMap();
|
|
|
+ List<String> podList = KubernetesUtil.getPodByPrefix(apiClient, kubernetesNamespace, "project-" + projectId);
|
|
|
+ for (String tempPodName : podList) {
|
|
|
+
|
|
|
+ KubernetesUtil.deletePod(apiClient, kubernetesNamespace, tempPodName);
|
|
|
+
|
|
|
+ String tempNodeName = stringRedisTemplate.opsForValue().get("pod:" + tempPodName + ":node");
|
|
|
+ stringRedisTemplate.delete("pod:" + tempPodName + ":node");
|
|
|
+ int restParallelism = nodeMap.get(tempNodeName);
|
|
|
+ nodeMap.put(tempNodeName, restParallelism + 1);
|
|
|
+ }
|
|
|
+ nodeMap.forEach((tempNodeName, tempParallelism) -> {
|
|
|
+ String restParallelismKey = "node:" + tempNodeName + ":parallelism";
|
|
|
+ stringRedisTemplate.opsForValue().set(restParallelismKey, tempParallelism + "");
|
|
|
+ });
|
|
|
+
|
|
|
+
|
|
|
+ Set<String> keys = stringRedisTemplate.keys(redisPrefix.getProjectRunningKey() + "*");
|
|
|
+ if (CollectionUtil.isNotEmpty(keys)) {
|
|
|
+ keys.forEach(key -> stringRedisTemplate.delete(key));
|
|
|
+ } else {
|
|
|
+ log.error("TaskService--taskState 前缀为 " + redisPrefix.getProjectRunningKey() + " 的 key 为空!");
|
|
|
+ }
|
|
|
|
|
|
-
|
|
|
- String nodeOfProject = "project:" + projectId + ":node";
|
|
|
- String restParallelismKey = "node:" + stringRedisTemplate.opsForValue().get(nodeOfProject) + ":parallelism";
|
|
|
- String usedParallelismKey = "project:" + projectId + ":parallelism";
|
|
|
- long restParallelism = Long.parseLong(Objects.requireNonNull(stringRedisTemplate.opsForValue().get(restParallelismKey)));
|
|
|
- long usedParallelism = Long.parseLong(Objects.requireNonNull(stringRedisTemplate.opsForValue().get(usedParallelismKey)));
|
|
|
- stringRedisTemplate.opsForValue().set(restParallelismKey, (restParallelism + usedParallelism) + "");
|
|
|
|
|
|
}
|
|
|
|
|
|
+
|
|
|
}
|