|
@@ -549,7 +549,7 @@ public class ProjectService {
|
|
|
remainderNodeMap.put(currentNodeName, cpuOrder);
|
|
|
}
|
|
|
// 只有准备启动(即 currentCount == 0)的时候才指定 cpu 编号
|
|
|
- log.info("创建任务 " + taskId + " 的 yaml:是否使用 gpu (0是1否)" + isChoiceGpu + ",当前节点已创建 yaml 个数为:" + currentCount + ",当前节点名称为:" + currentNodeName + ",当前 cpu 编号为:" + cpuOrder);
|
|
|
+ log.debug("创建任务 " + taskId + " 的 yaml:是否使用 gpu (0是1否)" + isChoiceGpu + ",当前节点已创建 yaml 个数为:" + currentCount + ",当前节点名称为:" + currentNodeName + ",当前 cpu 编号为:" + cpuOrder);
|
|
|
String yamlRedisKey = projectDomainService.createTempYaml(projectId, vehicleConfigId, modelType, algorithmDockerImage, currentNodeName, partition, offset, isChoiceGpu, cpuOrder);
|
|
|
if (currentCount == 0) {
|
|
|
yamlToRunRedisKeyList.add(yamlRedisKey);
|
|
@@ -557,11 +557,11 @@ public class ProjectService {
|
|
|
messageNumber++;
|
|
|
}
|
|
|
TimeUnit.SECONDS.sleep(10);
|
|
|
- log.info("项目 " + projectId + " 共发送了 " + messageNumber + " 条消息,准备首先启动 " + yamlToRunRedisKeyList);
|
|
|
+ log.debug("项目 " + projectId + " 共发送了 " + messageNumber + " 条消息,准备首先启动 " + yamlToRunRedisKeyList);
|
|
|
for (String redisKey : yamlToRunRedisKeyList) {
|
|
|
projectDomainService.createPodBegin(projectId, redisKey);
|
|
|
}
|
|
|
- log.info("项目 " + projectId + " 已经启动 " + yamlToRunRedisKeyList);
|
|
|
+ log.debug("项目 " + projectId + " 已经启动 " + yamlToRunRedisKeyList);
|
|
|
// 项目启动之后删除等待队列中的该项目
|
|
|
projectDomainService.removeWaitQueue(DictConstants.PROJECT_WAIT_TYPE_EXECUTE, projectId);
|
|
|
|
|
@@ -572,17 +572,21 @@ public class ProjectService {
|
|
|
log.info("扩充项目 {} {} 个并行度", projectId, expandParallelism);
|
|
|
//1 获取剩余并行度和即将使用的各node的并行度
|
|
|
Map<String, Integer> remainderNodeMap = projectDomainService.getRemainderNodeMap(isChoiceGpu);
|
|
|
+ log.info("剩余并行度为:" + remainderNodeMap);
|
|
|
Map<String, Integer> nodeMapToUse = projectDomainService.getNodeMapToUse(isChoiceGpu, expandParallelism);
|
|
|
+ log.info("即将使用的并行度为:" + nodeMapToUse);
|
|
|
//2 将指定 node 的并行度减少
|
|
|
nodeMapToUse.keySet().forEach(nodeName -> projectDomainService.decrementParallelism(isChoiceGpu, nodeName, nodeMapToUse.get(nodeName)));
|
|
|
//3 获取还未运行的任务 ("project:" + projectId + ":node:" + nodeName + ":yaml")
|
|
|
final Set<String> yamlPathCacheKeySet = customRedisClient.getKeySetByPrefixAndContent(stringRedisTemplate, "project:" + projectId + ":node", "yaml");
|
|
|
+ log.info("项目 {} 还未运行的 yaml 在缓存中的 key 有 {}", projectId, yamlPathCacheKeySet);
|
|
|
if (CollectionUtil.isNotEmpty(yamlPathCacheKeySet)) {
|
|
|
// 根据节点名分组
|
|
|
final Map<String, List<String>> yamlPathCacheKeyMapGroupByNodeName = yamlPathCacheKeySet.stream().collect(Collectors.groupingBy(key -> {
|
|
|
final String[] split = key.split(":");
|
|
|
return split[3];
|
|
|
}));
|
|
|
+ log.info("yaml缓存key根据节点分组之后为:" + yamlPathCacheKeyMapGroupByNodeName);
|
|
|
// 每个节点分出一部分给两个节点
|
|
|
yamlPathCacheKeyMapGroupByNodeName.forEach((nodeNameBefore, yamlPathCacheKeySetGroupByNodeName) -> {
|
|
|
final int yamlCount = yamlPathCacheKeySetGroupByNodeName.size();
|
|
@@ -608,6 +612,7 @@ public class ProjectService {
|
|
|
final String replace = read.replace("cpu-order", "\"" + cpuOrderString + "\"");
|
|
|
FileUtil.writeStringToLocalFile(replace, yamlPath);
|
|
|
// 创建 pod
|
|
|
+ log.info("扩充项目{}的一个并行度成功。", projectId);
|
|
|
projectDomainService.createPod(projectId, yamlPathCacheKeyAfter, cpuOrderString);
|
|
|
}
|
|
|
}
|
|
@@ -678,7 +683,7 @@ public class ProjectService {
|
|
|
String algorithmDirectoryLinuxTempPath;
|
|
|
String algorithmTarLinuxTempPath = null;
|
|
|
if (algorithmEntity != null) {
|
|
|
- log.info("项目" + projectId + "需要使用仿真平台自己的算法 " + algorithmEntity);
|
|
|
+ log.debug("项目" + projectId + "使用仿真平台自己的算法 " + algorithmEntity);
|
|
|
String algorithmCode = algorithmEntity.getAlgorithmCode();
|
|
|
String dockerImport = algorithmEntity.getDockerImport();
|
|
|
dockerImage = dockerConfiguration.getRegistry() + "/algorithm_" + algorithmCode + ":latest";
|
|
@@ -718,7 +723,7 @@ public class ProjectService {
|
|
|
throw new RuntimeException("算法 " + algorithmId + " 的 mysql 数据有误!");
|
|
|
}
|
|
|
} else {
|
|
|
- log.info("项目" + projectId + "需要使用索为平台算法 " + algorithmId);
|
|
|
+ log.debug("项目" + projectId + "使用索为平台算法 " + algorithmId);
|
|
|
algorithmTarLinuxTempPath = linuxTempPath + "algorithm/" + algorithmId + ".tar";
|
|
|
String dockerImageWithoutVersion = dockerConfiguration.getRegistry() + "/algorithm_" + algorithmId;
|
|
|
dockerImage = dockerImageWithoutVersion + ":latest";
|
|
@@ -754,6 +759,8 @@ public class ProjectService {
|
|
|
*/
|
|
|
@SneakyThrows
|
|
|
public void stopProject(String projectType, String projectId) {
|
|
|
+ // 删除等待队列中的项目
|
|
|
+ projectDomainService.removeWaitQueue(DictConstants.PROJECT_WAIT_TYPE_ALL, projectId);
|
|
|
String isChoiceGpu = projectDomainService.getIsChoiceGpuByProjectId(projectId);
|
|
|
//* -------------------------------- Comment --------------------------------
|
|
|
ProjectEntity projectEntity = projectDomainService.getProjectByProjectId(projectId);
|