|
@@ -3,17 +3,14 @@ package com.css.simulation.resource.scheduler.util;
|
|
import api.common.pojo.constants.DictConstants;
|
|
import api.common.pojo.constants.DictConstants;
|
|
import api.common.pojo.dto.ProjectMessageDTO;
|
|
import api.common.pojo.dto.ProjectMessageDTO;
|
|
import api.common.util.*;
|
|
import api.common.util.*;
|
|
|
|
+import com.css.simulation.resource.scheduler.configuration.esmini.EsminiConfiguration;
|
|
import com.css.simulation.resource.scheduler.configuration.kubernetes.KubernetesConfiguration;
|
|
import com.css.simulation.resource.scheduler.configuration.kubernetes.KubernetesConfiguration;
|
|
import com.css.simulation.resource.scheduler.configuration.redis.CustomRedisClient;
|
|
import com.css.simulation.resource.scheduler.configuration.redis.CustomRedisClient;
|
|
-import com.css.simulation.resource.scheduler.entity.ProjectEntity;
|
|
|
|
-import com.css.simulation.resource.scheduler.entity.UserEntity;
|
|
|
|
|
|
+import com.css.simulation.resource.scheduler.entity.*;
|
|
import com.css.simulation.resource.scheduler.mapper.AutoSubProjectMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.AutoSubProjectMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.ClusterMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.ClusterMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.ManualProjectMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.ManualProjectMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.UserMapper;
|
|
import com.css.simulation.resource.scheduler.mapper.UserMapper;
|
|
-import com.css.simulation.resource.scheduler.entity.KubernetesNodeEntity;
|
|
|
|
-import com.css.simulation.resource.scheduler.entity.NodeEntity;
|
|
|
|
-import com.css.simulation.resource.scheduler.entity.PrefixEntity;
|
|
|
|
import io.kubernetes.client.openapi.ApiClient;
|
|
import io.kubernetes.client.openapi.ApiClient;
|
|
import io.kubernetes.client.openapi.ApiException;
|
|
import io.kubernetes.client.openapi.ApiException;
|
|
import lombok.SneakyThrows;
|
|
import lombok.SneakyThrows;
|
|
@@ -55,6 +52,8 @@ public class ProjectUtil {
|
|
@Resource
|
|
@Resource
|
|
private ClusterMapper clusterMapper;
|
|
private ClusterMapper clusterMapper;
|
|
@Resource
|
|
@Resource
|
|
|
|
+ private EsminiConfiguration esminiConfiguration;
|
|
|
|
+ @Resource
|
|
private KubernetesConfiguration kubernetesConfiguration;
|
|
private KubernetesConfiguration kubernetesConfiguration;
|
|
@Resource
|
|
@Resource
|
|
private ApiClient apiClient;
|
|
private ApiClient apiClient;
|
|
@@ -148,7 +147,7 @@ public class ProjectUtil {
|
|
if (CollectionUtil.isEmpty(yamlPathCacheKeySet)) {
|
|
if (CollectionUtil.isEmpty(yamlPathCacheKeySet)) {
|
|
// 如果当前节点没有下一个yaml,则返回一个并行度。
|
|
// 如果当前节点没有下一个yaml,则返回一个并行度。
|
|
log.info("createNextPod3() 节点 " + nodeName + " 已经执行完被分配的项目 " + projectId + " 的所有 pod。");
|
|
log.info("createNextPod3() 节点 " + nodeName + " 已经执行完被分配的项目 " + projectId + " 的所有 pod。");
|
|
- addOneParallelismToNode(nodeName);
|
|
|
|
|
|
+ returnOneParallelismToGpuNode(nodeName);
|
|
} else {
|
|
} else {
|
|
final String yamlPathCacheKey = new ArrayList<>(yamlPathCacheKeySet).get(0);
|
|
final String yamlPathCacheKey = new ArrayList<>(yamlPathCacheKeySet).get(0);
|
|
final String absolutePath = stringRedisTemplate.opsForValue().get(yamlPathCacheKey);
|
|
final String absolutePath = stringRedisTemplate.opsForValue().get(yamlPathCacheKey);
|
|
@@ -242,14 +241,14 @@ public class ProjectUtil {
|
|
* @return 节点映射(节点名,并行度)
|
|
* @return 节点映射(节点名,并行度)
|
|
*/
|
|
*/
|
|
public Map<String, Integer> getNodeMap() {
|
|
public Map<String, Integer> getNodeMap() {
|
|
- List<KubernetesNodeEntity> initialNodeList = kubernetesConfiguration.getNodeList(); // 预设并行度的节点列表
|
|
|
|
|
|
+ List<GpuNodeEntity> initialNodeList = kubernetesConfiguration.getNodeList(); // 预设并行度的节点列表
|
|
log.info("预设并行度的节点列表为:" + initialNodeList);
|
|
log.info("预设并行度的节点列表为:" + initialNodeList);
|
|
Map<String, Integer> resultNodeMap = new HashMap<>(); // 用于执行的节点映射(节点名,并行度)
|
|
Map<String, Integer> resultNodeMap = new HashMap<>(); // 用于执行的节点映射(节点名,并行度)
|
|
- for (KubernetesNodeEntity kubernetesNodeSource : initialNodeList) {
|
|
|
|
- KubernetesNodeEntity kubernetesNodeCopy = kubernetesNodeSource.clone();
|
|
|
|
|
|
+ for (GpuNodeEntity kubernetesNodeSource : initialNodeList) {
|
|
|
|
+ GpuNodeEntity kubernetesNodeCopy = kubernetesNodeSource.clone();
|
|
String nodeName = kubernetesNodeCopy.getName();
|
|
String nodeName = kubernetesNodeCopy.getName();
|
|
- int maxParallelism = kubernetesNodeCopy.getMaxParallelism();
|
|
|
|
- String restParallelismKey = "node:" + nodeName + ":parallelism";
|
|
|
|
|
|
+ int maxParallelism = kubernetesNodeCopy.getParallelism();
|
|
|
|
+ String restParallelismKey = "gpu-node:" + nodeName + ":parallelism";
|
|
String restParallelismString = stringRedisTemplate.opsForValue().get(restParallelismKey);
|
|
String restParallelismString = stringRedisTemplate.opsForValue().get(restParallelismKey);
|
|
int restParallelism;
|
|
int restParallelism;
|
|
if (restParallelismString == null) { // 如果剩余可用并行度没有值,说明是第一次查询,则重置成最大并行度的预设值
|
|
if (restParallelismString == null) { // 如果剩余可用并行度没有值,说明是第一次查询,则重置成最大并行度的预设值
|
|
@@ -271,29 +270,29 @@ public class ProjectUtil {
|
|
* @return 集群剩余并行度
|
|
* @return 集群剩余并行度
|
|
*/
|
|
*/
|
|
public int getRestParallelism() {
|
|
public int getRestParallelism() {
|
|
- List<KubernetesNodeEntity> initialNodeList = kubernetesConfiguration.getNodeList(); // 预设并行度的节点列表
|
|
|
|
|
|
+ List<GpuNodeEntity> initialNodeList = kubernetesConfiguration.getNodeList(); // 预设并行度的节点列表
|
|
// 遍历所有节点,获取还有剩余并行度的节点
|
|
// 遍历所有节点,获取还有剩余并行度的节点
|
|
- List<KubernetesNodeEntity> restNodeList = new ArrayList<>(); // 剩余并行度的节点列表
|
|
|
|
- for (KubernetesNodeEntity kubernetesNodeSource : initialNodeList) {
|
|
|
|
- KubernetesNodeEntity kubernetesNodeCopy = kubernetesNodeSource.clone();
|
|
|
|
|
|
+ List<GpuNodeEntity> restNodeList = new ArrayList<>(); // 剩余并行度的节点列表
|
|
|
|
+ for (GpuNodeEntity kubernetesNodeSource : initialNodeList) {
|
|
|
|
+ GpuNodeEntity kubernetesNodeCopy = kubernetesNodeSource.clone();
|
|
String nodeName = kubernetesNodeCopy.getName(); // 节点名称
|
|
String nodeName = kubernetesNodeCopy.getName(); // 节点名称
|
|
- int maxParallelism = kubernetesNodeCopy.getMaxParallelism();
|
|
|
|
- String restParallelismString = stringRedisTemplate.opsForValue().get("node:" + nodeName + ":parallelism");// 获取节点剩余并行度的 key
|
|
|
|
|
|
+ int maxParallelism = kubernetesNodeCopy.getParallelism();
|
|
|
|
+ String restParallelismString = stringRedisTemplate.opsForValue().get("gpu-node:" + nodeName + ":parallelism");// 获取节点剩余并行度的 key
|
|
// -------------------------------- Comment --------------------------------
|
|
// -------------------------------- Comment --------------------------------
|
|
int restParallelism;
|
|
int restParallelism;
|
|
if (restParallelismString == null || Integer.parseInt(restParallelismString) > maxParallelism) { // 如果剩余可用并行度没有值,说明是第一次查询,则重置成最大并行度的预设值
|
|
if (restParallelismString == null || Integer.parseInt(restParallelismString) > maxParallelism) { // 如果剩余可用并行度没有值,说明是第一次查询,则重置成最大并行度的预设值
|
|
restParallelism = maxParallelism;
|
|
restParallelism = maxParallelism;
|
|
- stringRedisTemplate.opsForValue().set("node:" + nodeName + ":parallelism", restParallelism + "");
|
|
|
|
|
|
+ stringRedisTemplate.opsForValue().set("gpu-node:" + nodeName + ":parallelism", restParallelism + "");
|
|
} else {
|
|
} else {
|
|
restParallelism = Integer.parseInt(restParallelismString);
|
|
restParallelism = Integer.parseInt(restParallelismString);
|
|
- kubernetesNodeCopy.setMaxParallelism(restParallelism);
|
|
|
|
|
|
+ kubernetesNodeCopy.setParallelism(restParallelism);
|
|
}
|
|
}
|
|
if (restParallelism > 0) {
|
|
if (restParallelism > 0) {
|
|
restNodeList.add(kubernetesNodeCopy);
|
|
restNodeList.add(kubernetesNodeCopy);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
log.info("ProjectUtil--getRestParallelism 集群剩余并行度为:" + restNodeList);
|
|
log.info("ProjectUtil--getRestParallelism 集群剩余并行度为:" + restNodeList);
|
|
- return restNodeList.size() == 0 ? 0 : restNodeList.stream().mapToInt(KubernetesNodeEntity::getMaxParallelism).sum();
|
|
|
|
|
|
+ return restNodeList.size() == 0 ? 0 : restNodeList.stream().mapToInt(GpuNodeEntity::getParallelism).sum();
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -303,23 +302,23 @@ public class ProjectUtil {
|
|
* @return 节点映射(节点名,并行度)
|
|
* @return 节点映射(节点名,并行度)
|
|
*/
|
|
*/
|
|
public Map<String, Integer> getNodeMapToUse(int parallelism) {
|
|
public Map<String, Integer> getNodeMapToUse(int parallelism) {
|
|
- List<KubernetesNodeEntity> initialNodeList = kubernetesConfiguration.getNodeList(); // 预设并行度的节点列表
|
|
|
|
|
|
+ List<GpuNodeEntity> initialNodeList = kubernetesConfiguration.getNodeList(); // 预设并行度的节点列表
|
|
log.info("预设并行度的节点列表为:" + initialNodeList);
|
|
log.info("预设并行度的节点列表为:" + initialNodeList);
|
|
// 遍历所有节点,获取还有剩余并行度的节点
|
|
// 遍历所有节点,获取还有剩余并行度的节点
|
|
- List<KubernetesNodeEntity> restNodeList = new ArrayList<>(); // 剩余并行度的节点列表
|
|
|
|
- for (KubernetesNodeEntity kubernetesNodeSource : initialNodeList) {
|
|
|
|
- KubernetesNodeEntity kubernetesNodeCopy = kubernetesNodeSource.clone();
|
|
|
|
|
|
+ List<GpuNodeEntity> restNodeList = new ArrayList<>(); // 剩余并行度的节点列表
|
|
|
|
+ for (GpuNodeEntity kubernetesNodeSource : initialNodeList) {
|
|
|
|
+ GpuNodeEntity kubernetesNodeCopy = kubernetesNodeSource.clone();
|
|
String nodeName = kubernetesNodeCopy.getName(); // 节点名称
|
|
String nodeName = kubernetesNodeCopy.getName(); // 节点名称
|
|
- int maxParallelism = kubernetesNodeCopy.getMaxParallelism();
|
|
|
|
- String restParallelismString = stringRedisTemplate.opsForValue().get("node:" + nodeName + ":parallelism");// 获取节点剩余并行度的 key
|
|
|
|
|
|
+ int maxParallelism = kubernetesNodeCopy.getParallelism();
|
|
|
|
+ String restParallelismString = stringRedisTemplate.opsForValue().get("gpu-node:" + nodeName + ":parallelism");// 获取节点剩余并行度的 key
|
|
// -------------------------------- Comment --------------------------------
|
|
// -------------------------------- Comment --------------------------------
|
|
int restParallelism;
|
|
int restParallelism;
|
|
if (restParallelismString == null || Integer.parseInt(restParallelismString) > maxParallelism) { // 如果剩余可用并行度没有值,说明是第一次查询,则重置成最大并行度的预设值
|
|
if (restParallelismString == null || Integer.parseInt(restParallelismString) > maxParallelism) { // 如果剩余可用并行度没有值,说明是第一次查询,则重置成最大并行度的预设值
|
|
restParallelism = maxParallelism;
|
|
restParallelism = maxParallelism;
|
|
- stringRedisTemplate.opsForValue().set("node:" + nodeName + ":parallelism", restParallelism + "");
|
|
|
|
|
|
+ stringRedisTemplate.opsForValue().set("gpu-node:" + nodeName + ":parallelism", restParallelism + "");
|
|
} else {
|
|
} else {
|
|
restParallelism = Integer.parseInt(restParallelismString);
|
|
restParallelism = Integer.parseInt(restParallelismString);
|
|
- kubernetesNodeCopy.setMaxParallelism(restParallelism);
|
|
|
|
|
|
+ kubernetesNodeCopy.setParallelism(restParallelism);
|
|
}
|
|
}
|
|
if (restParallelism > 0) {
|
|
if (restParallelism > 0) {
|
|
restNodeList.add(kubernetesNodeCopy);
|
|
restNodeList.add(kubernetesNodeCopy);
|
|
@@ -329,20 +328,20 @@ public class ProjectUtil {
|
|
Map<String, Integer> resultNodeMap = new HashMap<>(); // 用于执行的节点映射(节点名,并行度)
|
|
Map<String, Integer> resultNodeMap = new HashMap<>(); // 用于执行的节点映射(节点名,并行度)
|
|
if (!CollectionUtil.isEmpty(restNodeList)) {
|
|
if (!CollectionUtil.isEmpty(restNodeList)) {
|
|
if (restNodeList.size() == 1) {
|
|
if (restNodeList.size() == 1) {
|
|
- KubernetesNodeEntity tempNode = restNodeList.get(0);
|
|
|
|
|
|
+ GpuNodeEntity tempNode = restNodeList.get(0);
|
|
String tempNodeName = tempNode.getName();
|
|
String tempNodeName = tempNode.getName();
|
|
- int tempParallelism = tempNode.getMaxParallelism();
|
|
|
|
|
|
+ int tempParallelism = tempNode.getParallelism();
|
|
resultNodeMap.put(tempNodeName, Math.min(tempParallelism, parallelism));
|
|
resultNodeMap.put(tempNodeName, Math.min(tempParallelism, parallelism));
|
|
}
|
|
}
|
|
if (restNodeList.size() > 1) {
|
|
if (restNodeList.size() > 1) {
|
|
for (int i = 0; i < parallelism; i++) {
|
|
for (int i = 0; i < parallelism; i++) {
|
|
// 每次降序排序都取剩余并行度最大的一个。
|
|
// 每次降序排序都取剩余并行度最大的一个。
|
|
- restNodeList.sort((o1, o2) -> o2.getMaxParallelism() - o1.getMaxParallelism());
|
|
|
|
- KubernetesNodeEntity tempNode = restNodeList.get(0);
|
|
|
|
|
|
+ restNodeList.sort((o1, o2) -> o2.getParallelism() - o1.getParallelism());
|
|
|
|
+ GpuNodeEntity tempNode = restNodeList.get(0);
|
|
String tempNodeName = tempNode.getName();
|
|
String tempNodeName = tempNode.getName();
|
|
- int tempParallelism = tempNode.getMaxParallelism();
|
|
|
|
|
|
+ int tempParallelism = tempNode.getParallelism();
|
|
if (tempParallelism > 0) {
|
|
if (tempParallelism > 0) {
|
|
- tempNode.setMaxParallelism(tempParallelism - 1);
|
|
|
|
|
|
+ tempNode.setParallelism(tempParallelism - 1);
|
|
CollectionUtil.addValueToMap(resultNodeMap, 1, tempNodeName);
|
|
CollectionUtil.addValueToMap(resultNodeMap, 1, tempNodeName);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -477,8 +476,8 @@ public class ProjectUtil {
|
|
* @param nodeName 节点名称
|
|
* @param nodeName 节点名称
|
|
*/
|
|
*/
|
|
@Synchronized
|
|
@Synchronized
|
|
- public void addOneParallelismToNode(String nodeName) {
|
|
|
|
- String key = "node:" + nodeName + ":parallelism";
|
|
|
|
|
|
+ public void returnOneParallelismToGpuNode(String nodeName) {
|
|
|
|
+ String key = "gpu-node:" + nodeName + ":parallelism";
|
|
String parallelismString = stringRedisTemplate.opsForValue().get(key);
|
|
String parallelismString = stringRedisTemplate.opsForValue().get(key);
|
|
if (StringUtil.isEmpty(parallelismString)) {
|
|
if (StringUtil.isEmpty(parallelismString)) {
|
|
throw new RuntimeException("redisKey " + key + " 为空。");
|
|
throw new RuntimeException("redisKey " + key + " 为空。");
|
|
@@ -486,15 +485,26 @@ public class ProjectUtil {
|
|
final int parallelismBefore = Integer.parseInt(parallelismString);
|
|
final int parallelismBefore = Integer.parseInt(parallelismString);
|
|
final int parallelismAfter = parallelismBefore + 1;
|
|
final int parallelismAfter = parallelismBefore + 1;
|
|
stringRedisTemplate.opsForValue().set(key, parallelismAfter + "");
|
|
stringRedisTemplate.opsForValue().set(key, parallelismAfter + "");
|
|
- log.info("归还节点 " + nodeName + " 并行度:" + parallelismBefore + " --> " + parallelismAfter);
|
|
|
|
|
|
+ log.info("归还节点 " + nodeName + " 的 GPU 并行度:" + parallelismBefore + " --> " + parallelismAfter);
|
|
}
|
|
}
|
|
|
|
|
|
- public void parallelismAddOne(String nodeName) {
|
|
|
|
-
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- public void parallelismReduceOne(String nodeName) {
|
|
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
+ * 将 redis 中的变量 +1,需要保证同步
|
|
|
|
+ *
|
|
|
|
+ * @param nodeName 节点名称
|
|
|
|
+ */
|
|
|
|
+ @Synchronized
|
|
|
|
+ public void returnOneParallelismToCpuNode(String nodeName) {
|
|
|
|
+ String key = "cpu-node:" + nodeName + ":parallelism";
|
|
|
|
+ String parallelismString = stringRedisTemplate.opsForValue().get(key);
|
|
|
|
+ if (StringUtil.isEmpty(parallelismString)) {
|
|
|
|
+ throw new RuntimeException("redisKey " + key + " 为空。");
|
|
|
|
+ }
|
|
|
|
+ final int parallelismBefore = Integer.parseInt(parallelismString);
|
|
|
|
+ final int parallelismAfter = parallelismBefore + 1;
|
|
|
|
+ stringRedisTemplate.opsForValue().set(key, parallelismAfter + "");
|
|
|
|
+ log.info("归还节点 " + nodeName + " 的 CPU 并行度:" + parallelismBefore + " --> " + parallelismAfter);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -508,13 +518,8 @@ public class ProjectUtil {
|
|
}
|
|
}
|
|
|
|
|
|
public void resetNodeParallelism() {
|
|
public void resetNodeParallelism() {
|
|
- List<KubernetesNodeEntity> initialNodeList = kubernetesConfiguration.getNodeList();
|
|
|
|
- List<String> podNameList = KubernetesUtil.getPod(apiClient, kubernetesConfiguration.getNamespace());
|
|
|
|
- if (CollectionUtil.isEmpty(podNameList)) {
|
|
|
|
- for (KubernetesNodeEntity kubernetesNodeEntity : initialNodeList) {
|
|
|
|
- stringRedisTemplate.opsForValue().set("node:" + kubernetesNodeEntity.getName() + ":parallelism", kubernetesNodeEntity.getMaxParallelism() + "");
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+ kubernetesConfiguration.getNodeList().forEach((node) -> customRedisClient.set("gpu-node:" + node.getName() + ":parallelism", node.getParallelism() + ""));
|
|
|
|
+ esminiConfiguration.getNodeList().forEach((node) -> customRedisClient.set("cpu-node:" + node.getName() + ":parallelism", node.getParallelism() + ""));
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|