孟令鑫 преди 1 година
родител
ревизия
d1b1c958c7

+ 0 - 1
simulation-resource-scheduler/src/main/java/com/css/simulation/resource/scheduler/app/service/ProjectApplicationService.java

@@ -506,7 +506,6 @@ public class ProjectApplicationService {
         log.info("项目 " + projectId + " 运行在:" + nodeMap);
         stringRedisTemplate.opsForValue().set(projectRunningKey, JsonUtil.beanToJson(projectStartMessageEntity));
         //* -------------------------------- 3 根据算法id查询算法名称 --------------------------------
-//        String algorithmDockerImage = projectDomainService.getAlgorithmDockerImageByProjectTypeAndProjectId(projectType, projectId);
         String algorithmDockerImage = projectDomainService.getAlgorithmDockerImageByProjectId(projectId);
         // -------------------------------- 4 发送任务消息 --------------------------------
         List<NodeEntity> nodeListToCount = projectDomainService.getNodeListToCount(nodeMap);

+ 5 - 4
simulation-resource-scheduler/src/main/java/com/css/simulation/resource/scheduler/domain/service/ProjectDomainService.java

@@ -96,9 +96,10 @@ public class ProjectDomainService {
         String podYaml = getPodYamlName(nodeName, podName);     // 模板文件名称
         String yamlPath = podYamlDirectory + podYaml;
         String finalYaml;
+        String podString;
 
         if ("1".equals(modelType)) {
-            String podString = FileUtil.read(new File(vtdPodTemplateYaml));
+            podString = FileUtil.read(new File(vtdPodTemplateYaml));
             String replace0 = podString.replace("vtd-container", "vtd-" + projectId);
             String replace1 = replace0.replace("simulation-cloud-ip", simulationCloudIp);
             String replace2 = replace1.replace("kafka-ip", kafkaIp);
@@ -110,7 +111,7 @@ public class ProjectDomainService {
             String replace8 = replace7.replace("minio-secret-key", minioConfiguration.getSecretKey());
 
             String replace9 = replace8.replace("algorithm-container", "algorithm-" + projectId);
-            String replace10 = replace9.replace("algorithm-image", algorithmDockerImage);
+            String replace10 = replace9.replaceAll("algorithm-image", algorithmDockerImage);
 
             String replace11 = replace10.replace("pod-name", podName); // pod 名称包括 projectId 和 随机字符串
             String replace12 = replace11.replace("namespace-name", kubernetesConfiguration.getNamespace()); // pod 名称包括 projectId 和 随机字符串
@@ -134,7 +135,7 @@ public class ProjectDomainService {
                 throw new RuntimeException("createTempYaml() 是否使用 gpu:" + isChoiceGpu);
             }
         } else if ("2".equals(modelType)) {
-            String podString = FileUtil.read(new File(carsimPodTemplateYaml));
+            podString = FileUtil.read(new File(carsimPodTemplateYaml));
             String replace0 = podString.replace("vtd-container", "vtd-" + projectId);
             String replace1 = replace0.replace("simulation-cloud-ip", simulationCloudIp);
             String replace2 = replace1.replace("kafka-ip", kafkaIp);
@@ -146,7 +147,7 @@ public class ProjectDomainService {
             String replace8 = replace7.replaceAll("minio-secret-key", minioConfiguration.getSecretKey());
 
             String replace9 = replace8.replace("algorithm-container", "algorithm-" + projectId);
-            String replace10 = replace9.replace("algorithm-image", algorithmDockerImage);
+            String replace10 = replace9.replaceAll("algorithm-image", algorithmDockerImage);
 
             String replace11 = replace10.replace("carsim-container", "carsim-" + projectId);
             String replace12 = replace11.replace("carsim-image", kubernetesConfiguration.getCarsimImage());

+ 105 - 0
simulation-resource-scheduler/src/main/resources/kubernetes/template/pod/carsim-pod-template.yaml

@@ -0,0 +1,105 @@
+apiVersion: v1  # Pod template (CarSim variant); hyphenated lowercase tokens used as values are placeholders replaced by the scheduler at render time
+kind: Pod
+metadata:
+  name: pod-name
+  namespace: namespace-name
+  labels:
+    user: CICV
+spec:
+  nodeName: node-name  # binds the pod directly to the named node
+  dnsPolicy: None  # DNS settings come only from dnsConfig below
+  dnsConfig:
+    nameservers:
+      - 10.16.11.1
+      - 10.16.11.2
+  hostAliases:
+    - ip: 10.14.85.239
+      hostnames:
+        - simulation004
+    - ip: 10.14.85.237
+      hostnames:
+        - gpu001
+  initContainers:  # runs before the main containers; pulls the algorithm image up front
+    - name: init-algorithm  # must not contain the image placeholder token: the renderer replaces every occurrence, which would produce an invalid container name
+      image: algorithm-image
+      imagePullPolicy: Always
+      command: [ 'sh', '-c', 'echo algorithm image downloaded && sleep 2' ]
+  containers:
+    - name: vtd-container
+      image: vtd-image
+      imagePullPolicy: Never
+      command: [ "/Controller/VTDController", "vtd-command", "kafka-topic" ]
+      env:
+        - name: PodName  # exposes the pod's own name to the process via the downward API
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: LM_LICENSE_FILE
+          value: 27500@10.14.85.247
+        - name: SIMULATION_CLOUD_IP
+          value: simulation-cloud-ip
+        - name: KAFKA_IP
+          value: kafka-ip
+        - name: MINIO_IP
+          value: minio-ip
+        - name: MINIO_ACCESS_KEY
+          value: minio-access-key
+        - name: MINIO_SECRET_KEY
+          value: minio-secret-key
+        - name: KAFKA_PARTITION
+          value: kafka-partition
+        - name: KAFKA_OFFSET
+          value: kafka-offset
+        - name: CPU_ORDER
+          value: cpu-order
+      volumeMounts:
+        - name: nvidia0
+          mountPath: /dev/nvidia0
+        - name: nvidiactl
+          mountPath: /dev/nvidiactl
+      securityContext:
+        privileged: true  # NOTE(review): presumably required for the host GPU device mounts above - confirm
+    - name: algorithm-container
+      image: algorithm-image
+      imagePullPolicy: Always
+      command: [ "/bin/sh", "-c", "/run.sh; touch /tmp/hello.txt;while true;do /bin/echo $(date +%T) >> /tmp/hello.txt; sleep 600; done;" ]
+    - name: carsim-container
+      image: carsim-image
+      imagePullPolicy: Always
+      command: [ "carsim-command" ]
+      env:
+        - name: LICENSE_IP
+          value: 10.14.85.247
+        - name: SIM_FILE
+          value: /root/result/simfile.sim
+        - name: PATH_DLL
+          value: /root/libcarsim.so
+        - name: ENDPOINT
+          value: minio-ip
+        - name: ACCESSKEYID
+          value: minio-access-key
+        - name: SECRETACCESSKEY
+          value: minio-secret-key
+        - name: BUCKETNAME
+          value: minio-bucket
+        - name: LOG_DIR
+          value: log
+        - name: PAR_PATH
+          value: par-path
+        - name: LOG_PATH
+          value: log/carsim.log
+      ports:
+        - name: carsimport
+          containerPort: 8888
+#      resources:
+#        requests:
+#          cpu: "1"
+#          memory: "2Gi"
+  restartPolicy: Never  # containers are not restarted after they exit
+  volumes:  # host GPU device nodes exposed to the vtd container
+    - name: nvidia0
+      hostPath:
+        path: /dev/nvidia0
+    - name: nvidiactl
+      hostPath:
+        path: /dev/nvidiactl

+ 67 - 0
simulation-resource-scheduler/src/main/resources/kubernetes/template/pod/vtd-pod-template.yaml

@@ -0,0 +1,67 @@
+apiVersion: v1  # Pod template (VTD variant); hyphenated lowercase tokens used as values are placeholders replaced by the scheduler at render time
+kind: Pod
+metadata:
+  name: pod-name
+  namespace: namespace-name
+  labels:
+    user: CICV
+spec:
+  nodeName: node-name  # binds the pod directly to the named node
+  dnsPolicy: None  # DNS settings come only from dnsConfig below
+  dnsConfig:
+    nameservers:
+      #- 223.6.6.6
+      #- 8.8.8.8
+      - 10.16.11.1
+      - 10.16.11.2
+  hostAliases:
+    - ip: 10.14.85.239
+      hostnames:
+        - simulation004
+    - ip: 10.14.85.237
+      hostnames:
+        - gpu001
+  initContainers:  # runs before the main containers; pulls the algorithm image up front
+    - name: init-algorithm  # must not contain the image placeholder token: the renderer replaces every occurrence, which would produce an invalid container name
+      image: algorithm-image
+      imagePullPolicy: Always
+      command: ['sh', '-c', 'echo algorithm image downloaded && sleep 2']
+  containers:
+    - name: vtd-container
+      image: vtd-image
+      imagePullPolicy: Never
+      command: [ "/Controller/VTDController", "vtd-command", "kafka-topic" ]
+      resources:
+        limits:
+          nvidia.com/gpu: 1  # requests one GPU through the NVIDIA extended resource
+          #nvidia.com/mig-1g.10gb: 1
+      env:
+        - name: PodName  # exposes the pod's own name to the process via the downward API
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: LM_LICENSE_FILE
+          value: 27500@10.14.85.247
+          #value: 27500@10.14.8.24
+          #value: 27500@172.20.0.2
+        - name: SIMULATION_CLOUD_IP
+          value: simulation-cloud-ip
+        - name: KAFKA_IP
+          value: kafka-ip
+        - name: MINIO_IP
+          value: minio-ip
+        - name: MINIO_ACCESS_KEY
+          value: minio-access-key
+        - name: MINIO_SECRET_KEY
+          value: minio-secret-key
+        - name: KAFKA_PARTITION
+          value: kafka-partition
+        - name: KAFKA_OFFSET
+          value: kafka-offset
+        - name: CPU_ORDER
+          value: cpu-order
+    - name: algorithm-container
+      image: algorithm-image
+      imagePullPolicy: Never  # relies on the init container above having already pulled the same image onto the node
+      command: [ "/bin/sh", "-c", "/run.sh; touch /tmp/hello.txt;while true;do /bin/echo $(date +%T) >> /tmp/hello.txt; sleep 600; done;" ]
+  restartPolicy: Never  # containers are not restarted after they exit