LingxinMeng 1 年之前
父节点
当前提交
b56c1f3860

+ 6 - 6
aarch64/pji/master/package/service/collect_one_msg.go

@@ -30,7 +30,7 @@ func collectMap() {
 		c_log.GlobalLogger.Error("程序异常退出。采集/map包", command, "出错:", s, "----", err)
 		os.Exit(-1)
 	}
-	c_log.GlobalLogger.Error("采集/map包", command, "完成。")
+	c_log.GlobalLogger.Info("采集/map包", command, "完成。")
 	config.OssMutex.Lock()
 	err = config.OssBucket.PutObjectFromFile(ossMapBagObjectKey, config.CloudConfig.MapBagPath)
 	config.OssMutex.Unlock()
@@ -38,7 +38,7 @@ func collectMap() {
 		c_log.GlobalLogger.Error("程序异常退出。上传/map包", config.CloudConfig.MapBagPath, "->", ossMapBagObjectKey, "出错:", err)
 		os.Exit(-1)
 	}
-	c_log.GlobalLogger.Error("上传/map包", config.CloudConfig.MapBagPath, "------", ossMapBagObjectKey, "成功。")
+	c_log.GlobalLogger.Info("上传/map包", config.CloudConfig.MapBagPath, "------", ossMapBagObjectKey, "成功。")
 }
 func collectTfStatic() {
 
@@ -57,7 +57,7 @@ func collectTfStatic() {
 		c_log.GlobalLogger.Error("程序异常退出。采集/tf_static包", command, "出错:", s, "----", err)
 		os.Exit(-1)
 	}
-	c_log.GlobalLogger.Error("采集/tf_static包", command, "完成。")
+	c_log.GlobalLogger.Info("采集/tf_static包", command, "完成。")
 	config.OssMutex.Lock()
 	err = config.OssBucket.PutObjectFromFile(ossMapBagObjectKey, config.CloudConfig.MapBagPath)
 	config.OssMutex.Unlock()
@@ -65,7 +65,7 @@ func collectTfStatic() {
 		c_log.GlobalLogger.Error("程序异常退出。上传/tf_static包", config.CloudConfig.MapBagPath, "->", ossMapBagObjectKey, "出错:", err)
 		os.Exit(-1)
 	}
-	c_log.GlobalLogger.Error("上传/tf_static包", config.CloudConfig.MapBagPath, "------", ossMapBagObjectKey, "成功。")
+	c_log.GlobalLogger.Info("上传/tf_static包", config.CloudConfig.MapBagPath, "------", ossMapBagObjectKey, "成功。")
 }
 func collectCostmap() {
 
@@ -84,7 +84,7 @@ func collectCostmap() {
 		c_log.GlobalLogger.Error("程序异常退出。采集/move_base/global_costmap/costmap包", command, "出错:", s, "----", err)
 		os.Exit(-1)
 	}
-	c_log.GlobalLogger.Error("采集/move_base/global_costmap/costmap包", command, "完成。")
+	c_log.GlobalLogger.Info("采集/move_base/global_costmap/costmap包", command, "完成。")
 	config.OssMutex.Lock()
 	err = config.OssBucket.PutObjectFromFile(ossMapBagObjectKey, config.CloudConfig.MapBagPath)
 	config.OssMutex.Unlock()
@@ -92,5 +92,5 @@ func collectCostmap() {
 		c_log.GlobalLogger.Error("程序异常退出。上传/move_base/global_costmap/costmap包", config.CloudConfig.MapBagPath, "->", ossMapBagObjectKey, "出错:", err)
 		os.Exit(-1)
 	}
-	c_log.GlobalLogger.Error("上传/move_base/global_costmap/costmap包", config.CloudConfig.MapBagPath, "------", ossMapBagObjectKey, "成功。")
+	c_log.GlobalLogger.Info("上传/move_base/global_costmap/costmap包", config.CloudConfig.MapBagPath, "------", ossMapBagObjectKey, "成功。")
 }

+ 10 - 16
amd64/dispatch_server/package/domain/comm_with_redis.go

@@ -44,11 +44,12 @@ func CanRunCluster() (bool, infra.GpuNode, error) {
 	return can, maxNode, nil
 }
 
-func AddWaitingUser(userId string, userParallelism int64, task entity.Task) error {
+func AddWaitingUser(userId string, userParallelism int64, algorithmObjectKey string, task entity.Task) error {
 	taskCacheTemp := entity.TaskCache{
-		UserId:          userId,
-		UserParallelism: userParallelism,
-		Task:            task,
+		UserId:             userId,
+		AlgorithmObjectKey: algorithmObjectKey,
+		UserParallelism:    userParallelism,
+		Task:               task,
 	}
 	// 转 json
 	taskCacheJson, err := TaskCacheToJson(taskCacheTemp)
@@ -76,11 +77,12 @@ func DeleteWaitingUser(taskCacheJson string) error {
 	return nil
 }
 
-func AddWaitingCluster(userId string, userParallelism int64, task entity.Task) error {
+func AddWaitingCluster(userId string, userParallelism int64, algorithmObjectKey string, task entity.Task) error {
 	taskCacheTemp := entity.TaskCache{
-		UserId:          userId,
-		UserParallelism: userParallelism,
-		Task:            task,
+		UserId:             userId,
+		AlgorithmObjectKey: algorithmObjectKey,
+		UserParallelism:    userParallelism,
+		Task:               task,
 	}
 	// 转 json
 	taskCacheJson, err := TaskCacheToJson(taskCacheTemp)
@@ -110,14 +112,6 @@ func AddRunningCluster(taskCache entity.TaskCache) error {
 	return nil
 }
 
-func TaskToJson(task entity.Task) (string, error) {
-	jsonData, err := json.MarshalIndent(task, "", "    ")
-	if err != nil {
-		return "", err
-	}
-	return string(jsonData), nil
-}
-
 func TaskCacheToJson(taskCache entity.TaskCache) (string, error) {
 	jsonData, err := json.MarshalIndent(taskCache, "", "    ")
 	if err != nil {

+ 4 - 3
amd64/dispatch_server/package/entity/task_cache.go

@@ -1,7 +1,8 @@
 package entity
 
 type TaskCache struct {
-	UserId          string `json:"userId"`
-	UserParallelism int64  `json:"userParallelism"`
-	Task            Task   `json:"task"`
+	UserId             string `json:"userId"`
+	UserParallelism    int64  `json:"userParallelism"`
+	AlgorithmObjectKey string `json:"algorithmObjectKey"`
+	Task               Task   `json:"task"`
 }

+ 3 - 2
amd64/dispatch_server/package/handler/start_project.go

@@ -129,19 +129,20 @@ func StartProject(c *gin.Context) {
 	userId := projectStartParam.UserId               // 用户ID
 	taskReceived := projectStartParam.Tasks          // 接收到的所有任务
 	userParallelism := projectStartParam.Parallelism // 用户的并行度上限
+	algorithmObjectKey := projectStartParam.AlgorithmObjectKey
 
 	// 1 判断用户并行度
 	for _, task := range taskReceived {
 		global.RunTaskMutex.Lock()
 		// 1 判断用户并行度是否有剩余,有剩余则加入集群等待队列,并从用户等待队列中拿出,没有剩余则不需要改动
 		if domain.CanRunUser(userId, userParallelism) { // 可以运行
-			err := domain.AddWaitingCluster(userId, userParallelism, task)
+			err := domain.AddWaitingCluster(userId, userParallelism, algorithmObjectKey, task)
 			if err != nil {
 				infra.GlobalLogger.Errorf("将任务 %v 添加到集群等待队列失败,错误信息为:%v", task, err)
 				continue
 			}
 		} else { // 不能运行
-			err := domain.AddWaitingUser(userId, userParallelism, task)
+			err := domain.AddWaitingUser(userId, userParallelism, algorithmObjectKey, task)
 			if err != nil {
 				infra.GlobalLogger.Errorf("将任务 %v 添加到集群等待队列失败,错误信息为:%v", task, err)
 				continue

+ 2 - 0
amd64/dispatch_server/package/infra/application.yaml

@@ -39,3 +39,5 @@ gpu-node-list:
 k8s:
   pod-yaml-dir: /mnt/disk001/cicv-data-closedloop/pod-yaml/
   vtd-pod-template-yaml: /mnt/disk001/cicv-data-closedloop/pod-template/vtd-pod-template.yaml
+  algorithm-tar-temp-dir: /mnt/disk001/cicv-data-closedloop/temp/algorithm/
+  registry-uri: 10.14.85.237:5000

+ 4 - 2
amd64/dispatch_server/package/infra/i_application.go

@@ -55,8 +55,10 @@ type GpuNode struct {
 }
 
 type K8sStruct struct {
-	PodYamlDir         string `yaml:"pod-yaml-dir"`
-	VtdPodTemplateYaml string `yaml:"vtd-pod-template-yaml"`
+	PodYamlDir          string `yaml:"pod-yaml-dir"`
+	VtdPodTemplateYaml  string `yaml:"vtd-pod-template-yaml"`
+	AlgorithmTarTempDir string `yaml:"algorithm-tar-temp-dir"`
+	RegistryUri         string `yaml:"registry-uri"`
 }
 
 var (

+ 5 - 5
amd64/dispatch_server/package/infra/i_oss.go

@@ -6,22 +6,22 @@ import (
 )
 
 var (
-	OssClient *oss.Client
-	OssBucket *oss.Bucket
+	GlobalOssClient *oss.Client
+	GlobalOssBucket *oss.Bucket
 )
 
 func InitOss(isUseCname bool, endpoint string, accessKeyId string, accessKeySecret string, bucketName string) {
 	var err error
 	if isUseCname {
-		OssClient, err = oss.New(endpoint, accessKeyId, accessKeySecret, oss.UseCname(true)) // 公网
+		GlobalOssClient, err = oss.New(endpoint, accessKeyId, accessKeySecret, oss.UseCname(true)) // 公网
 	} else {
-		OssClient, err = oss.New(endpoint, accessKeyId, accessKeySecret, oss.UseCname(false)) // 内网
+		GlobalOssClient, err = oss.New(endpoint, accessKeyId, accessKeySecret, oss.UseCname(false)) // 内网
 	}
 	if err != nil {
 		GlobalLogger.Error("无法创建阿里云client:", err)
 		os.Exit(-1)
 	}
-	OssBucket, err = OssClient.Bucket(bucketName)
+	GlobalOssBucket, err = GlobalOssClient.Bucket(bucketName)
 	if err != nil {
 		GlobalLogger.Error("无法创建阿里云bucket:", err)
 		os.Exit(-1)

+ 93 - 5
amd64/dispatch_server/package/service/run_task.go

@@ -6,9 +6,13 @@ import (
 	"cicv-data-closedloop/amd64/dispatch_server/package/global"
 	"cicv-data-closedloop/amd64/dispatch_server/package/infra"
 	"cicv-data-closedloop/amd64/dispatch_server/package/util"
+	"cicv-data-closedloop/common/config/c_log"
 	"encoding/json"
 	"errors"
 	"fmt"
+	"github.com/confluentinc/confluent-kafka-go/kafka"
+	"path/filepath"
+	"strconv"
 	"strings"
 	"time"
 )
@@ -39,10 +43,11 @@ func RunWaitingUser() {
 			}
 			userId := taskCache.UserId
 			userParallelism := taskCache.UserParallelism
+			algorithmObjectKey := taskCache.AlgorithmObjectKey
 			task := taskCache.Task
 			// 1 判断用户并行度是否有剩余,有剩余则加入集群等待队列,并从用户等待队列中拿出,没有剩余则不需要改动
 			if domain.CanRunUser(userId, userParallelism) { // 可以运行
-				err = domain.AddWaitingCluster(userId, userParallelism, task)
+				err = domain.AddWaitingCluster(userId, userParallelism, algorithmObjectKey, task)
 				if err != nil {
 					infra.GlobalLogger.Error(err)
 					continue
@@ -71,10 +76,10 @@ func RunWaitingCluster() {
 		}
 		var firstTaskCache entity.TaskCache
 		if can {
-			// 移除并取出
-			firstTaskCacheJson, err := infra.GlobalRedisClient.LPop(global.KeyTaskQueueWaitingCluster).Result()
+			// 取出但不移除
+			firstTaskCacheJson, err := infra.GlobalRedisClient.LIndex(global.KeyTaskQueueWaitingCluster, 0).Result()
 			if err != nil {
-				infra.GlobalLogger.Error("移除并取出集群等待队列中的头元素报错,错误信息为:", err)
+				infra.GlobalLogger.Error("取出集群等待队列中的头元素报错,错误信息为:", err)
 				continue
 			}
 			firstTaskCache, err = JsonToTaskCache(firstTaskCacheJson)
@@ -89,10 +94,78 @@ func RunWaitingCluster() {
 			}
 		}
 		global.GpuNodeListMutex.Unlock()
+		// 获取项目ID
+		projectId := firstTaskCache.Task.Info.ProjectId
+		offsetKey := "offset:" + projectId
+		offset := 0
+		// 根据项目ID获取偏移量
+		val, err := infra.GlobalRedisClient.Get(offsetKey).Result()
+		if err != nil {
+			infra.GlobalLogger.Infof("偏移量键 %v 不存在,初始化设置为 0。", offsetKey)
+			err = infra.GlobalRedisClient.Set(offsetKey, 0, 0).Err()
+			if err != nil {
+				infra.GlobalLogger.Infof("偏移量键值对 %v 初始化失败,错误信息为: %v", offsetKey, err)
+				continue
+			}
+		} else {
+			offset, err = strconv.Atoi(val)
+			if err != nil {
+				infra.GlobalLogger.Infof("字符串 %v 转整数失败,错误信息为: %v", val, err)
+				continue
+			}
+		}
+		// 取出偏移量后将缓存中的加一,给下个任务使用。
+		_, err = infra.GlobalRedisClient.Incr(offsetKey).Result()
+		if err != nil {
+			infra.GlobalLogger.Infof("偏移量 %v 加一失败,错误信息为: %v", offsetKey, err)
+			continue
+		}
+
 		// --------------- 发送 kafka 消息(获取偏移量和分区) ---------------
+		// 获取任务消息转json
+		taskJson, err := TaskToJson(firstTaskCache.Task)
+		if err != nil {
+			infra.GlobalLogger.Error(err)
+			continue
+		}
+		topic := projectId
+		value := []byte(taskJson)
 
+		// 创建一个Message,并指定分区为0
+		msg := &kafka.Message{
+			TopicPartition: kafka.TopicPartition{Topic: &topic, Partition: 0, Offset: kafka.Offset(offset)},
+			Value:          value,
+		}
+		// 发送消息,并处理结果
+		err = infra.GlobalKafkaProducer.Produce(msg, nil)
+		if err != nil {
+			infra.GlobalLogger.Infof("发送任务消息 %v 失败,错误信息为: %v", msg, err)
+			continue
+		}
+		// --------------- 下载算法 ---------------
+		algorithmTarName := filepath.Base(firstTaskCache.AlgorithmObjectKey)
+		algorithmTarPath := infra.ApplicationYaml.K8s.AlgorithmTarTempDir + algorithmTarName
+		algorithmImageName := infra.ApplicationYaml.K8s.RegistryUri + "/cicvdcl_" + util.MD5HashShort(algorithmTarName)
+		err = infra.GlobalOssBucket.GetObjectToFile(firstTaskCache.AlgorithmObjectKey, algorithmTarPath)
+		if err != nil {
+			c_log.GlobalLogger.Error("下载oss上的算法镜像 "+firstTaskCache.AlgorithmObjectKey+" 失败,错误信息为:", err)
+			time.Sleep(time.Duration(2) * time.Second)
+			continue
+		}
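+		// Import the algorithm tar as a Docker image, then push it. NOTE(review): the import's err is overwritten by the push call before it is checked.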
+		_, s, err := util.Execute("docker", "import", algorithmTarPath, algorithmImageName)
+		_, s, err = util.Execute("docker", "push", algorithmImageName)
+		if err != nil {
+			c_log.GlobalLogger.Errorf("导入算法镜像 %v 为 %v 失败,执行结果为:%v,错误信息为:%v", algorithmTarPath, algorithmImageName, s, err)
+			time.Sleep(time.Duration(2) * time.Second)
+			continue
+		}
+		c_log.GlobalLogger.Infof("导入算法镜像 %v 为 %v 成功,执行结果为:%v", algorithmTarPath, algorithmImageName, s)
+		err = util.RemoveFile(algorithmTarPath)
+		if err != nil {
+			c_log.GlobalLogger.Errorf("删除算法镜像文件 %v 失败,错误信息为:%v", algorithmTarPath, err)
+		}
 		// --------------- 启动 k8s pod ---------------
-		projectId := firstTaskCache.Task.Info.ProjectId
 		nodeName := gpuNode.Hostname
 		// 1 生成 podName
 		podName := "project-" + projectId + "-" + util.NewShortUUID()
@@ -117,6 +190,13 @@ func RunWaitingCluster() {
 		podString = strings.Replace(podString, "kafka-partition", projectId, 1)
 		podString = strings.Replace(podString, "kafka-offset", projectId, 1)
 		// todo cpu编号是剩余并行度减一
+
+		// --------------- 移除头元素
+		_, err = infra.GlobalRedisClient.LPop(global.KeyTaskQueueWaitingCluster).Result()
+		if err != nil {
+			infra.GlobalLogger.Error("取出集群等待队列中的头元素报错,错误信息为:", err)
+			continue
+		}
 	}
 }
 
@@ -131,3 +211,11 @@ func JsonToTaskCache(jsonData string) (entity.TaskCache, error) {
 	}
 	return taskCache, nil
 }
+
+func TaskToJson(task entity.Task) (string, error) {
+	jsonData, err := json.MarshalIndent(task, "", "    ")
+	if err != nil {
+		return "", errors.New("转json失败,错误信息为:" + err.Error())
+	}
+	return string(jsonData), nil
+}

+ 37 - 0
amd64/dispatch_server/package/util/u_crypto.go

@@ -0,0 +1,37 @@
+package util
+
+import (
+	"crypto/md5"
+	"encoding/hex"
+	"io"
+)
+
+// MD5Hash 函数接收一个字符串作为输入,返回该字符串的MD5哈希值(十六进制格式)
+func MD5Hash(text string) string {
+	// 创建一个新的hash.Hash接口来写入数据
+	hasher := md5.New()
+
+	// 写入需要哈希的数据
+	io.WriteString(hasher, text)
+
+	// 计算哈希值的Sum(字节切片)
+	sum := hasher.Sum(nil)
+
+	// 将字节切片转换为十六进制字符串
+	return hex.EncodeToString(sum)
+}
+
+// MD5HashShort takes a string and returns the first 8 hexadecimal characters of its MD5 hash.
+func MD5HashShort(text string) string {
+	// 创建一个新的hash.Hash接口来写入数据
+	hasher := md5.New()
+
+	// 写入需要哈希的数据
+	io.WriteString(hasher, text)
+
+	// 计算哈希值的Sum(字节切片)
+	sum := hasher.Sum(nil)
+
+	// 将字节切片转换为十六进制字符串
+	return hex.EncodeToString(sum)[0:8]
+}

+ 16 - 0
amd64/dispatch_server/package/util/u_exec.go

@@ -0,0 +1,16 @@
+package util
+
+import (
+	"cicv-data-closedloop/common/config/c_log"
+	"os/exec"
+)
+
+func Execute(name string, arg ...string) (*exec.Cmd, string, error) {
+	cmd := exec.Command(name, arg...)
+	combinedOutput, err := cmd.CombinedOutput()
+	if err != nil {
+		c_log.GlobalLogger.Info("命令", name, " ", arg, "执行出错:", err)
+		return nil, string(combinedOutput), err
+	}
+	return cmd, string(combinedOutput), nil
+}

+ 12 - 0
amd64/dispatch_server/package/util/u_file.go

@@ -20,3 +20,15 @@ func ReadFile(filePath string) (string, error) {
 	}
 	return string(content), err
 }
+
+func RemoveFile(path string) error {
+	// 检查文件是否存在
+	if _, err := os.Stat(path); err == nil {
+		// 文件存在,执行删除操作
+		err = os.Remove(path)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}

+ 29 - 0
amd64/dispatch_server/test/test.go

@@ -0,0 +1,29 @@
+package main
+
+import (
+	"crypto/md5"
+	"encoding/hex"
+	"fmt"
+	"io"
+)
+
+// MD5Hash 函数接收一个字符串作为输入,返回该字符串的MD5哈希值(十六进制格式)
+func MD5Hash(text string) string {
+	// 创建一个新的hash.Hash接口来写入数据
+	hasher := md5.New()
+
+	// 写入需要哈希的数据
+	io.WriteString(hasher, text)
+
+	// 计算哈希值的Sum(字节切片)
+	sum := hasher.Sum(nil)
+
+	// 将字节切片转换为十六进制字符串
+	return hex.EncodeToString(sum)
+}
+
+func main() {
+	text := "算法比赛.tar"
+	hash := MD5Hash(text)
+	fmt.Printf("The MD5 hash of '%s' is: %s\n", text, hash[0:8])
+}

+ 66 - 0
amd64/dispatch_server/vtd-pod-template_20240506.yaml

@@ -0,0 +1,66 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: pod-name
+  namespace: namespace-name
+  labels:
+    user: CICV
+spec:
+  nodeName: node-name
+  dnsPolicy: None
+  dnsConfig:
+    nameservers:
+      - 10.16.11.1
+      - 10.16.11.2
+  hostAliases:
+    - ip: 10.14.85.239
+      hostnames:
+        - simulation004
+    - ip: 10.14.85.237
+      hostnames:
+        - gpu001
+  initContainers:
+    - name: init
+      image: algorithm-image
+      imagePullPolicy: Always
+      command: ['sh', '-c', 'echo algorithm image downloaded && sleep 2']
+  containers:
+    - name: vtd-container
+      image: vtd-image
+      imagePullPolicy: Always
+      command: [ "/Controller/VTDController", "vtd-command", "kafka-topic" ]
+      resources:
+        limits:
+          nvidia.com/gpu: 1
+      env:
+        - name: PodName
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: LM_LICENSE_FILE
+          value: 27500@10.14.85.247
+        - name: SIMULATION_CLOUD_IP
+          value: simulation-cloud-ip
+        - name: KAFKA_IP
+          value: kafka-ip
+        - name: OSS_TYPE
+          value: oss-type
+        - name: OSS_IP
+          value: oss-ip
+        - name: OSS_ACCESS_KEY
+          value: oss-access-key
+        - name: OSS_SECRET_KEY
+          value: oss-secret-key
+        - name: OSS_BUCKET_NAME
+          value: oss-bucket
+        - name: KAFKA_PARTITION
+          value: kafka-partition
+        - name: KAFKA_OFFSET
+          value: kafka-offset
+        - name: CPU_ORDER
+          value: cpu-order
+    - name: algorithm-container
+      image: algorithm-image
+      imagePullPolicy: Never
+      command: [ "/bin/sh", "-c", "/run.sh; touch /tmp/hello.txt;while true;do /bin/echo $(date +%T) >> /tmp/hello.txt; sleep 600; done;" ]
+  restartPolicy: Never

+ 19 - 0
pjisuv_msgs/control_msgs.go

@@ -0,0 +1,19 @@
+package pjisuv_msgs
+
+import (
+	"github.com/bluenviron/goroslib/v2/pkg/msg"
+	"github.com/bluenviron/goroslib/v2/pkg/msgs/std_msgs"
+)
+
+type VehicleFdb struct {
+	msg.Package         `ros:"control_msgs"`
+	Header              std_msgs.Header
+	CurGear             float64
+	SteeringWheelAngle  float64
+	SteeringWheelRotDir float64
+	SteeringWheelRotSpd float64
+	AccPed1             float64
+	AccPed2             float64
+	Automode            int16 `rosname:"Automode"`
+	FlagCanReadSuccess  int16
+}