run_task.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. package service
  2. import (
  3. "cicv-data-closedloop/amd64/dispatch_server/package/domain"
  4. "cicv-data-closedloop/amd64/dispatch_server/package/entity"
  5. "cicv-data-closedloop/amd64/dispatch_server/package/global"
  6. "cicv-data-closedloop/amd64/dispatch_server/package/infra"
  7. "cicv-data-closedloop/amd64/dispatch_server/package/util"
  8. "encoding/json"
  9. "errors"
  10. "fmt"
  11. "github.com/confluentinc/confluent-kafka-go/kafka"
  12. "path/filepath"
  13. "strconv"
  14. "strings"
  15. "time"
  16. )
  17. /*
  18. 负责处理用户等待队列中的任务
  19. 负责运行集群等待队列中的任务
  20. */
  21. // 判断用户等待队列中的任务是否可以加入到集群等待队列
  22. func RunWaitingUser() {
  23. infra.GlobalLogger.Infof("启动【用户等待队列】监控进程。")
  24. for {
  25. time.Sleep(2 * time.Second)
  26. global.RunTaskMutex.Lock()
  27. // 获取Redis列表中的值
  28. taskCacheJsons, err := infra.GlobalRedisClient.LRange(global.KeyTaskQueueWaitingUser, 0, -1).Result()
  29. if err != nil {
  30. infra.GlobalLogger.Errorf("遍历用户等待队列 %v 失败,错误信息为: %v", global.KeyTaskQueueWaitingUser, err)
  31. continue
  32. }
  33. for _, taskCacheJson := range taskCacheJsons {
  34. taskCache, err := JsonToTaskCache(taskCacheJson)
  35. if err != nil {
  36. infra.GlobalLogger.Error(err)
  37. continue
  38. }
  39. userId := taskCache.UserId
  40. userParallelism := taskCache.UserParallelism
  41. algorithmObjectKey := taskCache.AlgorithmObjectKey
  42. task := taskCache.Task
  43. // 1 判断用户并行度是否有剩余,有剩余则加入集群等待队列,并从用户等待队列中拿出,没有剩余则不需要改动
  44. if domain.CanRunUser(userId, userParallelism) { // 可以运行
  45. err = domain.AddWaitingCluster(userId, userParallelism, algorithmObjectKey, task)
  46. if err != nil {
  47. infra.GlobalLogger.Error(err)
  48. continue
  49. }
  50. err = domain.DeleteWaitingUser(task.Info.TaskId)
  51. if err != nil {
  52. infra.GlobalLogger.Error(err)
  53. continue
  54. }
  55. }
  56. }
  57. global.RunTaskMutex.Unlock()
  58. }
  59. }
  60. // 集群等待队列中的任务判断是否可以加入集群运行队列
  61. func RunWaitingCluster() {
  62. infra.GlobalLogger.Infof("启动【集群等待队列】监控进程。")
  63. for {
  64. var algorithmTarName string
  65. var algorithmTarPath string
  66. var algorithmImageName string
  67. time.Sleep(2 * time.Second)
  68. global.GpuNodeListMutex.Lock()
  69. // 1 判断用户并行度是否有剩余,有剩余则从集群等待队列取出第一个加入集群运行队列,并运行pod,没有剩余则不需要改动
  70. can, gpuNode, err := domain.CanRunCluster()
  71. if err != nil {
  72. infra.GlobalLogger.Error(err)
  73. global.GpuNodeListMutex.Unlock()
  74. continue
  75. }
  76. var firstTaskCache entity.TaskCache
  77. if can {
  78. //infra.GlobalLogger.Infof("节点 %v 有剩余并行度。", gpuNode)
  79. // 判断是否有待运行的任务
  80. waitingClusterNumber, _ := infra.GlobalRedisClient.LLen(global.KeyTaskQueueWaitingCluster).Result()
  81. if waitingClusterNumber == 0 {
  82. //infra.GlobalLogger.Info("集群没有等待运行的任务。")
  83. global.GpuNodeListMutex.Unlock()
  84. continue
  85. } else {
  86. infra.GlobalLogger.Infof("集群存在 %v 个等待运行的任务。", waitingClusterNumber)
  87. }
  88. // 取出但不移除
  89. {
  90. firstTaskCacheJson, err := infra.GlobalRedisClient.LIndex(global.KeyTaskQueueWaitingCluster, 0).Result()
  91. if err != nil {
  92. infra.GlobalLogger.Error("取出集群等待队列中的头元素报错,错误信息为:", err)
  93. global.GpuNodeListMutex.Unlock()
  94. continue
  95. }
  96. firstTaskCache, err = JsonToTaskCache(firstTaskCacheJson)
  97. if err != nil {
  98. infra.GlobalLogger.Error(err)
  99. global.GpuNodeListMutex.Unlock()
  100. continue
  101. }
  102. }
  103. // --------------- 下载算法 ---------------
  104. {
  105. infra.GlobalLogger.Infof("开始下载算法 %v。", firstTaskCache.AlgorithmObjectKey)
  106. algorithmTarName = filepath.Base(firstTaskCache.AlgorithmObjectKey)
  107. algorithmTarPath = infra.ApplicationYaml.K8s.AlgorithmTarTempDir + util.NewShortUUID() + "/" + algorithmTarName
  108. _ = util.CreateParentDir(algorithmTarPath)
  109. algorithmImageName = infra.ApplicationYaml.K8s.RegistryUri + "/cicvdcl_" + util.MD5HashShort(algorithmTarName)
  110. _ = infra.GlobalOssBucket.GetObjectToFile(firstTaskCache.AlgorithmObjectKey, algorithmTarPath)
  111. if err != nil {
  112. infra.GlobalLogger.Error("下载oss上的算法镜像 "+firstTaskCache.AlgorithmObjectKey+" 失败,错误信息为:", err)
  113. time.Sleep(time.Duration(2) * time.Second)
  114. global.GpuNodeListMutex.Unlock()
  115. continue
  116. }
  117. infra.GlobalLogger.Infof("下载算法 %v 成功。", firstTaskCache.AlgorithmObjectKey)
  118. }
  119. } else {
  120. infra.GlobalLogger.Infof("集群没有剩余并行度。")
  121. global.GpuNodeListMutex.Unlock()
  122. continue
  123. }
  124. global.GpuNodeListMutex.Unlock()
  125. // 获取项目ID
  126. projectId := firstTaskCache.Task.Info.ProjectId
  127. offsetKey := "offset:" + projectId
  128. offset := 0
  129. // 根据项目ID获取偏移量
  130. val, err := infra.GlobalRedisClient.Get(offsetKey).Result()
  131. if err != nil {
  132. infra.GlobalLogger.Infof("偏移量键 %v 不存在,初始化设置为 0。", offsetKey)
  133. err = infra.GlobalRedisClient.Set(offsetKey, 0, 0).Err()
  134. if err != nil {
  135. infra.GlobalLogger.Infof("偏移量键值对 %v 初始化失败,错误信息为: %v", offsetKey, err)
  136. continue
  137. }
  138. } else {
  139. offset, err = strconv.Atoi(val)
  140. if err != nil {
  141. infra.GlobalLogger.Infof("字符串 %v 转整数失败,错误信息为: %v", val, err)
  142. continue
  143. }
  144. infra.GlobalLogger.Infof("当前任务使用偏移量【%v】", offset)
  145. }
  146. // 取出偏移量后将缓存中的加一,给下个任务使用。
  147. _, err = infra.GlobalRedisClient.Incr(offsetKey).Result()
  148. if err != nil {
  149. infra.GlobalLogger.Infof("偏移量 %v 加一失败,错误信息为: %v", offsetKey, err)
  150. continue
  151. }
  152. infra.GlobalLogger.Infof("偏移量【%v】加一给下个任务使用。", offsetKey)
  153. // --------------- 发送 kafka 消息(获取偏移量和分区) ---------------
  154. // 获取任务消息转json
  155. taskJson, err := TaskToJson(firstTaskCache.Task)
  156. if err != nil {
  157. infra.GlobalLogger.Error(err)
  158. continue
  159. }
  160. topic := projectId
  161. // 创建一个Message,并指定分区为0
  162. msg := &kafka.Message{
  163. TopicPartition: kafka.TopicPartition{Topic: &topic, Partition: infra.ApplicationYaml.Kafka.Partition, Offset: kafka.Offset(offset)},
  164. Value: []byte(taskJson),
  165. }
  166. // 发送消息,并处理结果
  167. err = infra.GlobalKafkaProducer.Produce(msg, nil)
  168. if err != nil {
  169. infra.GlobalLogger.Infof("发送任务消息 %v 失败,错误信息为: %v", msg, err)
  170. continue
  171. }
  172. infra.GlobalLogger.Infof("发送任务消息成功,话题为【%v】,偏移量为【%v】。", topic, offset)
  173. // 导入算法
  174. _, s, err := util.Execute("docker", "import", algorithmTarPath, algorithmImageName)
  175. _, s, err = util.Execute("docker", "push", algorithmImageName)
  176. if err != nil {
  177. infra.GlobalLogger.Errorf("导入算法镜像 %v 为 %v 失败,执行结果为:%v,错误信息为:%v", algorithmTarPath, algorithmImageName, s, err)
  178. time.Sleep(time.Duration(2) * time.Second)
  179. continue
  180. }
  181. infra.GlobalLogger.Infof("导入算法镜像 %v 为 %v 成功,执行结果为:%v", algorithmTarPath, algorithmImageName, s)
  182. err = util.RemoveFile(algorithmTarPath)
  183. if err != nil {
  184. infra.GlobalLogger.Errorf("删除算法镜像文件 %v 失败,错误信息为:%v", algorithmTarPath, err)
  185. }
  186. // --------------- 启动 k8s pod ---------------
  187. podName := "project-" + projectId + "-" + util.NewShortUUID()
  188. namespaceName := infra.ApplicationYaml.K8s.NamespaceName
  189. nodeName := gpuNode.Hostname
  190. restParallelism := gpuNode.Parallelism
  191. vtdContainer := "vtd-" + projectId
  192. algorithmContainer := "algorithm-" + projectId
  193. vtdImage := infra.ApplicationYaml.K8s.VtdImage
  194. // 2 生成模板文件名称
  195. podYaml := nodeName + "#" + podName + ".yaml"
  196. // 3 模板yaml存储路径
  197. yamlPath := infra.ApplicationYaml.K8s.PodYamlDir + podYaml
  198. // 4 模板yaml备份路径
  199. yamlPathBak := infra.ApplicationYaml.K8s.PodYamlDir + "bak/" + podYaml
  200. fmt.Println(yamlPath, yamlPathBak)
  201. // 5
  202. podString, err := util.ReadFile(infra.ApplicationYaml.K8s.VtdPodTemplateYaml)
  203. if err != nil {
  204. infra.GlobalLogger.Error(err)
  205. continue
  206. }
  207. podString = strings.Replace(podString, "pod-name", podName, -1)
  208. podString = strings.Replace(podString, "namespace-name", namespaceName, -1)
  209. podString = strings.Replace(podString, "node-name", nodeName, -1)
  210. podString = strings.Replace(podString, "algorithm-image", algorithmImageName, -1)
  211. podString = strings.Replace(podString, "vtd-container", vtdContainer, -1)
  212. podString = strings.Replace(podString, "vtd-image", vtdImage, -1)
  213. podString = strings.Replace(podString, "vtd-command", infra.ApplicationYaml.K8s.VtdCommand, -1)
  214. podString = strings.Replace(podString, "platform-ip", infra.ApplicationYaml.Web.IpPrivate, -1)
  215. podString = strings.Replace(podString, "oss-type", infra.ApplicationYaml.Oss.Type, -1)
  216. podString = strings.Replace(podString, "oss-ip", infra.ApplicationYaml.Oss.Endpoint, -1) // 不带http://前缀
  217. podString = strings.Replace(podString, "oss-access-key", infra.ApplicationYaml.Oss.AccessKeyId, -1)
  218. podString = strings.Replace(podString, "oss-secret-key", infra.ApplicationYaml.Oss.AccessKeySecret, -1)
  219. podString = strings.Replace(podString, "oss-bucket", infra.ApplicationYaml.Oss.BucketName, -1)
  220. podString = strings.Replace(podString, "kafka-ip", infra.ApplicationYaml.Kafka.Broker, -1)
  221. podString = strings.Replace(podString, "kafka-topic", projectId, -1)
  222. podString = strings.Replace(podString, "kafka-partition", "\""+util.ToString(infra.ApplicationYaml.Kafka.Partition)+"\"", -1)
  223. podString = strings.Replace(podString, "kafka-offset", "\""+util.ToString(offset)+"\"", -1)
  224. podString = strings.Replace(podString, "cpu-order", "\""+util.ToString(restParallelism-1)+"\"", -1) // cpu编号是剩余并行度-1
  225. podString = strings.Replace(podString, "algorithm-container", algorithmContainer, -1)
  226. // --------------- 保存成文件
  227. err = util.WriteFile(podString, yamlPath)
  228. err = util.WriteFile(podString, yamlPathBak)
  229. if err != nil {
  230. infra.GlobalLogger.Error("保存yaml字符串失败,错误信息为", err)
  231. continue
  232. }
  233. infra.GlobalLogger.Infof("保存yaml文件到执行路径【%v】和备份路径【%v】", yamlPath, yamlPathBak)
  234. // --------------- 启动 pod
  235. _, s2, err := util.Execute("kubectl", "apply", "-f", yamlPath)
  236. if err != nil {
  237. infra.GlobalLogger.Errorf("启动pod失败,执行结果为 %v,错误信息为 %v", s2, err)
  238. continue
  239. }
  240. infra.GlobalLogger.Errorf("启动pod成功,执行结果为 %v。", s2)
  241. // 收尾
  242. {
  243. // --------------- 添加到运行队列
  244. err = domain.AddRunningCluster(firstTaskCache, gpuNode.Hostname)
  245. if err != nil {
  246. infra.GlobalLogger.Error(err)
  247. global.GpuNodeListMutex.Unlock()
  248. continue
  249. }
  250. // --------------- 从等待队列中移除
  251. _, err = infra.GlobalRedisClient.LPop(global.KeyTaskQueueWaitingCluster).Result()
  252. if err != nil {
  253. infra.GlobalLogger.Error("取出集群等待队列中的头元素报错,错误信息为:", err)
  254. continue
  255. }
  256. // --------------- 删除镜像文件
  257. _ = util.RemoveFile(algorithmTarPath)
  258. }
  259. }
  260. }
  261. func JsonToTaskCache(jsonData string) (entity.TaskCache, error) {
  262. // 创建一个 Person 类型的变量
  263. var taskCache entity.TaskCache
  264. // 使用 json.Unmarshal 解析 JSON 字符串到结构体
  265. err := json.Unmarshal([]byte(jsonData), &taskCache)
  266. if err != nil {
  267. return entity.TaskCache{}, errors.New("对象json " + jsonData + " 转对象失败错误信息为: " + fmt.Sprintf("%v", err))
  268. }
  269. return taskCache, nil
  270. }
  271. func TaskToJson(task entity.Task) (string, error) {
  272. jsonData, err := json.MarshalIndent(task, "", " ")
  273. if err != nil {
  274. return "", errors.New("转json失败,错误信息为:" + err.Error())
  275. }
  276. return string(jsonData), nil
  277. }