|
@@ -129,12 +129,10 @@ func RunWaitingCluster() {
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
topic := projectId
|
|
topic := projectId
|
|
- value := []byte(taskJson)
|
|
|
|
-
|
|
|
|
// 创建一个Message,并指定分区为0
|
|
// 创建一个Message,并指定分区为0
|
|
msg := &kafka.Message{
|
|
msg := &kafka.Message{
|
|
- TopicPartition: kafka.TopicPartition{Topic: &topic, Partition: 0, Offset: kafka.Offset(offset)},
|
|
|
|
- Value: value,
|
|
|
|
|
|
+ TopicPartition: kafka.TopicPartition{Topic: &topic, Partition: infra.ApplicationYaml.Kafka.Partition, Offset: kafka.Offset(offset)},
|
|
|
|
+ Value: []byte(taskJson),
|
|
}
|
|
}
|
|
// 发送消息,并处理结果
|
|
// 发送消息,并处理结果
|
|
err = infra.GlobalKafkaProducer.Produce(msg, nil)
|
|
err = infra.GlobalKafkaProducer.Produce(msg, nil)
|
|
@@ -166,9 +164,13 @@ func RunWaitingCluster() {
|
|
c_log.GlobalLogger.Errorf("删除算法镜像文件 %v 失败,错误信息为:%v", algorithmTarPath, err)
|
|
c_log.GlobalLogger.Errorf("删除算法镜像文件 %v 失败,错误信息为:%v", algorithmTarPath, err)
|
|
}
|
|
}
|
|
// --------------- 启动 k8s pod ---------------
|
|
// --------------- 启动 k8s pod ---------------
|
|
- nodeName := gpuNode.Hostname
|
|
|
|
- // 1 生成 podName
|
|
|
|
podName := "project-" + projectId + "-" + util.NewShortUUID()
|
|
podName := "project-" + projectId + "-" + util.NewShortUUID()
|
|
|
|
+ namespaceName := infra.ApplicationYaml.K8s.NamespaceName
|
|
|
|
+ nodeName := gpuNode.Hostname
|
|
|
|
+ restParallelism := gpuNode.Parallelism
|
|
|
|
+ vtdContainer := "vtd-" + projectId
|
|
|
|
+ algorithmContainer := "algorithm-" + projectId
|
|
|
|
+ vtdImage := infra.ApplicationYaml.K8s.VtdImage
|
|
// 2 生成模板文件名称
|
|
// 2 生成模板文件名称
|
|
podYaml := nodeName + "#" + podName + ".yaml"
|
|
podYaml := nodeName + "#" + podName + ".yaml"
|
|
// 3 模板yaml存储路径
|
|
// 3 模板yaml存储路径
|
|
@@ -182,15 +184,38 @@ func RunWaitingCluster() {
|
|
infra.GlobalLogger.Error(err)
|
|
infra.GlobalLogger.Error(err)
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
- podString = strings.Replace(podString, "vtd-container-name", "vtd-"+projectId+"-"+nodeName, 1)
|
|
|
|
- podString = strings.Replace(podString, "cicv-data-closedloop-ip", infra.ApplicationYaml.Web.IpPrivate, 1)
|
|
|
|
- podString = strings.Replace(podString, "kafka-ip", infra.ApplicationYaml.Kafka.Brokers[0], 1)
|
|
|
|
- podString = strings.Replace(podString, "kafka-topic", projectId, 1)
|
|
|
|
- // 发送消息之后会拿到消息的分区和偏移量
|
|
|
|
- podString = strings.Replace(podString, "kafka-partition", projectId, 1)
|
|
|
|
- podString = strings.Replace(podString, "kafka-offset", projectId, 1)
|
|
|
|
- // todo cpu编号是剩余并行度减一
|
|
|
|
|
|
+ podString = strings.Replace(podString, "pod-name", podName, -1)
|
|
|
|
+ podString = strings.Replace(podString, "namespace-name", namespaceName, -1)
|
|
|
|
+ podString = strings.Replace(podString, "node-name", nodeName, -1)
|
|
|
|
+ podString = strings.Replace(podString, "algorithm-image", algorithmImageName, -1)
|
|
|
|
+ podString = strings.Replace(podString, "vtd-container", vtdContainer, -1)
|
|
|
|
+ podString = strings.Replace(podString, "vtd-image", vtdImage, -1)
|
|
|
|
+ podString = strings.Replace(podString, "vtd-command", infra.ApplicationYaml.K8s.VtdCommand, -1)
|
|
|
|
+ podString = strings.Replace(podString, "platform-ip", infra.ApplicationYaml.Web.IpPrivate, -1)
|
|
|
|
+ podString = strings.Replace(podString, "oss-type", infra.ApplicationYaml.Oss.Type, -1)
|
|
|
|
+ podString = strings.Replace(podString, "oss-ip", infra.ApplicationYaml.Oss.Endpoint, -1) // 不带http://前缀
|
|
|
|
+ podString = strings.Replace(podString, "oss-access-key", infra.ApplicationYaml.Oss.AccessKeyId, -1)
|
|
|
|
+ podString = strings.Replace(podString, "oss-secret-key", infra.ApplicationYaml.Oss.AccessKeySecret, -1)
|
|
|
|
+ podString = strings.Replace(podString, "kafka-ip", infra.ApplicationYaml.Kafka.Brokers[0], -1)
|
|
|
|
+ podString = strings.Replace(podString, "kafka-topic", projectId, -1)
|
|
|
|
+ podString = strings.Replace(podString, "kafka-partition", "\""+util.ToString(infra.ApplicationYaml.Kafka.Partition)+"\"", -1)
|
|
|
|
+ podString = strings.Replace(podString, "kafka-offset", "\""+util.ToString(offset)+"\"", -1)
|
|
|
|
+ podString = strings.Replace(podString, "cpu-order", "\""+util.ToString(restParallelism-1)+"\"", -1) // cpu编号是剩余并行度-1
|
|
|
|
+ podString = strings.Replace(podString, "algorithm-container", algorithmContainer, -1)
|
|
|
|
|
|
|
|
+ // --------------- 保存成文件
|
|
|
|
+ err = util.WriteFile(podString, yamlPath)
|
|
|
|
+ err = util.WriteFile(podString, yamlPathBak)
|
|
|
|
+ if err != nil {
|
|
|
|
+ infra.GlobalLogger.Error("保存yaml字符串失败,错误信息为", err)
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
|
|
+ // --------------- 启动 pod
|
|
|
|
+ _, s2, err := util.Execute("kubectl", "apply", "-f", yamlPath)
|
|
|
|
+ if err != nil {
|
|
|
|
+ infra.GlobalLogger.Errorf("保存yaml字符串失败,执行结果为 %v,错误信息为 %v", s2, err)
|
|
|
|
+ continue
|
|
|
|
+ }
|
|
// --------------- 移除头元素
|
|
// --------------- 移除头元素
|
|
_, err = infra.GlobalRedisClient.LPop(global.KeyTaskQueueWaitingCluster).Result()
|
|
_, err = infra.GlobalRedisClient.LPop(global.KeyTaskQueueWaitingCluster).Result()
|
|
if err != nil {
|
|
if err != nil {
|