@@ -42,6 +42,9 @@ type Job struct {
 	ptsLock sync.RWMutex
 	handlerTable map[string]func(ctx context.Context, task *Task, technologyName string) error
 	handlerIntervalTable map[string]time.Duration
+	jobProjectTotalKey string
+	jobNodesCountKey string
+	jobNodesHeartbeatKey string
 }
 
 func NewJob(id uint32, name string, svc *svc.ServiceContext, handlerTable map[string]func(ctx context.Context, task *Task, technologyName string) error, handlerIntervalTable map[string]time.Duration) *Job {
@@ -56,6 +59,9 @@ func NewJob(id uint32, name string, svc *svc.ServiceContext, handlerTable map[st
 		pts: make(map[uint32]*Task),
 		handlerTable: handlerTable,
 		handlerIntervalTable: handlerIntervalTable,
+		jobProjectTotalKey: fmt.Sprintf("job:project:total:%s", name),
+		jobNodesCountKey: fmt.Sprintf("job:nodes:count:%s", name),
+		jobNodesHeartbeatKey: fmt.Sprintf("job:nodes:heartbeat:%s", name),
 	}
 }
 
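For reference, the constructor above derives three per-job Redis keys from the job name. A minimal illustration of the resulting key layout (the job name "simulation" is hypothetical, used only for the example):

```go
package main

import "fmt"

func main() {
	// Hypothetical job name, used only to show the key layout.
	name := "simulation"

	fmt.Println(fmt.Sprintf("job:project:total:%s", name))   // pub/sub channel carrying the current project total
	fmt.Println(fmt.Sprintf("job:nodes:count:%s", name))     // counter each node INCRs to claim a position
	fmt.Println(fmt.Sprintf("job:nodes:heartbeat:%s", name)) // hash mapping node position -> last heartbeat (unix seconds)
}
```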
@@ -159,12 +165,8 @@ func (j *Job) ClearTask() {
 	}
 }
 
-func (j *Job) setPartition(partition Partition) {
-	j.Partition = partition
-}
-
 func (p *Partition) partitionLimit() int64 {
-	if p.N == 0 {
+	if p.N == 0 || p.Total < p.N {
 		return 1
 	}
 	total := p.Total
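The widened guard (`p.N == 0 || p.Total < p.N`) keeps a node from computing a zero-sized partition when there are more nodes than projects. The tail of `partitionLimit` is outside this hunk, so the sketch below only assumes it splits `Total` roughly evenly across `N` nodes:

```go
package job

// Sketch only: the division and rounding here are an assumption about the
// unchanged remainder of partitionLimit, which this diff does not show.
func partitionLimitSketch(total, n int64) int64 {
	if n == 0 || total < n {
		// no node count yet, or more nodes than projects: take at most one project
		return 1
	}
	limit := total / n
	if total%n != 0 {
		limit++ // round up so the trailing remainder is still covered
	}
	return limit
}
```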
@@ -175,25 +177,25 @@ func (p *Partition) partitionLimit() int64 {
 }
 
 func (j *Job) syncPartition() {
-	tk := time.NewTicker(60 * time.Second)
+	tk := time.NewTicker(30 * time.Second)
 
 	go func() {
 		for {
-			message, err := j.syncClient.Subscribe(j.Ctx, "simulation-job-"+j.Name).ReceiveMessage(j.Ctx)
+			message, err := j.syncClient.Subscribe(j.Ctx, j.jobProjectTotalKey).ReceiveMessage(j.Ctx)
 			if err != nil {
 				time.Sleep(5 * time.Second)
 				continue
 			}
 
 			if total, err := strconv.ParseInt(message.Payload, 10, 64); err == nil {
-				ic := j.syncClient.Incr(j.Ctx, "simulation-job-n-"+j.Name)
+				ic := j.syncClient.Incr(j.Ctx, j.jobNodesCountKey)
 				j.Partition.Total = total
 				j.Partition.Pn = ic.Val()
 				j.Partition.N = 1
 
 				time.Sleep(3 * time.Second)
-				n := j.syncClient.Get(j.Ctx, "simulation-job-n-"+j.Name)
-				j.syncClient.Expire(j.Ctx, "simulation-job-n-"+j.Name, 10*time.Second)
+				n := j.syncClient.Get(j.Ctx, j.jobNodesCountKey)
+				j.syncClient.Expire(j.Ctx, j.jobNodesCountKey, 10*time.Second)
 
 				if n, err := n.Int64(); err == nil {
 					j.Partition.N = n
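The subscriber goroutine above is the node-count handshake: when a new project total arrives on the `job:project:total:<name>` channel, every node INCRs `job:nodes:count:<name>` to claim a 1-based position (`Pn`), waits a few seconds for its peers, then reads the counter back as the node count (`N`); the 10-second expiry lets the counter reset before the next round. A standalone sketch of that sequence, assuming `syncClient` is a go-redis v9 client (the client setup and job name below are placeholders):

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/redis/go-redis/v9"
)

func main() {
	ctx := context.Background()
	client := redis.NewClient(&redis.Options{Addr: "localhost:6379"}) // placeholder address
	jobName := "simulation"                                           // hypothetical job name

	totalChannel := fmt.Sprintf("job:project:total:%s", jobName)
	countKey := fmt.Sprintf("job:nodes:count:%s", jobName)

	// Block until some node publishes a new project total.
	msg, err := client.Subscribe(ctx, totalChannel).ReceiveMessage(ctx)
	if err != nil {
		panic(err)
	}

	pos := client.Incr(ctx, countKey).Val() // claim a 1-based position among the nodes
	time.Sleep(3 * time.Second)             // give the other nodes time to register

	nodes, _ := client.Get(ctx, countKey).Int64()
	client.Expire(ctx, countKey, 10*time.Second) // counter clears itself before the next round

	fmt.Printf("project total=%s position=%d nodes=%d\n", msg.Payload, pos, nodes)
}
```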
@@ -207,14 +209,31 @@ func (j *Job) syncPartition() {
 	for {
 		select {
 		case <-tk.C:
+
+			//fmt.Printf("total: %d nodes: %d pos: %d\n", j.Partition.Total, j.Partition.N, j.Partition.Pn)
+			// refresh this node's heartbeat once per ticker interval
+			ml := false
+			nowUnix := time.Now().Unix()
+			j.syncClient.HSet(context.Background(), j.jobNodesHeartbeatKey, j.Partition.Pn, nowUnix)
+			// check for nodes whose heartbeat is more than one minute old; if any exist, force a repartition
+			if hm := j.syncClient.HGetAll(context.Background(), j.jobNodesHeartbeatKey); hm.Err() == nil {
+				for _, lastSyncUnixs := range hm.Val() {
+					if lastSyncUnix, err := strconv.ParseInt(lastSyncUnixs, 10, 64); err == nil && nowUnix-lastSyncUnix > 60 {
+						j.syncClient.Del(context.Background(), j.jobNodesHeartbeatKey)
+						ml = true
+						break
+					}
+				}
+			}
+
 			ps, err := j.SvcCtx.ProjectConfig.FindAll(j.Ctx)
 			if err != nil {
 				logx.Errorf("not found project config")
 			}
 
-			if len(ps) != int(j.Partition.Total) {
+			if ml || len(ps) != int(j.Partition.Total) {
 				// publish a message
-				j.syncClient.Publish(j.Ctx, "simulation-job-"+j.Name, len(ps))
+				j.syncClient.Publish(j.Ctx, j.jobProjectTotalKey, len(ps))
 			}
 		}
 	}
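The new ticker branch gives each node a heartbeat: every 30 seconds it writes its position and the current unix time into the `job:nodes:heartbeat:<name>` hash, and if any entry there is older than 60 seconds the whole hash is deleted and the project total is republished, which restarts the Incr handshake and repartitions the work without the dead node. A minimal sketch of that stale check, again assuming go-redis v9 (the function name is made up for illustration):

```go
package job

import (
	"context"
	"strconv"

	"github.com/redis/go-redis/v9"
)

// hasStaleNode mirrors the check in the ticker branch above: any heartbeat older
// than 60 seconds drops the whole hash and reports that a repartition is needed.
// Sketch only; go-redis v9 and the function name are assumptions.
func hasStaleNode(ctx context.Context, client *redis.Client, heartbeatKey string, nowUnix int64) bool {
	entries, err := client.HGetAll(ctx, heartbeatKey).Result()
	if err != nil {
		return false
	}
	for _, last := range entries {
		if lastUnix, perr := strconv.ParseInt(last, 10, 64); perr == nil && nowUnix-lastUnix > 60 {
			client.Del(ctx, heartbeatKey) // drop every recorded position so the handshake rebuilds them
			return true
		}
	}
	return false
}
```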