Skip to content

Commit

Permalink
autoupdater: fix: pull request with failed CI status is not suspended
Browse files Browse the repository at this point in the history
This fixes that a pull request that is uptodate with it's base branch
and has a failed CI status for a build that has not been triggered by
directorius is not suspended.
The failed status is wrongly evaluated as insignificant because no
triggered build for the job is found in PullRequest.LastStartedCIBuilds.

- make the PullRequest.LastStartedCIBuilds map private and hold the lock
  in it's getter and setter, this makes it easy to
  use it concurrency-safe and prevent issues in the future.
  The map was always accessed serialized and no issues are known.
- replace PullRequest.FailedCIStatusIsForNewestBuild with
  queue.failedRequiredCIStatusesAreObsolete. The functions returns true
  if a status is exist that is required, failed and no build with a
  higher ID has been triggered.
- add a testcase for the bug
  • Loading branch information
fho committed Jan 21, 2025
1 parent ccca16c commit c042ba2
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 72 deletions.
9 changes: 9 additions & 0 deletions internal/autoupdater/autoupdate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,22 @@ const (
repo = "repo"
repoOwner = "testman"
queueHeadLabel = "first"
prBranch = "prbr"
prNR = 101
)

const (
condCheckInterval = 20 * time.Millisecond
condWaitTimeout = 5 * time.Second
)

func forwardLogsToTestLogger(t *testing.T) *zap.Logger {
t.Helper()
l := zaptest.NewLogger(t).Named(t.Name())
t.Cleanup(zap.ReplaceGlobals(l))
return l
}

func mockSuccessfulGithubAddLabelQueueHeadCall(clt *mocks.MockGithubClient, expectedPRNr int) *gomock.Call {
return clt.
EXPECT().
Expand Down
5 changes: 3 additions & 2 deletions internal/autoupdater/ci.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,17 @@ func (c *CI) RunAll(ctx context.Context, retryer *retry.Retryer, pr *PullRequest
go c.runCIJobToCh(ctx, ch, retryer, pr, jobTempl)
}

clear(pr.LastStartedCIBuilds)
builds := make(map[string]*jenkins.Build, len(c.Jobs))

for range len(c.Jobs) {
result := <-ch
if result.Err != nil {
errs = append(errs, result.Err)
continue
}
pr.LastStartedCIBuilds[result.Build.JobName] = result.Build
builds[result.Build.JobName] = result.Build
}
pr.SetLastStartedCIBuilds(builds)

if len(errs) > 0 {
return errors.Join(errs...)
Expand Down
66 changes: 17 additions & 49 deletions internal/autoupdater/pullrequest.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package autoupdater
import (
"errors"
"fmt"
"maps"
"sync"
"sync/atomic"
"time"
Expand Down Expand Up @@ -30,17 +31,16 @@ type PullRequest struct {
// EnqueuedAt is the time when the PR is added to merge-queue
EnqueuedAt time.Time

// LastStartedCIBuilds keys are [jenkins.Build.jobName]
LastStartedCIBuilds map[string]*jenkins.Build

Priority atomic.Int32

// SuspendCount is increased whenever a PR is moved from the active to
// the suspend queue.
SuspendCount atomic.Uint32

// lastStartedCIBuilds keys are [jenkins.Build.jobName]
lastStartedCIBuilds map[string]*jenkins.Build
stateUnchangedSince time.Time
lock sync.Mutex // must be held when accessing stateUnchangedSince
lock sync.Mutex // must be held when accessing stateUnchangedSince, lastStartedCIBuilds

GithubStatusLastSetState ReportedStatusState
GithubStatusLock sync.Mutex
Expand Down Expand Up @@ -84,7 +84,7 @@ func NewPullRequest(nr int, branch, author, title, link string) (*PullRequest, e
logfields.PullRequest(nr),
logfields.Branch(branch),
},
LastStartedCIBuilds: map[string]*jenkins.Build{},
lastStartedCIBuilds: map[string]*jenkins.Build{},
EnqueuedAt: time.Now(),
inActiveQueueSince: atomic.Pointer[time.Time]{},
}
Expand Down Expand Up @@ -130,50 +130,6 @@ func (p *PullRequest) SetStateUnchangedSinceIfZero(t time.Time) {
}
}

// FailedCIStatusIsForNewestBuild returns true if a required and failed status
// in [statuses] is for a job in [pr.LastStartedCIBuilds] and has the same or
// newer build id.
//
// This function is not concurrency-safe, reading and writing to
// [pr.CITriggerStatus.BuildURLs] must be serialized.
func (p *PullRequest) FailedCIStatusIsForNewestBuild(logger *zap.Logger, statuses []*githubclt.CIJobStatus) (bool, error) {
if len(statuses) == 0 {
return false, errors.New("github status ci job list is empty")
}

for _, status := range statuses {
if !status.Required || status.Status != githubclt.CIStatusFailure {
continue
}

build, err := jenkins.ParseBuildURL(status.JobURL)
if err != nil {
logger.Warn(
"parsing ci job url from github webhook as jenkins build url failed,",
logfields.CIBuildURL(build.String()),
zap.Error(err),
)
return false, fmt.Errorf("parsing ci job url (%q) from github webhook as jenkins build url failed: %w", status.JobURL, err)
}

lastBuild, exists := p.LastStartedCIBuilds[build.JobName]
if !exists {
continue
}

if build.Number >= lastBuild.Number {
logger.Debug("failed ci job status is for latest or newer build",
logfields.CIJob(build.JobName),
zap.Stringer("ci.latest_build", lastBuild),
zap.Stringer("github.ci_failed_status_build", build),
)
return true, nil
}
}

return false, nil
}

// SetInActiveQueueSince sets the inActiveQueueSince to [time.Now()].
func (p *PullRequest) SetInActiveQueueSince() {
n := time.Now()
Expand All @@ -185,3 +141,15 @@ func (p *PullRequest) SetInActiveQueueSince() {
func (p *PullRequest) InActiveQueueSince() time.Time {
return *p.inActiveQueueSince.Load()
}

func (p *PullRequest) SetLastStartedCIBuilds(m map[string]*jenkins.Build) {
p.lock.Lock()
p.lastStartedCIBuilds = m
p.lock.Unlock()
}

func (p *PullRequest) GetLastStartedCIBuilds() map[string]*jenkins.Build {
p.lock.Lock()
defer p.lock.Unlock()
return maps.Clone(p.lastStartedCIBuilds)
}
92 changes: 71 additions & 21 deletions internal/autoupdater/queue_processpr.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ import (
"strconv"
"time"

"go.uber.org/zap"

"github.com/simplesurance/directorius/internal/githubclt"
"github.com/simplesurance/directorius/internal/jenkins"
"github.com/simplesurance/directorius/internal/logfields"

"go.uber.org/zap"
)

type Action int
Expand Down Expand Up @@ -148,6 +149,7 @@ func (q *queue) evalPRAction(ctx context.Context, logger *zap.Logger, pr *PullRe
)
// TODO: rerun the update loop, instead of continuing with the wrong information!
}

switch status.CIStatus {
case githubclt.CIStatusSuccess:
return &requiredActions{
Expand All @@ -168,38 +170,86 @@ func (q *queue) evalPRAction(ctx context.Context, logger *zap.Logger, pr *PullRe
}, nil

case githubclt.CIStatusFailure:
newestBuildFailed, err := pr.FailedCIStatusIsForNewestBuild(logger, status.Statuses)
if len(status.Statuses) == 0 {
return nil, errors.New("ready for merge status has negative CI status but statuses list is empty")
}

failedStatusesAreObsolete, err := q.failedRequiredCIStatusesAreObsolete(
status.Statuses,
pr.GetLastStartedCIBuilds(),
)
if err != nil {
// suspend the PR to prevent that it is stuck #1 with failed CI checks
return &requiredActions{
Actions: []Action{ActionSuspend},
Reason: reasonCIStatusFailure + ": " + err.Error(),
HeadCommitID: updateHeadCommit,
}, nil
return nil, err
}

if newestBuildFailed {
if failedStatusesAreObsolete {
logger.Debug("status check is negative "+
"but none of the affected ci builds are in the list "+
"of recently triggered required jobs, pr is not suspended",
zap.Any("ci.build.last_triggered", pr.lastStartedCIBuilds),
zap.Any("github.ci_statuses", status),
)

return &requiredActions{
Actions: []Action{ActionSuspend},
Reason: reasonCIStatusFailure,
Actions: []Action{ActionNone},
Reason: reasonPreviousCIJobsFailed,
HeadCommitID: updateHeadCommit,
}, nil
}

logger.Debug("status check is negative "+
"but none of the affected ci builds are in the list "+
"of recently triggered required jobs, pr is not suspended",
zap.Any("ci.build.last_triggered", pr.LastStartedCIBuilds),
zap.Any("github.ci_statuses", status),
)
return &requiredActions{
Actions: []Action{ActionNone},
Reason: reasonPreviousCIJobsFailed,
Actions: []Action{ActionSuspend},
Reason: reasonCIStatusFailure,
HeadCommitID: updateHeadCommit,
}, nil

default:
logger.DPanic("BUG:pull request ci status has unexpected value")
logger.DPanic("BUG: pull request ci status has unexpected value")
return nil, fmt.Errorf("BUG: pull request ci status has unexpected value: %q", status.CIStatus)
}
}

func failedRequiredStatuses(statuses []*githubclt.CIJobStatus) []*githubclt.CIJobStatus {
var result []*githubclt.CIJobStatus

for _, s := range statuses {
if s.Required && s.Status == githubclt.CIStatusFailure {
result = append(result, s)
}
}

return result
}

// failedRequiredCIStatusesAreObsolete returns true when for each failed and
// required status an entry in lastStartedCIBuilds exist that has a higher
// build number.
func (q *queue) failedRequiredCIStatusesAreObsolete(statuses []*githubclt.CIJobStatus, lastStartedCIBuilds map[string]*jenkins.Build) (bool, error) {
for _, s := range failedRequiredStatuses(statuses) {
failedBuild, err := jenkins.ParseBuildURL(s.JobURL)
if err != nil {
return false, fmt.Errorf("parsing ci job url (%q) as jenkins build url failed: %w", s.JobURL, err)
}

lastBuild, exist := lastStartedCIBuilds[failedBuild.JobName]
if !exist {
q.logger.Debug("ci job status is for job that has not been triggered",
logfields.CIJob(failedBuild.JobName),
zap.Stringer("ci.latest_build", lastBuild),
zap.Stringer("github.ci_failed_status_build", failedBuild),
)
return false, nil
}

if failedBuild.Number >= lastBuild.Number {
q.logger.Debug("failed ci job status is for latest or newer build",
logfields.CIJob(failedBuild.JobName),
zap.Stringer("ci.latest_build", lastBuild),
zap.Stringer("github.ci_failed_status_build", failedBuild),
)
return false, nil
}
}

return true, nil
}
Loading

0 comments on commit c042ba2

Please sign in to comment.