Skip to content

Commit 2fcd799

Browse files
authored
chore: add Cloud Build build retry in deploy helper (#1436)
1 parent 2646fa0 commit 2fcd799

File tree

10 files changed

+485
-77
lines changed

10 files changed

+485
-77
lines changed

helpers/foundation-deployer/gcp/gcp.go

Lines changed: 135 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,22 @@
1515
package gcp
1616

1717
import (
18+
"context"
19+
"encoding/json"
1820
"fmt"
21+
"regexp"
1922
"strings"
2023
"time"
2124

2225
"github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/gcloud"
26+
"github.com/GoogleCloudPlatform/cloud-foundation-toolkit/infra/blueprint-test/pkg/utils"
2327
"github.com/mitchellh/go-testing-interface"
2428
"github.com/tidwall/gjson"
2529

2630
"github.com/terraform-google-modules/terraform-example-foundation/test/integration/testutils"
31+
32+
"google.golang.org/api/cloudbuild/v1"
33+
"google.golang.org/api/option"
2734
)
2835

2936
const (
@@ -34,23 +41,78 @@ const (
3441
StatusCancelled = "CANCELLED"
3542
)
3643

44+
// RetryOp mirrors the JSON metadata attached to a Cloud Build retry
// operation. triggerNewBuild decodes that metadata into this type to learn
// which build the retry created.
type RetryOp struct {
	// Type holds the "@type" discriminator of the metadata payload.
	Type string `json:"@type"`
	// Build describes the build referenced by the retry operation.
	Build Build `json:"build"`
}

// Build is the subset of build fields decoded from the retry operation
// metadata: the build ID, its status and its creation timestamp (kept as the
// raw string found in the payload).
type Build struct {
	ID         string `json:"id"`
	Status     string `json:"status"`
	CreateTime string `json:"createTime"`
}
53+
54+
var (
55+
retryRegexp = map[*regexp.Regexp]string{}
56+
// ctx = context.Background()
57+
)
58+
59+
func init() {
60+
for e, m := range testutils.RetryableTransientErrors {
61+
r, err := regexp.Compile(fmt.Sprintf("(?s)%s", e)) //(?s) enables dot (.) to match newline.
62+
if err != nil {
63+
panic(fmt.Sprintf("failed to compile regex %s: %s", e, err.Error()))
64+
}
65+
retryRegexp[r] = m
66+
}
67+
}
68+
3769
type GCP struct {
38-
Runf func(t testing.TB, cmd string, args ...interface{}) gjson.Result
39-
sleepTime time.Duration
70+
Runf func(t testing.TB, cmd string, args ...interface{}) gjson.Result
71+
RunCmd func(t testing.TB, cmd string, args ...interface{}) string
72+
TriggerNewBuild func(t testing.TB, ctx context.Context, buildName string) (string, error)
73+
sleepTime time.Duration
74+
}
75+
76+
// runCmd is a wrapper around gcloud.RunCmd because the original function has an input with a private type
77+
func runCmd(t testing.TB, cmd string, args ...interface{}) string {
78+
return gcloud.RunCmd(t, utils.StringFromTextAndArgs(append([]interface{}{cmd}, args...)...))
79+
}
80+
81+
// triggerNewBuild triggers a new build based on the build provided
82+
func triggerNewBuild(t testing.TB, ctx context.Context, buildName string) (string, error) {
83+
84+
buildService, err := cloudbuild.NewService(ctx, option.WithScopes(cloudbuild.CloudPlatformScope))
85+
if err != nil {
86+
return "", fmt.Errorf("failed to create Cloud Build service: %w", err)
87+
}
88+
retryOperation, err := buildService.Projects.Locations.Builds.Retry(buildName, &cloudbuild.RetryBuildRequest{}).Do()
89+
if err != nil {
90+
return "", fmt.Errorf("failed to retry build: %w", err)
91+
}
92+
93+
var data RetryOp
94+
err = json.Unmarshal(retryOperation.Metadata, &data)
95+
if err != nil {
96+
return "", fmt.Errorf("error unmarshaling retry operation metadata: %v", err)
97+
}
98+
99+
return data.Build.ID, nil
40100
}
41101

42102
// NewGCP creates a new wrapper for Google Cloud Platform CLI.
43103
func NewGCP() GCP {
44104
return GCP{
45-
Runf: gcloud.Runf,
46-
sleepTime: 20,
105+
Runf: gcloud.Runf,
106+
RunCmd: runCmd,
107+
TriggerNewBuild: triggerNewBuild,
108+
sleepTime: 20,
47109
}
48110
}
49111

50112
// IsComponentInstalled checks if a given gcloud component is installed
51113
func (g GCP) IsComponentInstalled(t testing.TB, componentID string) bool {
52-
filter := fmt.Sprintf("\"id='%s'\"",componentID)
53-
components := g.Runf(t, "components list --filter %s", filter).Array()
114+
filter := fmt.Sprintf("\"id='%s'\"", componentID)
115+
components := g.Runf(t, "components list --filter %s", filter).Array()
54116
if len(components) == 0 {
55117
return false
56118
}
@@ -70,8 +132,13 @@ func (g GCP) GetBuilds(t testing.TB, projectID, region, filter string) map[strin
70132
}
71133

72134
// GetLastBuildStatus gets the status of the last build form a project and region that satisfy the given filter.
73-
func (g GCP) GetLastBuildStatus(t testing.TB, projectID, region, filter string) string {
74-
return g.Runf(t, "builds list --project %s --region %s --limit 1 --sort-by ~createTime --filter %s", projectID, region, filter).Array()[0].Get("status").String()
135+
func (g GCP) GetLastBuildStatus(t testing.TB, projectID, region, filter string) (string, string) {
136+
builds := g.Runf(t, "builds list --project %s --region %s --limit 1 --sort-by ~createTime --filter %s", projectID, region, filter).Array()
137+
if len(builds) == 0 {
138+
return "", ""
139+
}
140+
build := builds[0]
141+
return build.Get("status").String(), build.Get("id").String()
75142
}
76143

77144
// GetBuildStatus gets the status of the given build
@@ -91,16 +158,21 @@ func (g GCP) GetRunningBuildID(t testing.TB, projectID, region, filter string) s
91158
return ""
92159
}
93160

161+
// GetBuildLogs get the execution logs of the given build
162+
func (g GCP) GetBuildLogs(t testing.TB, projectID, region, buildID string) string {
163+
return g.RunCmd(t, "builds log %s --project %s --region %s", buildID, projectID, region)
164+
}
165+
94166
// GetFinalBuildState gets the terminal status of the given build. It will wait if build is not finished.
95-
func (g GCP) GetFinalBuildState(t testing.TB, projectID, region, buildID string, maxRetry int) (string, error) {
167+
func (g GCP) GetFinalBuildState(t testing.TB, projectID, region, buildID string, maxBuildRetry int) (string, error) {
96168
var status string
97169
count := 0
98170
fmt.Printf("waiting for build %s execution.\n", buildID)
99171
status = g.GetBuildStatus(t, projectID, region, buildID)
100172
fmt.Printf("build status is %s\n", status)
101173
for status != StatusSuccess && status != StatusFailure && status != StatusCancelled {
102174
fmt.Printf("build status is %s\n", status)
103-
if count >= maxRetry {
175+
if count >= maxBuildRetry {
104176
return "", fmt.Errorf("timeout waiting for build '%s' execution", buildID)
105177
}
106178
count = count + 1
@@ -112,29 +184,66 @@ func (g GCP) GetFinalBuildState(t testing.TB, projectID, region, buildID string,
112184
}
113185

114186
// WaitBuildSuccess waits for the current build in a repo to finish.
115-
func (g GCP) WaitBuildSuccess(t testing.TB, project, region, repo, commitSha, failureMsg string, maxRetry int) error {
116-
var filter string
187+
func (g GCP) WaitBuildSuccess(t testing.TB, project, region, repo, commitSha, failureMsg string, maxBuildRetry, maxErrorRetries int, timeBetweenErrorRetries time.Duration) error {
188+
var filter, status, build string
189+
var timeoutErr, err error
190+
ctx := context.Background()
191+
117192
if commitSha == "" {
118193
filter = fmt.Sprintf("source.repoSource.repoName:%s", repo)
119194
} else {
120195
filter = fmt.Sprintf("source.repoSource.commitSha:%s", commitSha)
121196
}
122-
build := g.GetRunningBuildID(t, project, region, filter)
123-
if build != "" {
124-
status, err := g.GetFinalBuildState(t, project, region, build, maxRetry)
125-
if err != nil {
126-
return err
197+
198+
build = g.GetRunningBuildID(t, project, region, filter)
199+
for i := 0; i < maxErrorRetries; i++ {
200+
if build != "" {
201+
status, timeoutErr = g.GetFinalBuildState(t, project, region, build, maxBuildRetry)
202+
if timeoutErr != nil {
203+
return timeoutErr
204+
}
205+
} else {
206+
status, build = g.GetLastBuildStatus(t, project, region, filter)
207+
if build == "" {
208+
return fmt.Errorf("no build found for filter: %s", filter)
209+
}
127210
}
211+
128212
if status != StatusSuccess {
129-
return fmt.Errorf("%s\nSee:\nhttps://console.cloud.google.com/cloud-build/builds;region=%s/%s?project=%s\nfor details", failureMsg, region, build, project)
213+
if !g.IsRetryableError(t, project, region, build) {
214+
return fmt.Errorf("%s\nSee:\nhttps://console.cloud.google.com/cloud-build/builds;region=%s/%s?project=%s\nfor details", failureMsg, region, build, project)
215+
}
216+
fmt.Println("build failed with retryable error. a new build will be triggered.")
217+
} else {
218+
return nil // Build succeeded
130219
}
131-
} else {
132-
status := g.GetLastBuildStatus(t, project, region, filter)
133-
if status != StatusSuccess {
134-
return fmt.Errorf("%s\nSee:\nhttps://console.cloud.google.com/cloud-build/builds;region=%s/%s?project=%s\nfor details", failureMsg, region, build, project)
220+
221+
// Trigger a new build
222+
build, err = g.TriggerNewBuild(t, ctx, fmt.Sprintf("projects/%s/locations/%s/builds/%s", project, region, build))
223+
if err != nil {
224+
return fmt.Errorf("failed to trigger new build (attempt %d/%d): %w", i+1, maxErrorRetries, err)
225+
}
226+
fmt.Printf("triggered new build with ID: %s (attempt %d/%d)\n", build, i+1, maxErrorRetries)
227+
if i < maxErrorRetries-1 {
228+
time.Sleep(timeBetweenErrorRetries) // Wait before retrying
229+
}
230+
}
231+
return fmt.Errorf("%s\nbuild failed after %d retries.\nSee Cloud Build logs for details", failureMsg, maxErrorRetries)
232+
}
233+
234+
// IsRetryableError checks the logs of a failed Cloud Build build
235+
// and verify if the error is a transient one and can be retried
236+
func (g GCP) IsRetryableError(t testing.TB, projectID, region, build string) bool {
237+
logs := g.GetBuildLogs(t, projectID, region, build)
238+
found := false
239+
for pattern, msg := range retryRegexp {
240+
if pattern.MatchString(logs) {
241+
found = true
242+
fmt.Printf("error '%s' is worth of a retry\n", msg)
243+
break
135244
}
136245
}
137-
return nil
246+
return found
138247
}
139248

140249
// HasSccNotification checks if a Security Command Center notification exists
@@ -158,12 +267,12 @@ func (g GCP) HasTagKey(t testing.TB, orgID, tag string) bool {
158267
}
159268

160269
// EnableApis enables the apis in the given project
161-
func (g GCP) EnableApis(t testing.TB, project string, apis []string) {
270+
func (g GCP) EnableAPIs(t testing.TB, project string, apis []string) {
162271
g.Runf(t, "services enable %s --project %s", strings.Join(apis, " "), project)
163272
}
164273

165-
// IsApiEnabled checks if the api is enabled in the given project
166-
func (g GCP) IsApiEnabled(t testing.TB, project, api string) bool {
274+
// IsAPIEnabled checks if the api is enabled in the given project
275+
func (g GCP) IsAPIEnabled(t testing.TB, project, api string) bool {
167276
filter := fmt.Sprintf("config.name=%s", api)
168277
return len(g.Runf(t, "services list --enabled --project %s --filter %s", project, filter).Array()) > 0
169278
}

helpers/foundation-deployer/gcp/gcp_test.go

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,17 @@
1515
package gcp
1616

1717
import (
18+
"context"
1819
"fmt"
1920
"os"
2021
"path/filepath"
22+
"time"
23+
2124
gotest "testing"
2225

2326
"github.com/mitchellh/go-testing-interface"
2427
"github.com/stretchr/testify/assert"
28+
2529
"github.com/tidwall/gjson"
2630
)
2731

@@ -71,12 +75,12 @@ func TestGetLastBuildStatus(t *gotest.T) {
7175
},
7276
sleepTime: 1,
7377
}
74-
status := gcp.GetLastBuildStatus(t, "prj-b-cicd-0123", "us-central1", "filter")
78+
status, _ := gcp.GetLastBuildStatus(t, "prj-b-cicd-0123", "us-central1", "filter")
7579
assert.Equal(t, StatusSuccess, status)
7680

7781
current, err = os.ReadFile(filepath.Join(".", "testdata", "failure_build.json"))
7882
assert.NoError(t, err)
79-
status = gcp.GetLastBuildStatus(t, "prj-b-cicd-0123", "us-central1", "filter")
83+
status, _ = gcp.GetLastBuildStatus(t, "prj-b-cicd-0123", "us-central1", "filter")
8084
assert.Equal(t, StatusFailure, status)
8185
}
8286

@@ -132,10 +136,13 @@ func TestWaitBuildSuccess(t *gotest.T) {
132136
callCount = callCount + 1
133137
return resp
134138
},
139+
RunCmd: func(t testing.TB, cmd string, args ...interface{}) string {
140+
return ""
141+
},
135142
sleepTime: 1,
136143
}
137144

138-
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "failed_test_for_WaitBuildSuccess", 40)
145+
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "failed_test_for_WaitBuildSuccess", 40, 2, 1*time.Second)
139146
assert.Error(t, err, "should have failed")
140147
assert.Contains(t, err.Error(), "failed_test_for_WaitBuildSuccess", "should have failed with custom info")
141148
assert.Equal(t, callCount, 3, "Runf must be called three times")
@@ -164,11 +171,66 @@ func TestWaitBuildTimeout(t *gotest.T) {
164171
callCount = callCount + 1
165172
return resp
166173
},
174+
RunCmd: func(t testing.TB, cmd string, args ...interface{}) string {
175+
return ""
176+
},
167177
sleepTime: 1,
168178
}
169179

170-
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "failed_test_for_WaitBuildSuccess", 1)
180+
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "failed_test_for_WaitBuildSuccess", 1, 1, 1*time.Second)
171181
assert.Error(t, err, "should have failed")
172182
assert.Contains(t, err.Error(), "timeout waiting for build '736f4689-2497-4382-afd0-b5f0f50eea5b' execution", "should have failed with timeout error")
173183
assert.Equal(t, callCount, 3, "Runf must be called three times")
174184
}
185+
186+
func TestWaitBuildSuccessRetry(t *gotest.T) {
187+
188+
working, err := os.ReadFile(filepath.Join(".", "testdata", "working_build.json"))
189+
assert.NoError(t, err)
190+
failure, err := os.ReadFile(filepath.Join(".", "testdata", "failure_build.json"))
191+
assert.NoError(t, err)
192+
retry, err := os.ReadFile(filepath.Join(".", "testdata", "working_build_retry.json"))
193+
assert.NoError(t, err)
194+
success, err := os.ReadFile(filepath.Join(".", "testdata", "success_build.json"))
195+
assert.NoError(t, err)
196+
197+
runCmdCallCount := 0
198+
triggerNewBuildCallCount := 0
199+
runfCallCount := 0
200+
runfCalls := []gjson.Result{
201+
{Type: gjson.JSON,
202+
Raw: fmt.Sprintf("[%s]", string(working[:]))}, // builds list
203+
{Type: gjson.JSON,
204+
Raw: string(working[:])}, // builds describe
205+
{Type: gjson.JSON,
206+
Raw: string(failure[:])}, // builds describe
207+
{Type: gjson.JSON,
208+
Raw: string(retry[:])}, // builds describe
209+
{Type: gjson.JSON,
210+
Raw: string(success[:])}, // builds describe
211+
}
212+
213+
gcp := GCP{
214+
Runf: func(t testing.TB, cmd string, args ...interface{}) gjson.Result {
215+
resp := runfCalls[runfCallCount]
216+
runfCallCount = runfCallCount + 1
217+
return resp
218+
},
219+
RunCmd: func(t testing.TB, cmd string, args ...interface{}) string {
220+
runCmdCallCount = runCmdCallCount + 1
221+
return "a\nError 403. Compute Engine API has not been used in project\nz" // get build logs
222+
},
223+
TriggerNewBuild: func(t testing.TB, ctx context.Context, buildName string) (string, error) {
224+
triggerNewBuildCallCount = triggerNewBuildCallCount + 1
225+
return "845f5790-2497-4382-afd0-b5f0f50eea5a", nil // buildService.Projects.Locations.Builds.Retry
226+
},
227+
sleepTime: 1,
228+
}
229+
230+
err = gcp.WaitBuildSuccess(t, "prj-b-cicd-0123", "us-central1", "repo", "", "", 40, 2, 1*time.Second)
231+
232+
assert.Nil(t, err, "should have succeeded")
233+
assert.Equal(t, runfCallCount, 5, "Runf must be called five times")
234+
assert.Equal(t, runCmdCallCount, 1, "runCmd getLogs must be called once")
235+
assert.Equal(t, triggerNewBuildCallCount, 1, "TriggerNewBuild must be called once")
236+
}

0 commit comments

Comments
 (0)