Skip to content

Commit bf67251

Browse files
Wyveraldcopybara-github
authored andcommitted
Fix hang with force fetching + repo contents cache
With the repo contents cache enabled, `bazel fetch --force` (and the now-removed `bazel sync`) would cause Bazel to hang forever. The reason is that writing into the repo contents cache causes a Skyframe restart very late into `RepositoryDelegatorFunction`, and the force-fetch bit causes the restarted function to consider it a cache miss, writing into the repo contents cache again, causing an infinite loop. This PR changes it so that a restarted `RepositoryDelegatorFunction` will remember whether it had _just_ finished a fetch before; if so, it will use the cache, even if force-fetch is on. Fixes #26389. Closes #26409. PiperOrigin-RevId: 776306095 Change-Id: I52ad7aeb6581a16268d26a8b142041dc0e748768
1 parent 6c760af commit bf67251

File tree

5 files changed

+101
-12
lines changed

5 files changed

+101
-12
lines changed

src/main/java/com/google/devtools/build/lib/bazel/repository/cache/RepoContentsCache.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,16 @@ public ImmutableList<CandidateRepo> getCandidateRepos(String predeclaredInputHas
119119
try {
120120
return entryDir.getDirectoryEntries().stream()
121121
.filter(path -> path.getBaseName().endsWith(RECORDED_INPUTS_SUFFIX))
122-
// We're just sorting for consistency here. (Note that "10.recorded_inputs" would sort
123-
// before "1.recorded_inputs".) If necessary, we could use some sort of heuristics here
124-
// to speed up the subsequent up-to-date-ness checking.
125-
.sorted(Comparator.comparing(Path::getBaseName))
122+
// Prefer newer cache entries over older ones. They're more likely to be up-to-date; plus,
123+
// if a repo is force-fetched, we want to use the new repo instead of always being stuck
124+
// with the old one.
125+
// To "prefer newer cache entries", we sort the entry file names by length DESC and then
126+
// lexicographically DESC. This approximates sorting by converting to int and then DESC,
127+
// but is defensive against non-numerically named entries.
128+
.sorted(
129+
Comparator.comparing((Path path) -> path.getBaseName().length())
130+
.thenComparing(Path::getBaseName)
131+
.reversed())
126132
.map(CandidateRepo::fromRecordedInputsFile)
127133
.collect(toImmutableList());
128134
} catch (IOException e) {

src/main/java/com/google/devtools/build/lib/bazel/repository/starlark/StarlarkRepositoryFunction.java

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,11 @@ private static class State extends WorkerSkyKeyComputeState<FetchResult> {
124124
@Nullable FetchResult result;
125125
}
126126

127+
@Override
128+
public boolean wasJustFetched(Environment env) {
129+
return env.getState(State::new).result != null;
130+
}
131+
127132
private record FetchArgs(
128133
Rule rule, Path outputDirectory, BlazeDirectories directories, Environment env, SkyKey key) {
129134
FetchArgs toWorkerArgs(Environment env) {
@@ -136,19 +141,21 @@ FetchArgs toWorkerArgs(Environment env) {
136141
public FetchResult fetch(
137142
Rule rule, Path outputDirectory, BlazeDirectories directories, Environment env, SkyKey key)
138143
throws RepositoryFunctionException, InterruptedException {
144+
var state = env.getState(State::new);
145+
if (state.result != null) {
146+
// Escape early if we've already finished fetching once. This can happen if
147+
// RepositoryDelegatorFunction triggers a Skyframe restart _after_
148+
// StarlarkRepositoryFunction#fetch is finished.
149+
return state.result;
150+
}
139151
var args = new FetchArgs(rule, outputDirectory, directories, env, key);
140152
if (!useWorkers) {
141-
return fetchInternal(args);
153+
state.result = fetchInternal(args);
154+
return state.result;
142155
}
143156
// See below (the `catch CancellationException` clause) for why there's a `while` loop here.
144157
while (true) {
145-
var state = env.getState(State::new);
146-
if (state.result != null) {
147-
// Escape early if we've already finished fetching once. This can happen if
148-
// RepositoryDelegatorFunction triggers a Skyframe restart _after_
149-
// StarlarkRepositoryFunction#fetch is finished.
150-
return state.result;
151-
}
158+
state = env.getState(State::new);
152159
try {
153160
state.result =
154161
state.startOrContinueWork(

src/main/java/com/google/devtools/build/lib/rules/repository/RepositoryDelegatorFunction.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,14 @@ private Rule getRepositoryRule(Environment env, RepositoryName repositoryName)
393393
/* Determines whether we should use the cached repositories */
394394
private boolean shouldUseCachedRepos(Environment env, RepositoryFunction handler, Rule rule)
395395
throws InterruptedException {
396+
if (handler.wasJustFetched(env)) {
397+
// If this SkyFunction has finished fetching once, then we should always use the cached
398+
// result. This means that we _very_ recently (as in, in the same command invocation) fetched
399+
// this repo (possibly with --force or --configure), and are only here again due to a Skyframe
400+
// restart very late into RepositoryDelegatorFunction.
401+
return true;
402+
}
403+
396404
boolean forceFetchEnabled = !FORCE_FETCH.get(env).isEmpty();
397405
boolean forceFetchConfigureEnabled =
398406
handler.isConfigure(rule) && !FORCE_FETCH_CONFIGURE.get(env).isEmpty();

src/main/java/com/google/devtools/build/lib/rules/repository/RepositoryFunction.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,15 @@ public enum Reproducibility {
197197
public record FetchResult(
198198
Map<? extends RepoRecordedInput, String> recordedInputValues, Reproducibility reproducible) {}
199199

200+
/**
201+
* We sometimes get to the point before fetching a repo when we have <em>just</em> fetched it,
202+
* particularly because of Skyframe restarts very late into RepositoryDelegatorFunction. In that
203+
* case, this method can be used to report that fact.
204+
*/
205+
public boolean wasJustFetched(Environment env) {
206+
return false;
207+
}
208+
200209
/**
201210
* Verify the data provided by the marker file to check if a refetch is needed. Returns an empty
202211
* Optional if the data is up to date and no refetch is needed and an Optional with a

src/test/py/bazel/bzlmod/bazel_fetch_test.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import os
1818
import tempfile
19+
1920
from absl.testing import absltest
2021
from src.test.py.bazel import test_base
2122
from src.test.py.bazel.bzlmod.test_utils import BazelRegistry
@@ -318,6 +319,64 @@ def testForceFetch(self):
318319
_, _, stderr = self.RunBazel(['fetch', '--repo=@hello', '--force'])
319320
self.assertIn('No more Orange Juice!', ''.join(stderr))
320321

322+
def testForceFetchWithRepoCache(self):
323+
self.ScratchFile(
324+
'MODULE.bazel',
325+
[
326+
'ext = use_extension("extension.bzl", "ext")',
327+
'use_repo(ext, "hello")',
328+
],
329+
)
330+
self.ScratchFile('BUILD')
331+
self.ScratchFile('name.txt', ['foo'])
332+
file_path = self.Path('name.txt').replace('\\', '\\\\')
333+
self.ScratchFile(
334+
'extension.bzl',
335+
[
336+
'def impl(ctx):',
337+
' file_content = ctx.read("' + file_path + '", watch="no")',
338+
' print("name is " + file_content)',
339+
' ctx.file("BUILD",',
340+
' "filegroup(name=\'" + file_content.strip() + "\')")',
341+
' return ctx.repo_metadata(reproducible=True)',
342+
'repo_rule = repository_rule(implementation=impl)',
343+
'',
344+
'def _ext_impl(ctx):',
345+
' repo_rule(name="hello")',
346+
'ext = module_extension(implementation=_ext_impl)',
347+
],
348+
)
349+
350+
_, _, stderr = self.RunBazel(['fetch', '--repo=@hello'])
351+
self.assertIn('name is foo', ''.join(stderr))
352+
self.RunBazel(['build', '@hello//:foo'])
353+
354+
# Change file content and run WITHOUT force, assert no fetching!
355+
self.ScratchFile('name.txt', ['bar'])
356+
_, _, stderr = self.RunBazel(['fetch', '--repo=@hello'])
357+
self.assertNotIn('name is bar', ''.join(stderr))
358+
self.RunBazel(['build', '@hello//:foo'])
359+
360+
# Run again WITH --force and assert fetching
361+
_, _, stderr = self.RunBazel(['fetch', '--repo=@hello', '--force'])
362+
self.assertIn('name is bar', ''.join(stderr))
363+
self.RunBazel(['build', '@hello//:bar'])
364+
365+
# Clean expunge. Assert the cache entry with "bar" is selected (despite
366+
# "foo" also still existing in the cache).
367+
self.RunBazel(['clean', '--expunge'])
368+
self.ScratchFile('name.txt', ['quux'])
369+
_, _, stderr = self.RunBazel(['build', '@hello//:bar'])
370+
self.assertNotIn('name is ', ''.join(stderr))
371+
372+
def testForceFetchWithRepoCacheNoRepoWorkers(self):
373+
self.ScratchFile(
374+
'.bazelrc',
375+
['common --experimental_worker_for_repo_fetching=off'],
376+
mode='a',
377+
)
378+
self.testForceFetchWithRepoCache()
379+
321380
def testFetchTarget(self):
322381
self.main_registry.createCcModule('aaa', '1.0').createCcModule(
323382
'bbb', '1.0', {'aaa': '1.0'}

0 commit comments

Comments
 (0)