Skip to content

Commit 49e43bb

Browse files
bazel-ioWyverald
andauthored
[8.3.1] Fix hang with force fetching + repo contents cache (#26412)
With the repo contents cache enabled, `bazel fetch --force` (and the now-removed `bazel sync`) would cause Bazel to hang forever. The reason is that writing into the repo contents cache causes a Skyframe restart very late into `RepositoryDelegatorFunction`, and the force-fetch bit causes the restarted function to consider it a cache miss, writing into the repo contents cache again, causing an infinite loop. This PR changes it so that a restarted `RepositoryDelegatorFunction` will remember whether it had _just_ finished a fetch before; if so, it will use the cache, even if force-fetch is on. Fixes #26389. Closes #26409. PiperOrigin-RevId: 776306095 Change-Id: I52ad7aeb6581a16268d26a8b142041dc0e748768 Commit bf67251 --------- Co-authored-by: Xdng Yng <[email protected]>
1 parent 42dbcff commit 49e43bb

File tree

5 files changed

+98
-12
lines changed

5 files changed

+98
-12
lines changed

src/main/java/com/google/devtools/build/lib/bazel/repository/cache/RepoContentsCache.java

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,16 @@ public ImmutableList<CandidateRepo> getCandidateRepos(String predeclaredInputHas
121121
try {
122122
return entryDir.getDirectoryEntries().stream()
123123
.filter(path -> path.getBaseName().endsWith(RECORDED_INPUTS_SUFFIX))
124-
// We're just sorting for consistency here. (Note that "10.recorded_inputs" would sort
125-
// before "1.recorded_inputs".) If necessary, we could use some sort of heuristics here
126-
// to speed up the subsequent up-to-date-ness checking.
127-
.sorted(Comparator.comparing(Path::getBaseName))
124+
// Prefer newer cache entries over older ones. They're more likely to be up-to-date; plus,
125+
// if a repo is force-fetched, we want to use the new repo instead of always being stuck
126+
// with the old one.
127+
// To "prefer newer cache entries", we sort the entry file names by length DESC and then
128+
// lexicographically DESC. This approximates sorting by converting to int and then DESC,
129+
// but is defensive against non-numerically named entries.
130+
.sorted(
131+
Comparator.comparing((Path path) -> path.getBaseName().length())
132+
.thenComparing(Path::getBaseName)
133+
.reversed())
128134
.map(CandidateRepo::fromRecordedInputsFile)
129135
.collect(toImmutableList());
130136
} catch (IOException e) {

src/main/java/com/google/devtools/build/lib/bazel/repository/starlark/StarlarkRepositoryFunction.java

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,11 @@ private static class State extends WorkerSkyKeyComputeState<FetchResult> {
127127
@Nullable FetchResult result;
128128
}
129129

130+
@Override
131+
public boolean wasJustFetched(Environment env) {
132+
return env.getState(State::new).result != null;
133+
}
134+
130135
private record FetchArgs(
131136
Rule rule, Path outputDirectory, BlazeDirectories directories, Environment env, SkyKey key) {
132137
FetchArgs toWorkerArgs(Environment env) {
@@ -139,19 +144,21 @@ FetchArgs toWorkerArgs(Environment env) {
139144
public FetchResult fetch(
140145
Rule rule, Path outputDirectory, BlazeDirectories directories, Environment env, SkyKey key)
141146
throws RepositoryFunctionException, InterruptedException {
147+
var state = env.getState(State::new);
148+
if (state.result != null) {
149+
// Escape early if we've already finished fetching once. This can happen if
150+
// RepositoryDelegatorFunction triggers a Skyframe restart _after_
151+
// StarlarkRepositoryFunction#fetch is finished.
152+
return state.result;
153+
}
142154
var args = new FetchArgs(rule, outputDirectory, directories, env, key);
143155
if (!useWorkers) {
144-
return fetchInternal(args);
156+
state.result = fetchInternal(args);
157+
return state.result;
145158
}
146159
// See below (the `catch CancellationException` clause) for why there's a `while` loop here.
147160
while (true) {
148-
var state = env.getState(State::new);
149-
if (state.result != null) {
150-
// Escape early if we've already finished fetching once. This can happen if
151-
// RepositoryDelegatorFunction triggers a Skyframe restart _after_
152-
// StarlarkRepositoryFunction#fetch is finished.
153-
return state.result;
154-
}
161+
state = env.getState(State::new);
155162
try {
156163
state.result =
157164
state.startOrContinueWork(

src/main/java/com/google/devtools/build/lib/rules/repository/RepositoryDelegatorFunction.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,14 @@ private RepositoryFunction getHandler(Rule rule) {
450450
/* Determines whether we should use the cached repositories */
451451
private boolean shouldUseCachedRepos(Environment env, RepositoryFunction handler, Rule rule)
452452
throws InterruptedException {
453+
if (handler.wasJustFetched(env)) {
454+
// If this SkyFunction has finished fetching once, then we should always use the cached
455+
// result. This means that we _very_ recently (as in, in the same command invocation) fetched
456+
// this repo (possibly with --force or --configure), and are only here again due to a Skyframe
457+
// restart very late into RepositoryDelegatorFunction.
458+
return true;
459+
}
460+
453461
boolean forceFetchEnabled = !FORCE_FETCH.get(env).isEmpty();
454462
boolean forceFetchConfigureEnabled =
455463
handler.isConfigure(rule) && !FORCE_FETCH_CONFIGURE.get(env).isEmpty();

src/main/java/com/google/devtools/build/lib/rules/repository/RepositoryFunction.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,15 @@ protected static void ensureNativeRepoRuleEnabled(Rule rule, Environment env, St
218218
Transience.PERSISTENT);
219219
}
220220

221+
/**
222+
* We sometimes get to the point before fetching a repo when we have <em>just</em> fetched it,
223+
* particularly because of Skyframe restarts very late into RepositoryDelegatorFunction. In that
224+
* case, this method can be used to report that fact.
225+
*/
226+
public boolean wasJustFetched(Environment env) {
227+
return false;
228+
}
229+
221230
/**
222231
* Verify the data provided by the marker file to check if a refetch is needed. Returns an empty
223232
* Optional if the data is up to date and no refetch is needed and an Optional with a

src/test/py/bazel/bzlmod/bazel_fetch_test.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import os
1818
import tempfile
19+
1920
from absl.testing import absltest
2021
from src.test.py.bazel import test_base
2122
from src.test.py.bazel.bzlmod.test_utils import BazelRegistry
@@ -305,6 +306,61 @@ def testForceFetch(self):
305306
_, _, stderr = self.RunBazel(['fetch', '--repo=@hello', '--force'])
306307
self.assertIn('No more Orange Juice!', ''.join(stderr))
307308

309+
def testForceFetchWithRepoCache(self):
310+
self.ScratchFile(
311+
'MODULE.bazel',
312+
[
313+
'ext = use_extension("extension.bzl", "ext")',
314+
'use_repo(ext, "hello")',
315+
],
316+
)
317+
self.ScratchFile('BUILD')
318+
self.ScratchFile('name.txt', ['foo'])
319+
file_path = self.Path('name.txt').replace('\\', '\\\\')
320+
self.ScratchFile(
321+
'extension.bzl',
322+
[
323+
'def impl(ctx):',
324+
' file_content = ctx.read("' + file_path + '", watch="no")',
325+
' print("name is " + file_content)',
326+
' ctx.file("BUILD",',
327+
' "filegroup(name=\'" + file_content.strip() + "\')")',
328+
' return ctx.repo_metadata(reproducible=True)',
329+
'repo_rule = repository_rule(implementation=impl)',
330+
'',
331+
'def _ext_impl(ctx):',
332+
' repo_rule(name="hello")',
333+
'ext = module_extension(implementation=_ext_impl)',
334+
],
335+
)
336+
337+
_, _, stderr = self.RunBazel(['fetch', '--repo=@hello'])
338+
self.assertIn('name is foo', ''.join(stderr))
339+
self.RunBazel(['build', '@hello//:foo'])
340+
341+
# Change file content and run WITHOUT force, assert no fetching!
342+
self.ScratchFile('name.txt', ['bar'])
343+
_, _, stderr = self.RunBazel(['fetch', '--repo=@hello'])
344+
self.assertNotIn('name is bar', ''.join(stderr))
345+
self.RunBazel(['build', '@hello//:foo'])
346+
347+
# Run again WITH --force and assert fetching
348+
_, _, stderr = self.RunBazel(['fetch', '--repo=@hello', '--force'])
349+
self.assertIn('name is bar', ''.join(stderr))
350+
self.RunBazel(['build', '@hello//:bar'])
351+
352+
# Clean expunge. Assert the cache entry with "bar" is selected (despite
353+
# "foo" also still existing in the cache).
354+
self.RunBazel(['clean', '--expunge'])
355+
self.ScratchFile('name.txt', ['quux'])
356+
_, _, stderr = self.RunBazel(['build', '@hello//:bar'])
357+
self.assertNotIn('name is ', ''.join(stderr))
358+
359+
def testForceFetchWithRepoCacheNoRepoWorkers(self):
360+
with open(self.Path('.bazelrc'), 'a') as f:
361+
f.write('common --experimental_worker_for_repo_fetching=off\n')
362+
self.testForceFetchWithRepoCache()
363+
308364
def testFetchTarget(self):
309365
self.main_registry.createCcModule('aaa', '1.0').createCcModule(
310366
'bbb', '1.0', {'aaa': '1.0'}

0 commit comments

Comments
 (0)