Skip to content

Commit b62804c

Browse files
janbuchar and vdusek authored
fix: Also load input from a file with a .json extension in file system storage (#617)
- fixes bug reported in https://apify.slack.com/archives/C0L33UM7Z/p1759754068120539 --------- Co-authored-by: Vlada Dusek <[email protected]>
1 parent cfadc98 commit b62804c

File tree

5 files changed

+104
-36
lines changed

5 files changed

+104
-36
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ keywords = [
3636
dependencies = [
3737
"apify-client>=2.0.0,<3.0.0",
3838
"apify-shared>=2.0.0,<3.0.0",
39-
"crawlee>=1.0.0,<2.0.0",
39+
"crawlee>=1.0.2,<2.0.0",
4040
"cachetools>=5.5.0",
4141
"cryptography>=42.0.0",
4242
"impit>=0.6.1",

src/apify/_configuration.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from datetime import datetime, timedelta
44
from decimal import Decimal
55
from logging import getLogger
6+
from pathlib import Path
67
from typing import Annotated, Any
78

89
from pydantic import AliasChoices, BeforeValidator, Field, model_validator
@@ -421,6 +422,14 @@ def disable_browser_sandbox_on_platform(self) -> Self:
421422
logger.warning('Actor is running on the Apify platform, `disable_browser_sandbox` was changed to True.')
422423
return self
423424

425+
@property
def canonical_input_key(self) -> str:
    """Return the configured input key with its final suffix set to `.json`.

    A key without a suffix gains one (`INPUT` -> `INPUT.json`); a key that already ends in
    `.json` is returned unchanged.
    """
    key_as_path = Path(self.input_key)
    return str(key_as_path.with_suffix('.json'))
428+
429+
@property
def input_key_candidates(self) -> set[str]:
    """Return every key under which the Actor input may be stored.

    The set holds the configured `input_key`, its canonical `.json` form, and the stem of that
    canonical form (the name without the `.json` suffix). Equal entries collapse because the
    result is a set.
    """
    canonical = self.canonical_input_key
    without_suffix = Path(canonical).stem
    return {self.input_key, canonical, without_suffix}
432+
424433
@classmethod
425434
def get_global_configuration(cls) -> Configuration:
426435
"""Retrieve the global instance of the configuration.
Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
import asyncio
22
import json
3-
from pathlib import Path
3+
import logging
44

5-
from typing_extensions import override
5+
from more_itertools import flatten
6+
from typing_extensions import Self, override
67

78
from crawlee._consts import METADATA_FILENAME
9+
from crawlee.configuration import Configuration as CrawleeConfiguration
810
from crawlee.storage_clients._file_system import FileSystemKeyValueStoreClient
11+
from crawlee.storage_clients.models import KeyValueStoreRecord
912

1013
from apify._configuration import Configuration
1114

15+
logger = logging.getLogger(__name__)
16+
1217

1318
class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
1419
"""Apify-specific implementation of the `FileSystemKeyValueStoreClient`.
@@ -17,23 +22,39 @@ class ApifyFileSystemKeyValueStoreClient(FileSystemKeyValueStoreClient):
1722
directory, except for the metadata file and the `INPUT.json` file.
1823
"""
1924

25+
@override
@classmethod
async def open(
    cls,
    *,
    id: str | None,
    name: str | None,
    alias: str | None,
    configuration: CrawleeConfiguration,
) -> Self:
    """Open the key-value store client and reconcile pre-existing input files.

    All arguments are forwarded unchanged to the parent `open`. Before the client is handed
    back, `_sanitize_input_json_files` runs so that input files already present in the store
    directory have their metadata refreshed.
    """
    kvs_client = await super().open(id=id, name=name, alias=alias, configuration=configuration)

    # Calling a private method on the freshly created instance is fine here - this is a factory method.
    await kvs_client._sanitize_input_json_files()  # noqa: SLF001 - it's okay, this is a factory method

    return kvs_client
40+
2041
@override
2142
async def purge(self) -> None:
2243
"""Purges the key-value store by deleting all its contents.
2344
2445
It deletes all files in the key-value store directory, except for the metadata file and
2546
the `INPUT.json` file. It also updates the metadata to reflect that the store has been purged.
2647
"""
27-
kvs_input_key = Configuration.get_global_configuration().input_key
28-
29-
# First try to find the alternative format of the input file and process it if it exists.
30-
for file_path in self.path_to_kvs.glob('*'):
31-
if file_path.name == f'{kvs_input_key}.json':
32-
await self._process_input_json(file_path)
48+
configuration = Configuration.get_global_configuration()
3349

3450
async with self._lock:
51+
files_to_keep = set(
52+
flatten([key, f'{key}.{METADATA_FILENAME}'] for key in configuration.input_key_candidates)
53+
)
54+
files_to_keep.add(METADATA_FILENAME)
55+
3556
for file_path in self.path_to_kvs.glob('*'):
36-
if file_path.name in {METADATA_FILENAME, kvs_input_key, f'{kvs_input_key}.{METADATA_FILENAME}'}:
57+
if file_path.name in files_to_keep:
3758
continue
3859
if file_path.is_file():
3960
await asyncio.to_thread(file_path.unlink, missing_ok=True)
@@ -43,15 +64,40 @@ async def purge(self) -> None:
4364
update_modified_at=True,
4465
)
4566

46-
async def _process_input_json(self, path: Path) -> None:
47-
"""Process simple input json file to format expected by the FileSystemKeyValueStoreClient.
67+
async def _sanitize_input_json_files(self) -> None:
68+
"""Handle missing metadata for input files."""
69+
configuration = Configuration.get_global_configuration()
70+
alternative_keys = configuration.input_key_candidates - {configuration.canonical_input_key}
4871

49-
For example: INPUT.json -> INPUT, INPUT.json.metadata
50-
"""
51-
try:
52-
f = await asyncio.to_thread(path.open)
53-
input_data = json.load(f)
54-
finally:
55-
f.close()
56-
await asyncio.to_thread(path.unlink, missing_ok=True)
57-
await self.set_value(key=path.stem, value=input_data)
72+
if (self.path_to_kvs / configuration.canonical_input_key).exists():
73+
# Refresh metadata to prevent inconsistencies
74+
input_data = await asyncio.to_thread(
75+
lambda: json.loads((self.path_to_kvs / configuration.canonical_input_key).read_text())
76+
)
77+
await self.set_value(key=configuration.canonical_input_key, value=input_data)
78+
79+
for alternative_key in alternative_keys:
80+
if (alternative_input_file := self.path_to_kvs / alternative_key).exists():
81+
logger.warning(f'Redundant input file found: {alternative_input_file}')
82+
else:
83+
for alternative_key in alternative_keys:
84+
alternative_input_file = self.path_to_kvs / alternative_key
85+
86+
# Only process files that actually exist
87+
if alternative_input_file.exists():
88+
# Refresh metadata to prevent inconsistencies
89+
with alternative_input_file.open() as f:
90+
input_data = await asyncio.to_thread(lambda: json.load(f))
91+
await self.set_value(key=alternative_key, value=input_data)
92+
93+
@override
async def get_value(self, *, key: str) -> KeyValueStoreRecord | None:
    """Get a record from the store, resolving the Actor input under any of its accepted names.

    When `key` is one of `Configuration.input_key_candidates`, each candidate is looked up and
    the first record found is returned. Candidates are tried in a fixed order: the canonical
    `.json` key first, then the remaining candidates sorted alphabetically. Iterating the
    candidate set directly would make the result depend on set iteration order whenever more
    than one input file exists.

    Args:
        key: Key of the record to retrieve.

    Returns:
        The matching record, or `None` if no record exists for the key (or, for an input key,
        for any of its candidates).
    """
    configuration = Configuration.get_global_configuration()
    candidates = configuration.input_key_candidates

    # Any key that is not an input alias goes straight to the parent implementation.
    if key not in candidates:
        return await super().get_value(key=key)

    # Prefer the canonical file; the other candidates are treated as alternatives to it.
    canonical_key = configuration.canonical_input_key
    ordered_candidates = [canonical_key, *sorted(candidates - {canonical_key})]

    for candidate in ordered_candidates:
        record = await super().get_value(key=candidate)
        if record is not None:
            return record

    # `key` itself is among the candidates, so every possible lookup has already missed.
    return None

tests/unit/storage_clients/test_file_system.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,16 @@
22

33
import asyncio
44
import json
5-
from typing import TYPE_CHECKING
5+
from pathlib import Path
66

7+
import pytest
8+
9+
from crawlee import service_locator
710
from crawlee._consts import METADATA_FILENAME
811

912
from apify import Actor, Configuration
1013
from apify.storage_clients._file_system import ApifyFileSystemKeyValueStoreClient
1114

12-
if TYPE_CHECKING:
13-
from pathlib import Path
14-
1515

1616
async def test_purge_preserves_input_file_and_metadata() -> None:
1717
"""Test that purge() preserves INPUT.json and metadata files but removes other files."""
@@ -61,19 +61,32 @@ async def test_purge_preserves_input_file_and_metadata() -> None:
6161

6262
# Verify INPUT.json content is unchanged
6363
input_content = await asyncio.to_thread(input_file.read_text)
64-
assert input_content == '{"test": "input"}'
64+
assert json.loads(input_content) == json.loads('{"test": "input"}')
65+
6566

67+
@pytest.mark.parametrize('input_file_name', ['INPUT', 'INPUT.json'])
68+
async def test_pre_existing_input_used_by_actor(input_file_name: str) -> None:
69+
configuration = Configuration()
70+
service_locator.set_configuration(configuration)
71+
72+
# Create key-value store directory and make sure that it is empty
73+
path_to_input = Path(configuration.storage_dir) / 'key_value_stores' / 'default'
74+
path_to_input.mkdir(parents=True)
75+
assert list(path_to_input.glob('*')) == []
6676

67-
async def test_pre_existing_input_used_by_actor(tmp_path: Path) -> None:
6877
pre_existing_input = {
6978
'foo': 'bar',
7079
}
7180

72-
configuration = Configuration.get_global_configuration()
7381
# Create pre-existing INPUT.json file
74-
path_to_input = tmp_path / 'key_value_stores' / 'default'
75-
path_to_input.mkdir(parents=True)
76-
(path_to_input / f'{configuration.input_key}.json').write_text(json.dumps(pre_existing_input))
82+
(path_to_input / input_file_name).write_text(json.dumps(pre_existing_input))
7783

7884
async with Actor():
7985
assert pre_existing_input == await Actor.get_input()
86+
87+
# Make sure that the input file doesn't get renamed in the process and metadata are added
88+
assert set(path_to_input.glob('*')) == {
89+
path_to_input / '__metadata__.json',
90+
path_to_input / input_file_name,
91+
path_to_input / f'{input_file_name}.__metadata__.json',
92+
}

uv.lock

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)