Skip to content

Commit 64943b8

Browse files
authored
Merge pull request #5 from fractal-analytics-platform/4-setup-data-retrieval-script-and-github-action
New data-retrieval logic for tasks page
2 parents 44fa9a6 + 7379d13 commit 64943b8

File tree

5 files changed

+311
-0
lines changed

5 files changed

+311
-0
lines changed

.github/workflows/task_list.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Runs the tasks data-retrieval script on pushes and pull requests that
# target `main` (or on manual dispatch), then prints the generated JSON file.
name: Task-list page

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
  workflow_dispatch:

jobs:
  retrieve-tasks-data:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - uses: actions/setup-python@v5
        with:
          # Quoted on purpose: an unquoted version is parsed by YAML as a
          # float, so a future bump to 3.10 would silently become 3.1.
          python-version: "3.11"
          cache: pip

      - name: Install requirements
        run: python3 -m pip install -r tasks/data_retrieval/requirements.txt

      - name: Fetch tasks data
        run: python3 -u tasks/data_retrieval/create_tasks_data.py

      - run: cat tasks/data_retrieval/tasks_data.json

tasks/data_retrieval/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
downloads
2+
venv
3+
tasks_data.json
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
import time
2+
from pathlib import Path
3+
import requests
4+
import json
5+
from zipfile import ZipFile
6+
from typing import Any
7+
from pydantic import BaseModel
8+
from typing import Optional, Literal
9+
10+
11+
# Scratch directory, located next to this script, where downloaded wheels
# are written; it is created at import time if it does not exist yet.
DOWNLOAD_FOLDER = Path(__file__).parent.joinpath("downloads")
DOWNLOAD_FOLDER.mkdir(exist_ok=True)
13+
14+
15+
class TaskReadV2(BaseModel):
    """
    Schema used to validate a single task entry before it is exported.

    Modeled after
    https://github.com/fractal-analytics-platform/fractal-server/blob/main/fractal_server/app/schemas/v2/task.py
    """

    # Identification
    name: str
    type: Literal["parallel", "non_parallel", "compound"]
    source: Optional[str] = None
    version: Optional[str] = None

    # Documentation
    docs_info: Optional[str] = None
    docs_link: Optional[str] = None

    # Input/output type flags
    input_types: dict[str, bool]
    output_types: dict[str, bool]

    # Classification metadata
    category: Optional[str] = None
    modality: Optional[str] = None
    authors: Optional[str] = None
    tags: list[str]

    class Config:
        # Reject any attribute that is not declared above.
        extra = "forbid"
36+
37+
38+
class TaskGroupReadV2(BaseModel):
    """
    Schema used to validate one package together with its list of tasks.
    """

    # Package name and (optional) package version
    pkg_name: str
    version: Optional[str] = None

    # Tasks provided by the package, each validated as a `TaskReadV2`
    task_list: list[TaskReadV2]
42+
43+
44+
def parse_wheel_filename(wheel_path: str) -> dict[str, str]:
    """
    Given a wheel-file name or path, extract distribution and version.

    Wheel file names start with `{distribution}-{version}-`, so the first
    two dash-separated components of the file name are returned.

    Args:
        wheel_path: Wheel-file name, or a path whose last component is the
            wheel-file name.

    Returns:
        A dictionary with the `name` and `version` keys.

    Raises:
        ValueError: If the file name does not start with non-empty
            `{distribution}-{version}` components.
    """
    wheel_filename = Path(wheel_path).name
    parts = wheel_filename.split("-")
    # Fail with an explicit message rather than a bare IndexError.
    if len(parts) < 2 or not parts[0] or not parts[1]:
        raise ValueError(
            f"Cannot extract distribution and version from {wheel_filename=}."
        )
    return dict(name=parts[0], version=parts[1])
51+
52+
53+
def load_manifest_from_zip(wheel_path: str) -> dict[str, Any]:
    """
    Given a wheel file on-disk, extract the Fractal manifest.

    The first archive member whose base name is exactly
    `__FRACTAL_MANIFEST__.json` is parsed as JSON.

    Args:
        wheel_path: Path of the wheel (zip) file to read.

    Returns:
        The parsed content of the manifest file.

    Raises:
        ValueError: If the archive has no `__FRACTAL_MANIFEST__.json` member.
    """
    manifest_filename = "__FRACTAL_MANIFEST__.json"
    with ZipFile(wheel_path) as wheel:
        # Compare the base name (zip members always use "/" separators), so
        # that look-alikes such as `__FRACTAL_MANIFEST__.json.bak` are ignored.
        manifest = next(
            (
                name
                for name in wheel.namelist()
                if name.rsplit("/", 1)[-1] == manifest_filename
            ),
            None,
        )
        if manifest is None:
            msg = f"{wheel_path} does not include __FRACTAL_MANIFEST__.json"
            raise ValueError(msg)
        with wheel.open(manifest) as manifest_fd:
            manifest_dict = json.load(manifest_fd)
    return manifest_dict
69+
70+
71+
def download_file(url: str, *, timeout: float = 60.0) -> str:
    """
    Download `url` into `DOWNLOAD_FOLDER` and return the local file path.

    The local file is named after the last "/"-separated component of `url`.

    Args:
        url: URL of the file to download.
        timeout: Timeout (in seconds) forwarded to `requests.get`.

    Returns:
        POSIX-style path of the downloaded file.

    Raises:
        ValueError: If no file name can be extracted from `url`.
        requests.HTTPError: If the server replies with an error status.
    """
    file_name = url.split("/")[-1]
    if not file_name:
        raise ValueError(f"Cannot extract a file name from {url=}.")
    file_path = (DOWNLOAD_FOLDER / file_name).as_posix()
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Do not store an HTTP error page as if it were the requested file.
        response.raise_for_status()
        with open(file_path, "wb") as f:
            # `iter_content()` defaults to 1-byte chunks; use larger ones.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    return file_path
79+
80+
81+
def handle_pypi_project(pypi_project_url: str) -> dict[str, Any]:
    """
    Retrieve name, version and Fractal manifest of a PyPI project.

    The wheel of the latest release (as reported by the PyPI JSON API) is
    downloaded, parsed and then removed from disk.

    Example: https://pypi.org/project/fractal-tasks-core

    Args:
        pypi_project_url: URL of the project page on pypi.org.

    Returns:
        A dictionary with the `manifest`, `name` and `version` keys.

    Raises:
        ValueError: If the URL is not a valid PyPI project URL, or if the
            latest release does not include exactly one wheel.
        RuntimeError: If the PyPI API does not reply with status 200.
    """

    # Extract project_name
    parts = pypi_project_url.split("/")
    if (
        parts[:4] != ["https:", "", "pypi.org", "project"]
        or len(parts) < 5
        or not parts[4]
    ):
        raise ValueError(
            f"Invalid {pypi_project_url=}.\n"
            "Valid example: https://pypi.org/project/fractal-tasks-core"
        )
    project_name = parts[4]

    # Fetch and parse PyPI information
    pypi_api_url = f"https://pypi.org/pypi/{project_name}/json"
    res = requests.get(pypi_api_url, timeout=30)
    # Check the status before decoding, since an error body may not be JSON.
    if res.status_code != 200:
        raise RuntimeError(f"Invalid response from {pypi_api_url}: {res}")
    response_data = res.json()
    latest_version = response_data["info"]["version"]
    latest_release = response_data["releases"][latest_version]
    latest_release_wheel_assets = [
        item for item in latest_release if item["filename"].endswith(".whl")
    ]
    if not latest_release_wheel_assets:
        raise ValueError(f"Found no wheel asset in release {latest_version}.")
    if len(latest_release_wheel_assets) > 1:
        raise ValueError(
            f"Found more than one wheel asset in release {latest_version}."
        )
    latest_release_wheel_asset_url = latest_release_wheel_assets[0]["url"]

    # Download wheel and parse manifest
    wheel_path = download_file(latest_release_wheel_asset_url)
    try:
        info = parse_wheel_filename(wheel_path)
        manifest = load_manifest_from_zip(wheel_path)
    finally:
        # Always remove the wheel, so that a parsing failure does not leave
        # files behind in the download folder.
        Path(wheel_path).unlink()

    return dict(manifest=manifest, **info)
121+
122+
123+
def handle_github_repository(github_url: str) -> dict[str, Any]:
    """
    Retrieve name, version and Fractal manifest of a GitHub repository.

    The wheel attached to the latest GitHub release is downloaded, parsed
    and then removed from disk.

    Example:
    https://github.com/fractal-analytics-platform/fractal-lif-converters/

    Args:
        github_url: URL of the repository on github.com.

    Returns:
        A dictionary with the `manifest`, `name` and `version` keys.

    Raises:
        ValueError: If the URL is not a valid GitHub repository URL, or if
            the latest release does not include exactly one wheel.
        RuntimeError: If the GitHub API does not reply with status 200.
    """

    # Extract owner and repository
    parts = github_url.split("/")
    if (
        parts[:3] != ["https:", "", "github.com"]
        or len(parts) < 5
        or not parts[3]
        or not parts[4]
    ):
        raise ValueError(
            f"Invalid {github_url=}.\n"
            "Valid example: https://github.com/fractal-analytics-platform/fractal-lif-converters"
        )
    owner, repository = parts[3:5]

    # Fetch and parse GitHub information
    github_api_url = (
        f"https://api.github.com/repos/{owner}/{repository}/releases/latest"
    )
    headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    res = requests.get(github_api_url, headers=headers, timeout=30)
    if res.status_code != 200:
        raise RuntimeError(f"Invalid response from {github_api_url}: {res}")
    assets = res.json()["assets"]
    wheel_assets = [asset for asset in assets if asset["name"].endswith(".whl")]
    if not wheel_assets:
        raise ValueError("Found no wheel asset in latest GitHub release.")
    if len(wheel_assets) > 1:
        raise ValueError("Found more than one wheel asset in latest GitHub release.")
    wheel_asset_browser_download_url = wheel_assets[0]["browser_download_url"]

    # Download wheel and parse manifest
    wheel_path = download_file(wheel_asset_browser_download_url)
    try:
        info = parse_wheel_filename(wheel_path)
        manifest = load_manifest_from_zip(wheel_path)
    finally:
        # Always remove the wheel, so that a parsing failure does not leave
        # files behind in the download folder.
        Path(wheel_path).unlink()

    return dict(manifest=manifest, **info)
164+
165+
166+
def get_package_info(source: str) -> dict[str, Any]:
    """
    Dispatch `source` to the handler that matches its URL prefix.

    URLs starting with `https://pypi.org` go to `handle_pypi_project`, URLs
    starting with `https://github.com` go to `handle_github_repository`; any
    other value raises a `ValueError`.
    """
    # The two prefixes are mutually exclusive, so the check order is irrelevant.
    if source.startswith("https://pypi.org"):
        return handle_pypi_project(source)
    if source.startswith("https://github.com"):
        return handle_github_repository(source)
    raise ValueError(f"Invalid {source=}.")
173+
174+
175+
def _get_task_type(
176+
task: dict[str, Any],
177+
) -> Literal["parallel", "non_parallel", "compound"]:
178+
np = task.get("executable_non_parallel", None)
179+
p = task.get("executable_parallel", None)
180+
if p and np:
181+
return "compound"
182+
elif p and not np:
183+
return "parallel"
184+
elif np and not p:
185+
return "non_parallel"
186+
else:
187+
raise ValueError(f"Invalid task with {p=} and {np=}.")
188+
189+
190+
# Task attributes copied from each manifest task into the exported data.
COLUMN_NAMES = [
    "version",
    "name",
    "category",
    "modality",
    "tags",
    "input_types",
    "output_types",
    "docs_link",
]

# Fallback values for attributes missing from a manifest task; attributes
# not listed here fall back to `None`.
COLUMN_DEFAULTS = dict(input_types={}, output_types={}, tags=[])

# Title-cased version of each column name.
COLUMN_TITLES = [column_name.title() for column_name in COLUMN_NAMES]
206+
207+
208+
# Read and filter list of sources (blank lines and `#` comments are skipped;
# surrounding whitespace is ignored)
sources_file = Path(__file__).parent / "sources.txt"
with sources_file.open("r") as f:
    sources = [line.strip() for line in f.read().splitlines()]
sources = [
    source for source in sources if source and not source.startswith("#")
]

# Build one task group per source; sources that fail are reported and skipped
TASK_GROUPS = []
for source in sources:
    t_start = time.perf_counter()
    print(f"START processing {source=}")
    try:
        task_list = []
        data = get_package_info(source)
        pkg_name = data["name"]
        pkg_version = data.get("version")
        pkg_task_list = data["manifest"]["task_list"]
        for task in pkg_task_list:
            # NOTE: values taken from COLUMN_DEFAULTS are shared objects;
            # here they are only serialized, never modified.
            new_task = {
                column_name: task.get(
                    column_name, COLUMN_DEFAULTS.get(column_name, None)
                )
                for column_name in COLUMN_NAMES
            }
            new_task["version"] = pkg_version
            new_task["type"] = _get_task_type(task)
            # Instantiated only to validate `new_task`
            TaskReadV2(**new_task)
            task_list.append(new_task)

        task_group = dict(
            pkg_name=pkg_name,
            version=pkg_version,
            task_list=task_list,
        )
    except Exception as e:
        print(f"ERROR, skip.\nOriginal error:\n{str(e)}")
    else:
        # Only reached when the source was processed successfully, so that a
        # failed source can neither hit an undefined `task_group` nor append
        # the task group of the previous source a second time.
        TaskGroupReadV2(**task_group)
        TASK_GROUPS.append(task_group)

    t_end = time.perf_counter()
    print(f"END processing {source=} - elapsed {t_end-t_start:.3f} s.")
    print()

output_file = Path(__file__).parent / "tasks_data.json"
with output_file.open("w") as f:
    json.dump(TASK_GROUPS, f, indent=2)

# Remove files left behind by failed sources, since `rmdir` only succeeds on
# an empty folder
for leftover in DOWNLOAD_FOLDER.iterdir():
    leftover.unlink()
DOWNLOAD_FOLDER.rmdir()
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
requests
2+
pydantic

tasks/data_retrieval/sources.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# PyPI
2+
https://pypi.org/project/fractal-tasks-core/
3+
https://pypi.org/project/fractal-faim-ipa
4+
https://pypi.org/project/fractal-lif-converters
5+
https://pypi.org/project/operetta-compose
6+
7+
# GitHub releases with wheels
8+
https://github.com/fractal-analytics-platform/fractal-lif-converters/
9+
10+
11+
# https://github.com/fractal-analytics-platform/fractal-helper-tasks
12+
# https://github.com/fmi-basel/gliberal-scMultipleX
13+
# https://github.com/Apricot-Therapeutics/APx_fractal_task_collection
14+
# https://github.com/fractal-analytics-platform/fractal-plantseg-tasks
15+
# https://github.com/m-albert/fractal-ome-zarr-hcs-stitching/archive
16+
# https://github.com/fractal-analytics-platform/fractal-ilastik-tasks/archive/refs/tags/0.1.1.zip

0 commit comments

Comments
 (0)