Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Image that installs RSFC with Poetry and exposes the `rsfc` CLI as entrypoint.
# NOTE(review): the sources are cloned from GitHub at build time (see the
# `git clone` step), so the build context / local checkout is not what gets
# installed -- confirm this is intended.
FROM python:3.12-slim

# Install git (used below to clone the sources) and curl (used below to fetch
# the Poetry installer), then remove the apt package lists in the same layer.
RUN apt-get update && \
apt-get install -y git curl && \
rm -rf /var/lib/apt/lists/*

# Install Poetry by piping its installer script into python3.
RUN curl -sSL https://install.python-poetry.org | python3 -

# Put /root/.local/bin on PATH so the `poetry` command used below resolves
# (presumably where the installer places the executable for root -- confirm).
ENV PATH="/root/.local/bin:$PATH"

# All following steps run inside /rsfc.
WORKDIR /rsfc

# Fetch the RSFC sources into the working directory.
RUN git clone https://github.com/oeg-upm/rsfc.git .

# Install the dependencies only; --no-root skips installing the project itself.
RUN poetry install --no-root

# Install the project itself with pip, run through Poetry.
RUN poetry run pip install .

# Download the NLTK "wordnet" resource at build time.
RUN poetry run python -m nltk.downloader wordnet

# Configure SOMEF non-interactively
# (presumably `-a` selects the automatic/default configuration -- confirm).
RUN poetry run somef configure -a

# Directory the README mounts a host volume onto (/rsfc/outputs).
RUN mkdir -p /rsfc/outputs

# Arguments passed to `docker run <image> ...` are appended to `rsfc`.
ENTRYPOINT ["poetry", "run", "rsfc"]
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,30 @@ Without poetry virtual environment activated you need to use the poetry run:
poetry run rsfc --help
```

## Docker installation

If preferred, RSFC can be run using Docker.

Once you have cloned the repository, go to the project's root directory and run the following command to build the image:

```
docker build -t rsfc-docker .
```

After that, it is necessary to create the directory in which the output assessment will be saved. You can do it by running the following command:

```
mkdir ./outputs
```

Finally, run the following command to run the container:

```
docker run --rm -v $(pwd)/outputs:/rsfc/outputs rsfc-docker <repo_url>
```

where `<repo_url>` is the URL of the repository to be analyzed. This argument is required.

## Usage

After installation, you can use the package by running if you activated the poetry env
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "rsfc"
version = "0.0.2"
version = "0.0.3"
description = "EVERSE Research Software Fairness Checks"
authors = ["Andres Montero <[email protected]>"]
license = "MIT"
Expand Down
2 changes: 1 addition & 1 deletion src/rsfc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-


__version__ = "0.0.2"
__version__ = "0.0.3"

8 changes: 3 additions & 5 deletions src/rsfc/harvesters/cff_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,15 @@ def __init__(self, sw):


def get_cff_file(self, sw):
req_url = sw.base_url + '/contents/CITATION.cff'

try:
if sw.repo_type == "GITHUB":
req_url = sw.base_url + '/contents/codemeta.json'
req_url = sw.base_url + '/contents/CITATION.cff'
headers = {'Accept': 'application/vnd.github.v3.raw'}
params = {'ref': sw.repo_branch}

response = requests.get(req_url, headers=headers, params=params)
response.raise_for_status()
return response.json()
return yaml.safe_load(response.text)
elif sw.repo_type == "GITLAB":
project_path_encoded = sw.base_url.split("/projects/")[-1]
branch = sw.repo_branch or "main"
Expand Down Expand Up @@ -55,7 +53,7 @@ def harvest_cff(self, cff):
if "version" in cff:
cff_info["version"] = cff["version"]

if "idenfiers" in cff:
if "identifiers" in cff:
cff_info["identifiers"] = cff["identifiers"]

if "preferred-citation" in cff:
Expand Down
2 changes: 1 addition & 1 deletion src/rsfc/harvesters/somef_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import contextlib
import json
from somef import somef_cli
import os
#import os

class SomefHarvester:

Expand Down
2 changes: 1 addition & 1 deletion src/rsfc/model/assessedSoftware.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __init__(self, repo_url):
self.name = self.get_soft_name()
self.version = self.get_soft_version()
self.id = None
self.repo_branch = rsfc_helpers.get_gitlab_default_branch(self.base_url, self.repo_type)
self.repo_branch = rsfc_helpers.get_repo_default_branch(self.base_url)


def get_repo_base_url(self):
Expand Down
5 changes: 3 additions & 2 deletions src/rsfc/model/indicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def __init__(self, sw, somef, cd, cf):
(rt.test_id_associated_with_software, [somef.somef_data, cd.codemeta_data, cf.cff_data]),
(rt.test_id_common_schema, [somef.somef_data]),
(rt.test_identifier_in_readme_citation, [somef.somef_data, cf.cff_data]),
(rt.test_identifier_resolves_to_software, [somef.somef_data])
(rt.test_identifier_resolves_to_software, [somef.somef_data, cd.codemeta_data, cf.cff_data, sw])
],
"requirements_specified": [
(rt.test_dependencies_declared, [somef.somef_data]),
Expand Down Expand Up @@ -43,7 +43,8 @@ def __init__(self, sw, somef, cd, cf):
"software_has_license": [
(rt.test_has_license, [somef.somef_data]),
(rt.test_license_spdx_compliant, [somef.somef_data]),
(rt.test_license_info_in_metadata_files, [somef.somef_data, cd.codemeta_data, cf.cff_data])
(rt.test_license_info_in_metadata_files, [somef.somef_data, cd.codemeta_data, cf.cff_data]),
(rt.test_license_information_provided, [somef.somef_data])
],
"descriptive_metadata": [
(rt.test_authors, [somef.somef_data, cd.codemeta_data, cf.cff_data]),
Expand Down
68 changes: 51 additions & 17 deletions src/rsfc/rsfc_tests/rsfc_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ def test_identifier_in_readme_citation(somef_data, cff_data):
readme = True

if cff_data != None:
if "identifiers" in cff_data:
if cff_data["identifiers"] != None:
citation = True

if readme and not citation:
Expand All @@ -584,29 +584,43 @@ def test_identifier_in_readme_citation(somef_data, cff_data):
return check.convert()



def test_identifier_resolves_to_software(somef_data): #CAMBIAR. Este debe coger el id que este en el readme, codemeta o cff y mirar que resuelva a la url del software
def test_identifier_resolves_to_software(somef_data, codemeta_data, cff_data, sw): #CAMBIAR. Este debe coger el id que este en el readme, codemeta o cff y mirar que resuelva a la url del software

output = "false"

evidence = constants.EVIDENCE_NO_IDENTIFIER_FOUND
identifier = None
pause = False

if 'identifier' in somef_data:
for item in somef_data['identifier']:
if item['source']:
if 'README' in item['source']:
id = item['result']['value']
identifier = item['result']['value']
pause = True
break

response = requests.head(id, allow_redirects=True, timeout=5)
if response.status_code == 200:
output = "true"
evidence = constants.EVIDENCE_ID_RESOLVES
else:
evidence = constants.EVIDENCE_NO_RESOLVE_DOI_IDENTIFIER
break
else:
evidence = constants.EVIDENCE_NO_DOCUMENTATION_README
else:
output = "false"
evidence = constants.EVIDENCE_NO_IDENTIFIER_FOUND
if not pause and codemeta_data != None and codemeta_data['identifier']:
identifier = codemeta_data['identifier']

if not pause and cff_data != None and cff_data['identifiers'] != None:
identifier = cff_data['identifiers'][0]['value']

if identifier:
doi_url = rsfc_helpers.normalize_identifier_url(identifier)
try:
resp = requests.get(doi_url, allow_redirects=True, timeout=10)
html = resp.text

if rsfc_helpers.landing_page_links_back(html, sw.url):
output = "true"
evidence = constants.EVIDENCE_DOI_LINKS_BACK_TO_REPO
else:
output = "false"
evidence = constants.EVIDENCE_DOI_NO_LINK_BACK_TO_REPO

except requests.RequestException:
output = "false"
evidence = constants.EVIDENCE_NO_RESOLVE_DOI_IDENTIFIER


check = ch.Check(constants.INDICATORS_DICT['persistent_and_unique_identifier'], 'RSFC-07-2', constants.PROCESS_ID_RESOLVES_TO_SOFTWARE, output, evidence)
Expand Down Expand Up @@ -890,6 +904,26 @@ def test_license_spdx_compliant(somef_data):

return check.convert()


def test_license_information_provided(somef_data):
    """Check RSFC-15-3: license information is reported from the README.

    Args:
        somef_data: Mapping of SOMEF results. Its optional 'license' entry is
            iterated, and each item may carry a 'source' value.

    Returns:
        The result of ``convert()`` on the ``Check`` built for this test.
        The output is "true" when at least one license item has a 'source'
        containing 'README', and "false" otherwise.
    """
    if 'license' in somef_data:
        # Evaluate every entry (no short-circuit) before deciding.
        readme_hits = [
            'README' in entry['source']
            for entry in somef_data['license']
            if 'source' in entry
        ]
        if any(readme_hits):
            output = "true"
            evidence = constants.EVIDENCE_LICENSE_INFORMATION_PROVIDED
        else:
            output = "false"
            evidence = constants.EVIDENCE_NO_LICENSE_INFORMATION_PROVIDED
    else:
        # No license entry at all.
        output = "false"
        evidence = constants.EVIDENCE_NO_LICENSE

    check = ch.Check(
        constants.INDICATORS_DICT['software_has_license'],
        'RSFC-15-3',
        constants.PROCESS_LICENSE_INFORMATION_PROVIDED,
        output,
        evidence,
    )
    return check.convert()

################################################### FRSM_16 ###################################################

def test_license_info_in_metadata_files(somef_data, codemeta_data, cff_data):
Expand Down
7 changes: 6 additions & 1 deletion src/rsfc/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,15 @@
PROCESS_IS_GITHUB_OR_GITLAB_REPOSITORY = 'Checks if the URL provided is indeed a Github or Gitlab repository'
PROCESS_ZENODO_SOFTWARE_HERITAGE = 'Searches for Zenodo and Software Heritage badges in the README file of the repository'
PROCESS_IDENTIFIER_IN_README_CITATION = 'Searches for an identifier in the README or CITATION.cff files of the repository'
PROCESS_ID_RESOLVES_TO_SOFTWARE = 'Checks if the identifier found in the README file of the repository resolves to the software'
PROCESS_ID_RESOLVES_TO_SOFTWARE = 'Checks if the identifier found in the README file or metadata files (i.e. codemeta.json, CITATION.cff) resolves to a page that links back to the software repository'
PROCESS_AUTHORS = 'Searches for authors in various files of the repository (i.e. CITATION.cff, AUTHORS.md, codemeta.json)'
PROCESS_CONTRIBUTORS = "Searches for contributors in various files of the repository (i.e. codemeta.json, pyproject.toml, pom.xml)'"
PROCESS_AUTHOR_ORCIDS = 'Checks if all authors stated in the CITATION.cff file have an ORCID assigned'
PROCESS_AUTHOR_ROLES = 'Checks if all authors stated in a codemeta.json file have a role assigned '
PROCESS_VERSION_IN_METADATA = 'Checks if a version number for the software is indicated in the CITATION.cff, codemeta.json or package files(i.e. pyproject.toml, pom.xml, etc.)'
PROCESS_COMMITS_LINKED_TO_ISSUES = 'Checks if there is at least one of the existing issues (opened or closed) referenced in any of the commits made in the default branch of the repository'
PROCESS_COMMITS_HISTORY = 'Checks if the software repository has a commits history'
PROCESS_LICENSE_INFORMATION_PROVIDED = 'Checks if license information is found in the README file of the repository'


#Evidences
Expand Down Expand Up @@ -108,6 +109,7 @@
EVIDENCE_CONTACT_INFO = 'Contact and support information was found in the repository'
EVIDENCE_SPDX_COMPLIANT = 'Licenses are SPDX compliant'
EVIDENCE_LICENSE_INFO_IN_METADATA = 'License information was found in metadata files'
EVIDENCE_LICENSE_INFORMATION_PROVIDED = 'License information was found in the README file of the repository'
EVIDENCE_TICKETS = 'Tickets/Issues were found in the repository'
EVIDENCE_REPO_ENABLED_AND_HAS_COMMITS = 'Repository is enabled and has commits'
EVIDENCE_AUTHOR_ORCIDS_CODEMETA = 'All authors in the codemeta.json file have an orcid identifier'
Expand Down Expand Up @@ -139,6 +141,7 @@
EVIDENCE_VERSION_IN_METADATA = 'Found the software version in one of the specified files'
EVIDENCE_CONTRIBUTORS = 'Found contributors metadata in the codemeta or package files'
EVIDENCE_COMMITS_LINKED_TO_ISSUES = 'There is at least one commit linked to an issue'
EVIDENCE_DOI_LINKS_BACK_TO_REPO = "The landing page of the software's identifier links back to the software repository"


EVIDENCE_NO_LICENSE = 'Could not find any license in the repository'
Expand All @@ -165,6 +168,7 @@
EVIDENCE_NO_CONTACT_INFO = 'Could not find any of the following information: '
EVIDENCE_NO_SPDX_COMPLIANT = 'There is one or more licenses that are not SPDX compliant'
EVIDENCE_NO_LICENSE_INFO_IN_METADATA = 'Could not find any licensing information in the following metadata files: '
EVIDENCE_NO_LICENSE_INFORMATION_PROVIDED = 'Could not find license information in the README file of the repository'
EVIDENCE_NO_TICKETS = 'Could not find tickets/issues in the repository'
EVIDENCE_NO_REPO_ENABLED = 'Repository is not enabled'
EVIDENCE_NO_COMMITS = 'Could not find any commits in the repository'
Expand Down Expand Up @@ -196,6 +200,7 @@
EVIDENCE_NO_VERSION_IN_METADATA = 'Could not find a version number for the software in any of the specified files'
EVIDENCE_NOT_ENOUGH_ISSUES_COMMITS_INFO = 'Could not get the necessary information to perform the test, it being the commits record or repository issues'
EVIDENCE_NO_COMMITS_LINKED_TO_ISSUES = 'There is not any commits linked to any issues in the repository'
EVIDENCE_DOI_NO_LINK_BACK_TO_REPO = "The landing page of the software's identifier does not link back to the software repository"


#Dictionaries
Expand Down
68 changes: 60 additions & 8 deletions src/rsfc/utils/rsfc_helpers.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
from datetime import datetime
import regex as re
import base64
from bs4 import BeautifulSoup
import requests
from rsfc.utils import constants
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_gitlab_default_branch(base_url, repo_type):
if repo_type == "GITLAB":
res = requests.get(base_url)
res.raise_for_status()
data = res.json()
return data.get("default_branch", "main")
def get_repo_default_branch(base_url):
    """Return the default branch reported by the repository API resource.

    Args:
        base_url: URL that is requested with GET and whose JSON body is read
            for a ``default_branch`` field.

    Returns:
        The ``default_branch`` value from the JSON response, or ``"main"``
        when the field is absent.

    Raises:
        requests.HTTPError: If the response status is an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(base_url, timeout=10)
    response.raise_for_status()
    return response.json().get("default_branch", "main")

def decode_github_content(content_json):
encoded_content = content_json.get('content', '')
Expand Down Expand Up @@ -93,13 +93,13 @@ def subtest_author_orcids(file_data):
return True



def build_url_pattern(url):
    """Compile a pattern matching sibling URLs that end in a numeric segment.

    Everything after the last '/' of ``url`` is dropped; the compiled pattern
    matches exactly that prefix followed by '/' and one or more digits.
    """
    parent = url.rsplit('/', 1)[0]
    return re.compile("^" + re.escape(parent) + "/\\d+$")


def get_latest_release(repo_data):
if 'releases' in repo_data:
latest_release = None
Expand Down Expand Up @@ -155,4 +155,56 @@ def cross_check_any_issue(issues, commits, max_workers=8):
if future.result():
executor.shutdown(cancel_futures=True)
return True
return False
return False


def normalize_identifier_url(identifier):
    """Turn an identifier string into a URL that can be requested.

    Args:
        identifier: Identifier text: a doi.org URL, a ``doi:``-prefixed DOI,
            a bare DOI (as recognised by ``constants.DOI_SCHEMA_REGEX``),
            another http(s) URL, or anything else.

    Returns:
        - the stripped identifier unchanged if it already is a doi.org URL;
        - ``https://doi.org/<doi>`` for ``doi:``-prefixed or bare DOIs;
        - for other http(s) URLs, the final URL after following redirects
          with a HEAD request, or the identifier itself if that request fails;
        - the stripped identifier for any other input.
    """
    identifier = identifier.strip()
    lowered = identifier.lower()

    # Already normalized
    if lowered.startswith(("https://doi.org/", "http://doi.org/")):
        return identifier

    # DOI prefix. Handled before the bare-DOI regex so the "doi:" prefix is
    # always stripped, whatever the regex happens to accept.
    if lowered.startswith("doi:"):
        doi = identifier.split(":", 1)[1].strip()
        return f"https://doi.org/{doi}"

    # Raw DOI
    if re.match(constants.DOI_SCHEMA_REGEX, identifier, re.IGNORECASE):
        return f"https://doi.org/{identifier}"

    # Other URL: resolve redirects. The timeout prevents an unresponsive host
    # from blocking the whole assessment.
    if lowered.startswith(("http://", "https://")):
        try:
            resp = requests.head(identifier, allow_redirects=True, timeout=10)
            return resp.url
        except requests.RequestException:
            return identifier

    # Fallback
    return identifier


def landing_page_links_back(lp_html, repo_url):
    """Tell whether a landing page's HTML mentions the repository URL.

    The repository URL (trailing '/' removed, lower-cased) is searched as a
    substring, first in the ``href`` of every ``<a>`` tag and then in the
    ``content`` attribute of every ``<meta>`` tag.

    Args:
        lp_html: HTML text of the landing page; a falsy value yields False.
        repo_url: URL of the software repository.

    Returns:
        True if any anchor or meta tag contains the repository URL,
        False otherwise.
    """
    if not lp_html:
        return False

    target = repo_url.rstrip("/").lower()
    document = BeautifulSoup(lp_html, "html.parser")

    # Anchors first; meta tags are only inspected when no anchor matches.
    if any(
        target in anchor["href"].rstrip("/").lower()
        for anchor in document.find_all("a", href=True)
    ):
        return True

    return any(
        target in (meta.get("content") or "").lower()
        for meta in document.find_all("meta")
    )