oeg-upm · andriumon · Sep 8, 2025 · Sep 1, 2025 · Sep 3, 2025 · Sep 8, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,30 @@
+FROM python:3.12-slim
+
+# pipefail makes the piped Poetry installer step fail when the download fails.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# git is used to fetch the sources below, curl to fetch the Poetry installer.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends ca-certificates curl git && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -fsSL https://install.python-poetry.org | python3 -
+
+ENV PATH="/root/.local/bin:$PATH"
+
+WORKDIR /rsfc
+
+# The image is built from the upstream repository, not from the local build context.
+RUN git clone --depth 1 https://github.com/oeg-upm/rsfc.git .
+
+RUN poetry install --no-root
+
+RUN poetry run pip install --no-cache-dir .
+
+RUN poetry run python -m nltk.downloader wordnet
+
+RUN poetry run somef configure -a
+
+RUN mkdir -p /rsfc/outputs
+
+ENTRYPOINT ["poetry", "run", "rsfc"]
diff --git a/README.md b/README.md
@@ -99,6 +99,30 @@ Without poetry virtual environment activated you need to use the poetry run:
 poetry run rsfc --help
 ```
 
+## Docker installation
+
+If preferred, RSFC can be run with Docker.
+
+Once you have cloned the repository, go to the project's root directory and run the following command to build the image:
+
+```
+docker build -t rsfc-docker .
+```
+
+After that, it is necessary to create the directory in which the output assessment will be saved. You can do it by running the following command:
+
+```
+mkdir ./outputs
+```
+
+Finally, run the following command to run the container:
+
+```
+docker run --rm -v $(pwd)/outputs:/rsfc/outputs rsfc-docker <repo_url>
+```
+
+where `<repo_url>` is the URL of the repository to be analyzed. This argument is required.
+
 ## Usage
 
 After installation, you can use the package by running if you activated the poetry env

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "rsfc"
-version = "0.0.2"
+version = "0.0.3"
 description = "EVERSE Research Software Fairness Checks"
 authors = ["Andres Montero <[email protected]>"]
 license = "MIT"

diff --git a/src/rsfc/__init__.py b/src/rsfc/__init__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
 
 
-__version__ = "0.0.2"
+__version__ = "0.0.3"
 
diff --git a/src/rsfc/harvesters/cff_harvester.py b/src/rsfc/harvesters/cff_harvester.py
@@ -9,17 +9,15 @@ def __init__(self, sw):
 
 
     def get_cff_file(self, sw):
-        req_url = sw.base_url + '/contents/CITATION.cff'
 
         try:
             if sw.repo_type == "GITHUB":
-                req_url = sw.base_url + '/contents/codemeta.json'
+                req_url = sw.base_url + '/contents/CITATION.cff'
                 headers = {'Accept': 'application/vnd.github.v3.raw'}
                 params = {'ref': sw.repo_branch}
-
                 response = requests.get(req_url, headers=headers, params=params)
                 response.raise_for_status()
-                return response.json()
+                return yaml.safe_load(response.text)
             elif sw.repo_type == "GITLAB":
                 project_path_encoded = sw.base_url.split("/projects/")[-1]
                 branch = sw.repo_branch or "main"
@@ -55,7 +53,7 @@ def harvest_cff(self, cff):
             if "version" in cff:
                 cff_info["version"] = cff["version"]
 
-            if "idenfiers" in cff:
+            if "identifiers" in cff:
                 cff_info["identifiers"] = cff["identifiers"]
 
             if "preferred-citation" in cff:

diff --git a/src/rsfc/harvesters/somef_harvester.py b/src/rsfc/harvesters/somef_harvester.py
@@ -2,7 +2,7 @@
 import contextlib
 import json
 from somef import somef_cli
-import os
+#import os
 
 class SomefHarvester:
 

diff --git a/src/rsfc/model/assessedSoftware.py b/src/rsfc/model/assessedSoftware.py
@@ -12,7 +12,7 @@ def __init__(self, repo_url):
         self.name = self.get_soft_name()
         self.version = self.get_soft_version()
         self.id = None
-        self.repo_branch = rsfc_helpers.get_gitlab_default_branch(self.base_url, self.repo_type)
+        self.repo_branch = rsfc_helpers.get_repo_default_branch(self.base_url)
 
 
     def get_repo_base_url(self):

diff --git a/src/rsfc/model/indicator.py b/src/rsfc/model/indicator.py
@@ -10,7 +10,7 @@ def __init__(self, sw, somef, cd, cf):
                 (rt.test_id_associated_with_software, [somef.somef_data, cd.codemeta_data, cf.cff_data]),
                 (rt.test_id_common_schema, [somef.somef_data]),
                 (rt.test_identifier_in_readme_citation, [somef.somef_data, cf.cff_data]),
-                (rt.test_identifier_resolves_to_software, [somef.somef_data])
+                (rt.test_identifier_resolves_to_software, [somef.somef_data, cd.codemeta_data, cf.cff_data, sw])
             ],
             "requirements_specified": [
                 (rt.test_dependencies_declared, [somef.somef_data]),
@@ -43,7 +43,8 @@ def __init__(self, sw, somef, cd, cf):
             "software_has_license": [
                 (rt.test_has_license, [somef.somef_data]),
                 (rt.test_license_spdx_compliant, [somef.somef_data]),
-                (rt.test_license_info_in_metadata_files, [somef.somef_data, cd.codemeta_data, cf.cff_data])
+                (rt.test_license_info_in_metadata_files, [somef.somef_data, cd.codemeta_data, cf.cff_data]),
+                (rt.test_license_information_provided, [somef.somef_data])
             ],
             "descriptive_metadata": [
                 (rt.test_authors, [somef.somef_data, cd.codemeta_data, cf.cff_data]),

diff --git a/src/rsfc/rsfc_tests/rsfc_tests.py b/src/rsfc/rsfc_tests/rsfc_tests.py
@@ -562,7 +562,7 @@ def test_identifier_in_readme_citation(somef_data, cff_data):
         readme = True
 
     if cff_data != None:
-        if "identifiers" in cff_data:
+        if cff_data["identifiers"] != None:
             citation = True
 
     if readme and not citation:
@@ -584,29 +584,43 @@ def test_identifier_in_readme_citation(somef_data, cff_data):
     return check.convert()
 
 
-
-def test_identifier_resolves_to_software(somef_data): #CAMBIAR. Este debe coger el id que este en el readme, codemeta o cff y mirar que resuelva a la url del software
+def test_identifier_resolves_to_software(somef_data, codemeta_data, cff_data, sw): #CAMBIAR. Este debe coger el id que este en el readme, codemeta o cff y mirar que resuelva a la url del software
 
     output = "false"
-
+    evidence = constants.EVIDENCE_NO_IDENTIFIER_FOUND
+    identifier = None
+    pause = False
+
     if 'identifier' in somef_data:
         for item in somef_data['identifier']:
             if item['source']:
                 if 'README' in item['source']:
-                    id = item['result']['value']
+                    identifier = item['result']['value']
+                    pause = True
+                    break
 
-                    response = requests.head(id, allow_redirects=True, timeout=5)
-                    if response.status_code == 200:
-                        output = "true"
-                        evidence = constants.EVIDENCE_ID_RESOLVES
-                    else:
-                        evidence = constants.EVIDENCE_NO_RESOLVE_DOI_IDENTIFIER
-                        break
-                else:
-                    evidence = constants.EVIDENCE_NO_DOCUMENTATION_README
-    else:
-        output = "false"
-        evidence = constants.EVIDENCE_NO_IDENTIFIER_FOUND
+    if not pause and codemeta_data != None and codemeta_data['identifier']:
+        identifier = codemeta_data['identifier']
+
+    if not pause and cff_data != None and cff_data['identifiers'] != None:
+        identifier = cff_data['identifiers'][0]['value']
+
+    if identifier:
+        doi_url = rsfc_helpers.normalize_identifier_url(identifier)
+        try:
+            resp = requests.get(doi_url, allow_redirects=True, timeout=10)
+            html = resp.text
+
+            if rsfc_helpers.landing_page_links_back(html, sw.url):
+                output = "true"
+                evidence = constants.EVIDENCE_DOI_LINKS_BACK_TO_REPO
+            else:
+                output = "false"
+                evidence = constants.EVIDENCE_DOI_NO_LINK_BACK_TO_REPO
+
+        except requests.RequestException:
+            output = "false"
+            evidence = constants.EVIDENCE_NO_RESOLVE_DOI_IDENTIFIER
 
 
     check = ch.Check(constants.INDICATORS_DICT['persistent_and_unique_identifier'], 'RSFC-07-2', constants.PROCESS_ID_RESOLVES_TO_SOFTWARE, output, evidence)
@@ -890,6 +904,26 @@ def test_license_spdx_compliant(somef_data):
 
     return check.convert()
 
+
+def test_license_information_provided(somef_data):
+    """RSFC-15-3: check whether any license entry in the SOMEF data comes from a README."""
+    output = "false"
+
+    if 'license' not in somef_data:
+        evidence = constants.EVIDENCE_NO_LICENSE
+    else:
+        evidence = constants.EVIDENCE_NO_LICENSE_INFORMATION_PROVIDED
+        for item in somef_data['license']:
+            # 'source' can be absent or empty; guard before the substring test
+            if 'README' in (item.get('source') or ''):
+                output = "true"
+                evidence = constants.EVIDENCE_LICENSE_INFORMATION_PROVIDED
+                break
+
+    check = ch.Check(constants.INDICATORS_DICT['software_has_license'], 'RSFC-15-3', constants.PROCESS_LICENSE_INFORMATION_PROVIDED, output, evidence)
+
+    return check.convert()
+
 ################################################### FRSM_16 ###################################################
 
 def test_license_info_in_metadata_files(somef_data, codemeta_data, cff_data):

diff --git a/src/rsfc/utils/constants.py b/src/rsfc/utils/constants.py
@@ -73,14 +73,15 @@
 PROCESS_IS_GITHUB_OR_GITLAB_REPOSITORY = 'Checks if the URL provided is indeed a Github or Gitlab repository'
 PROCESS_ZENODO_SOFTWARE_HERITAGE = 'Searches for Zenodo and Software Heritage badges in the README file of the repository'
 PROCESS_IDENTIFIER_IN_README_CITATION = 'Searches for an identifier in the README or CITATION.cff files of the repository'
-PROCESS_ID_RESOLVES_TO_SOFTWARE = 'Checks if the identifier found in the README file of the repository resolves to the software'
+PROCESS_ID_RESOLVES_TO_SOFTWARE = 'Checks if the identifier found in the README file or metadata files (i.e. codemeta.json, CITATION.cff) resolves to a page that links back to the software repository'
 PROCESS_AUTHORS = 'Searches for authors in various files of the repository (i.e. CITATION.cff, AUTHORS.md, codemeta.json)'
 PROCESS_CONTRIBUTORS = "Searches for contributors in various files of the repository (i.e. codemeta.json, pyproject.toml, pom.xml)'"
 PROCESS_AUTHOR_ORCIDS = 'Checks if all authors stated in the CITATION.cff file have an ORCID assigned'
 PROCESS_AUTHOR_ROLES = 'Checks if all authors stated in a codemeta.json file have a role assigned '
 PROCESS_VERSION_IN_METADATA = 'Checks if a version number for the software is indicated in the CITATION.cff, codemeta.json or package files(i.e. pyproject.toml, pom.xml, etc.)'
 PROCESS_COMMITS_LINKED_TO_ISSUES = 'Checks if there is at least one of the existing issues (opened or closed) referenced in any of the commits made in the default branch of the repository'
 PROCESS_COMMITS_HISTORY = 'Checks if the software repository has a commits history'
+PROCESS_LICENSE_INFORMATION_PROVIDED = 'Checks if license information is found in the README file of the repository'
 
 
 #Evidences
@@ -108,6 +109,7 @@
 EVIDENCE_CONTACT_INFO = 'Contact and support information was found in the repository'
 EVIDENCE_SPDX_COMPLIANT = 'Licenses are SPDX compliant'
 EVIDENCE_LICENSE_INFO_IN_METADATA = 'License information was found in metadata files'
+EVIDENCE_LICENSE_INFORMATION_PROVIDED = 'License information was found in the README file of the repository'
 EVIDENCE_TICKETS = 'Tickets/Issues were found in the repository'
 EVIDENCE_REPO_ENABLED_AND_HAS_COMMITS = 'Repository is enabled and has commits'
 EVIDENCE_AUTHOR_ORCIDS_CODEMETA = 'All authors in the codemeta.json file have an orcid identifier'
@@ -139,6 +141,7 @@
 EVIDENCE_VERSION_IN_METADATA = 'Found the software version in one of the specified files'
 EVIDENCE_CONTRIBUTORS = 'Found contributors metadata in the codemeta or package files'
 EVIDENCE_COMMITS_LINKED_TO_ISSUES = 'There is at least one commit linked to an issue'
+EVIDENCE_DOI_LINKS_BACK_TO_REPO = "The landing page of the software's identifier links back to the software repository"
 
 
 EVIDENCE_NO_LICENSE = 'Could not find any license in the repository'
@@ -165,6 +168,7 @@
 EVIDENCE_NO_CONTACT_INFO = 'Could not find any of the following information: '
 EVIDENCE_NO_SPDX_COMPLIANT = 'There is one or more licenses that are not SPDX compliant'
 EVIDENCE_NO_LICENSE_INFO_IN_METADATA = 'Could not find any licensing information in the following metadata files: '
+EVIDENCE_NO_LICENSE_INFORMATION_PROVIDED = 'Could not find license information in the README file of the repository'
 EVIDENCE_NO_TICKETS = 'Could not find tickets/issues in the repository'
 EVIDENCE_NO_REPO_ENABLED = 'Repository is not enabled'
 EVIDENCE_NO_COMMITS = 'Could not find any commits in the repository'
@@ -196,6 +200,7 @@
 EVIDENCE_NO_VERSION_IN_METADATA = 'Could not find a version number for the software in any of the specified files'
 EVIDENCE_NOT_ENOUGH_ISSUES_COMMITS_INFO = 'Could not get the necessary information to perform the test, it being the commits record or repository issues'
 EVIDENCE_NO_COMMITS_LINKED_TO_ISSUES = 'There is not any commits linked to any issues in the repository'
+EVIDENCE_DOI_NO_LINK_BACK_TO_REPO = "The landing page of the software's identifier does not link back to the software repository"
 
 
 #Dictionaries

diff --git a/src/rsfc/utils/rsfc_helpers.py b/src/rsfc/utils/rsfc_helpers.py
@@ -1,16 +1,16 @@
 from datetime import datetime
 import regex as re
 import base64
+from bs4 import BeautifulSoup
 import requests
 from rsfc.utils import constants
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-def get_gitlab_default_branch(base_url, repo_type):
-    if repo_type == "GITLAB":
-        res = requests.get(base_url)
-        res.raise_for_status()
-        data = res.json()
-        return data.get("default_branch", "main")
+def get_repo_default_branch(base_url):
+    """Return the "default_branch" value from the JSON served at base_url ("main" if absent)."""
+    res = requests.get(base_url, timeout=10)  # bounded wait instead of blocking indefinitely
+    res.raise_for_status()
+    return res.json().get("default_branch", "main")
 
 def decode_github_content(content_json):
     encoded_content = content_json.get('content', '')
@@ -93,13 +93,13 @@ def subtest_author_orcids(file_data):
     return True
 
 
-
 def build_url_pattern(url):
     base_url = url.rsplit('/', 1)[0]
     escaped = re.escape(base_url)
     pattern_str = f"^{escaped}/\\d+$"
     return re.compile(pattern_str)
 
+
 def get_latest_release(repo_data):
     if 'releases' in repo_data:
         latest_release = None
@@ -155,4 +155,56 @@ def cross_check_any_issue(issues, commits, max_workers=8):
             if future.result():
                 executor.shutdown(cancel_futures=True)
                 return True
-    return False
+    return False
+
+
+def normalize_identifier_url(identifier):
+    """Return a requestable URL for a DOI or URL identifier; anything else comes back stripped."""
+    identifier = identifier.strip()
+    lower = identifier.lower()
+    # Already a doi.org URL
+    if lower.startswith(("https://doi.org/", "http://doi.org/")):
+        return identifier
+
+    # Raw DOI (as matched by constants.DOI_SCHEMA_REGEX)
+    if re.match(constants.DOI_SCHEMA_REGEX, identifier, re.IGNORECASE):
+        return f"https://doi.org/{identifier}"
+
+    # "doi:" prefix
+    if lower.startswith("doi:"):
+        doi = identifier.split(":", 1)[1].strip()
+        return f"https://doi.org/{doi}"
+
+    # Any other URL: follow redirects and hand back the final location
+    if lower.startswith(("http://", "https://")):
+        try:
+            # Bounded wait: without a timeout an unresponsive host would block forever
+            resp = requests.head(identifier, allow_redirects=True, timeout=10)
+            return resp.url
+        except requests.RequestException:
+            return identifier
+
+    # Fallback: neither a DOI nor a URL, returned as is (stripped)
+    return identifier
+
+
+def landing_page_links_back(lp_html, repo_url):
+    """Tell whether the landing page HTML references repo_url in an <a> href or a <meta> content."""
+    if not lp_html:
+        return False
+
+    target = repo_url.rstrip("/").lower()
+    page = BeautifulSoup(lp_html, "html.parser")
+
+    # Anchors: compare case-insensitively, ignoring a trailing slash on the href
+    hrefs = (link["href"].rstrip("/").lower() for link in page.find_all("a", href=True))
+    if any(target in href for href in hrefs):
+        return True
+
+    # Meta tags: a missing content attribute counts as an empty string
+    contents = ((tag.get("content") or "").lower() for tag in page.find_all("meta"))
+    if any(target in text for text in contents):
+        return True
+    return False
+
+