Skip to content

Commit 2243f51

Browse files
committed
Add support for external licenses in scans #480
This adds `-dir` or `--additional-directories` as a command line option in license detection. This allows users to specify paths to directories of licenses and rules they'd like to use during license detection, but would not like to add to the ScanCode database of licenses. This involves adding a new option in `licensedcode/plugin_license.py`, and this option is used as a parameter in `scancode/api.py`. In this approach, the licenses and rules contained in these additional directories are combined with the existing licenses and rules in the ScanCode database to produce a single index. The code for this is found in `licensedcode/cache.py` and the helper methods for loading these licenses and rules are found in `licensedcode/models.py`. This commit also includes a unit test to verify that license detection succeeds with an additional directory found in `tests/licensedcode/test_plugin_license.py`. Part of the setup for the unit test and future tests involves creating a new directory in `tests/licensedcode/data` that contains sample external licenses used in the unit tests. Signed-off-by: Kevin Ji <[email protected]>
1 parent 7394e79 commit 2243f51

File tree

17 files changed

+445
-30
lines changed

17 files changed

+445
-30
lines changed

src/licensedcode/cache.py

Lines changed: 74 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
LICENSE_INDEX_FILENAME = 'index_cache'
3636
LICENSE_LOCKFILE_NAME = 'scancode_license_index_lockfile'
3737
LICENSE_CHECKSUM_FILE = 'scancode_license_index_tree_checksums'
38+
CACHED_DIRECTORIES_FILENAME = 'cached_directories'
3839

3940

4041
@attr.s(slots=True)
@@ -58,6 +59,7 @@ def load_or_build(
5859
timeout=LICENSE_INDEX_LOCK_TIMEOUT,
5960
licenses_data_dir=None,
6061
rules_data_dir=None,
62+
additional_directories=None,
6163
):
6264
"""
6365
Load or build and save and return a LicenseCache object.
@@ -66,7 +68,8 @@ def load_or_build(
6668
On the side, we load cached or build license db, SPDX symbols and other
6769
license-related data structures.
6870
69-
- If the cache exists, it is returned unless corrupted or ``force`` is True.
71+
- If the cache exists, it is returned unless corrupted, ``force`` is True, or if we pass in additional
72+
directories containing licenses that are not present in the existing cache.
7073
- If the cache does not exist, a new index is built and cached.
7174
- If ``index_all_languages`` is True, include texts in all languages when
7275
building the license index. Otherwise, only include the English license \
@@ -75,11 +78,30 @@ def load_or_build(
7578
idx_cache_dir = os.path.join(licensedcode_cache_dir, LICENSE_INDEX_DIR)
7679
create_dir(idx_cache_dir)
7780
cache_file = os.path.join(idx_cache_dir, LICENSE_INDEX_FILENAME)
81+
cached_directories_file = os.path.join(idx_cache_dir, CACHED_DIRECTORIES_FILENAME)
7882

7983
has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file)
84+
has_cached_directories = os.path.exists(cached_directories_file) and os.path.getsize(cached_directories_file)
85+
should_rebuild_cache = False
86+
87+
if has_cached_directories:
88+
# if we have cached additional directories of licenses, check if those licenses are equal to the additional
89+
# directories passed in
90+
with open(cached_directories_file, 'rb') as file:
91+
cached_additional_directories = pickle.load(file)
92+
93+
# we need to rebuild the cache if the list of additional directories we passed in is different than
94+
# the set of additional directories currently included in the index cache
95+
should_rebuild_cache = additional_directories is not None \
96+
and sorted(additional_directories) != sorted(cached_additional_directories)
97+
else:
98+
# otherwise, we don't have a file of cached directories. If there are additional directories passed in,
99+
# we know we need to make a new cache file.
100+
if additional_directories:
101+
should_rebuild_cache = True
80102

81103
# bypass build if cache exists
82-
if has_cache and not force:
104+
if has_cache and not force and not should_rebuild_cache:
83105
try:
84106
return load_cache_file(cache_file)
85107
except Exception as e:
@@ -92,6 +114,8 @@ def load_or_build(
92114
from licensedcode.models import licenses_data_dir as ldd
93115
from licensedcode.models import rules_data_dir as rdd
94116
from licensedcode.models import load_licenses
117+
from licensedcode.models import load_licenses_from_multiple_dirs
118+
from licensedcode.models import get_license_dirs
95119
from scancode import lockfile
96120

97121
licenses_data_dir = licenses_data_dir or ldd
@@ -106,13 +130,21 @@ def load_or_build(
106130
# Here, the cache is either stale or non-existing: we need to
107131
# rebuild all cached data (e.g. mostly the index) and cache it
108132

109-
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
133+
if additional_directories:
134+
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
135+
combined_directories = [licenses_data_dir] + additional_license_dirs
136+
licenses_db = load_licenses_from_multiple_dirs(license_directories=combined_directories)
137+
else:
138+
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
110139

140+
# create a single merged index containing license data from licenses_data_dir
141+
# and data from additional directories
111142
index = build_index(
112143
licenses_db=licenses_db,
113144
licenses_data_dir=licenses_data_dir,
114145
rules_data_dir=rules_data_dir,
115146
index_all_languages=index_all_languages,
147+
additional_directories=additional_directories,
116148
)
117149

118150
spdx_symbols = build_spdx_symbols(licenses_db=licenses_db)
@@ -131,6 +163,10 @@ def load_or_build(
131163
with open(cache_file, 'wb') as fn:
132164
pickle.dump(license_cache, fn, protocol=PICKLE_PROTOCOL)
133165

166+
# save the list of additional directories included in the cache
167+
with open(cached_directories_file, 'wb') as file:
168+
pickle.dump(additional_directories, file, protocol=PICKLE_PROTOCOL)
169+
134170
return license_cache
135171

136172
except lockfile.LockTimeout:
@@ -143,27 +179,50 @@ def build_index(
143179
licenses_data_dir=None,
144180
rules_data_dir=None,
145181
index_all_languages=False,
182+
additional_directories=None,
146183
):
147184
"""
148185
Return an index built from rules and licenses directories
149186
150187
If ``index_all_languages`` is True, include texts and rules in all languages.
151188
Otherwise, only include the English license texts and rules (the default)
189+
If ``additional_directories`` is not None, we will include licenses and rules
190+
from these additional directories in the returned index.
152191
"""
153192
from licensedcode.index import LicenseIndex
193+
from licensedcode.models import get_license_dirs
194+
from licensedcode.models import get_rule_dirs
154195
from licensedcode.models import get_rules
196+
from licensedcode.models import get_rules_from_multiple_dirs
155197
from licensedcode.models import get_all_spdx_key_tokens
156198
from licensedcode.models import get_license_tokens
157199
from licensedcode.models import licenses_data_dir as ldd
158200
from licensedcode.models import rules_data_dir as rdd
159201
from licensedcode.models import load_licenses
202+
from licensedcode.models import load_licenses_from_multiple_dirs
160203
from licensedcode.legalese import common_license_words
161204

162205
licenses_data_dir = licenses_data_dir or ldd
163206
rules_data_dir = rules_data_dir or rdd
164207

165-
licenses_db = licenses_db or load_licenses(licenses_data_dir=licenses_data_dir)
166-
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
208+
if not licenses_db:
209+
if additional_directories:
210+
# combine the licenses in these additional directories with the licenses in the original DB
211+
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
212+
combined_license_directories = [licenses_data_dir] + additional_license_dirs
213+
# generate a single combined license db with all licenses
214+
licenses_db = load_licenses_from_multiple_dirs(license_dirs=combined_license_directories)
215+
else:
216+
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
217+
218+
if additional_directories:
219+
# if we have additional directories, extract the rules from them
220+
additional_rule_dirs = get_rule_dirs(additional_dirs=additional_directories)
221+
# then combine the rules in these additional directories with the rules in the original rules directory
222+
combined_rule_directories = [rules_data_dir] + additional_rule_dirs
223+
rules = get_rules_from_multiple_dirs(licenses_db=licenses_db, rule_directories=combined_rule_directories)
224+
else:
225+
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
167226

168227
legalese = common_license_words
169228
spdx_tokens = set(get_all_spdx_key_tokens(licenses_db))
@@ -299,20 +358,20 @@ def build_unknown_spdx_symbol(licenses_db=None):
299358
return LicenseSymbolLike(licenses_db['unknown-spdx'])
300359

301360

302-
def get_cache(force=False, index_all_languages=False):
361+
def get_cache(force=False, index_all_languages=False, additional_directories=None):
303362
"""
304363
Return a LicenseCache either rebuilt, cached or loaded from disk.
305364
306365
If ``index_all_languages`` is True, include texts in all languages when
307366
building the license index. Otherwise, only include the English license \
308367
texts and rules (the default)
309368
"""
310-
populate_cache(force=force, index_all_languages=index_all_languages)
369+
populate_cache(force=force, index_all_languages=index_all_languages, additional_directories=additional_directories)
311370
global _LICENSE_CACHE
312371
return _LICENSE_CACHE
313372

314373

315-
def populate_cache(force=False, index_all_languages=False):
374+
def populate_cache(force=False, index_all_languages=False, additional_directories=None):
316375
"""
317376
Load or build and cache a LicenseCache. Return None.
318377
"""
@@ -325,6 +384,7 @@ def populate_cache(force=False, index_all_languages=False):
325384
index_all_languages=index_all_languages,
326385
# used for testing only
327386
timeout=LICENSE_INDEX_LOCK_TIMEOUT,
387+
additional_directories=additional_directories,
328388
)
329389

330390

@@ -346,11 +406,15 @@ def load_cache_file(cache_file):
346406
raise Exception(msg) from e
347407

348408

349-
def get_index(force=False, index_all_languages=False):
409+
def get_index(force=False, index_all_languages=False, additional_directories=None):
350410
"""
351411
Return and eventually build and cache a LicenseIndex.
352412
"""
353-
return get_cache(force=force, index_all_languages=index_all_languages).index
413+
return get_cache(
414+
force=force,
415+
index_all_languages=index_all_languages,
416+
additional_directories=additional_directories
417+
).index
354418

355419

356420
get_cached_index = get_index

src/licensedcode/models.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from os.path import dirname
2121
from os.path import exists
2222
from os.path import join
23+
from pathlib import Path
2324

2425
import attr
2526
import saneyaml
@@ -772,6 +773,68 @@ def get_rules(
772773
return chain(licenses_as_rules, rules)
773774

774775

776+
def get_license_dirs(
    additional_dirs,
):
    """
    Return a list of absolute paths to the ``licenses`` subdirectory of each
    directory in the ``additional_dirs`` iterable of directory paths, as
    specified during license detection.

    Return an empty list if ``additional_dirs`` is empty or None. The
    existence of the returned subdirectories is not checked here.
    """
    # Convert to an absolute path in case the user passes in a relative path,
    # which messes up building rules from licenses.
    # The subdirectory is joined with pathlib rather than a hard-coded "/" so
    # that the native path separator is used on every platform.
    return [
        str(Path(path).absolute() / "licenses")
        for path in additional_dirs or ()
    ]
785+
786+
787+
def get_rule_dirs(
    additional_dirs,
):
    """
    Return a list of absolute paths to the ``rules`` subdirectory of each
    directory in the ``additional_dirs`` iterable of directory paths, as
    specified during license detection.

    Return an empty list if ``additional_dirs`` is empty or None. The
    existence of the returned subdirectories is not checked here.
    """
    # Paths are made absolute so that a relative path passed by the user
    # resolves against the current working directory exactly once, here.
    # The subdirectory is joined with pathlib rather than a hard-coded "/" so
    # that the native path separator is used on every platform.
    return [
        str(Path(path).absolute() / "rules")
        for path in additional_dirs or ()
    ]
795+
796+
797+
def load_licenses_from_multiple_dirs(
    license_directories,
    with_deprecated=False,
):
    """
    Return a single mapping combining the licenses loaded from each directory
    in the ``license_directories`` iterable of license directory paths.

    Directories are loaded in order with ``load_licenses``. When the same key
    is present in more than one directory, the entry from the later directory
    replaces the earlier one.

    ``with_deprecated`` is passed through unchanged to ``load_licenses`` for
    every directory.

    Return an empty mapping if ``license_directories`` is empty.
    """
    combined_licenses = {}
    for license_dir in license_directories:
        licenses = load_licenses(
            licenses_data_dir=license_dir,
            # honor the caller's choice instead of always forcing False
            with_deprecated=with_deprecated,
        )
        # merge in place: later directories override earlier ones
        combined_licenses.update(licenses)
    return combined_licenses
811+
812+
813+
def get_rules_from_multiple_dirs(
    licenses_db,
    rule_directories,
):
    """
    Return an iterable of rules built from the ``licenses_db`` license
    database (described by the caller as a mapping of key -> License object)
    and from the rule files found in each directory of ``rule_directories``.

    The rules loaded from all directories are gathered in a single list and
    validated against ``licenses_db``. The returned iterable yields the rules
    built from the licenses first, then the loaded rules.

    If ``rule_directories`` is empty or None, defer to ``get_rules`` instead.
    """
    if not rule_directories:
        # NOTE(review): ``rules_data_dir`` is not a parameter of this function;
        # presumably it resolves to the module-level default rules directory.
        return get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)

    # load every directory eagerly, keeping the order of ``rule_directories``
    all_rules = []
    for directory in rule_directories:
        all_rules.extend(load_rules(rules_data_dir=directory))

    validate_rules(all_rules, licenses_db)
    return chain(build_rules_from_licenses(licenses_db), all_rules)
836+
837+
775838
class InvalidRule(Exception):
776839
pass
777840

src/licensedcode/plugin_license.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from commoncode.resource import clean_path
1919
from plugincode.scan import ScanPlugin
2020
from plugincode.scan import scan_impl
21+
import click
2122

2223
from scancode.api import SCANCODE_LICENSEDB_URL
2324

@@ -139,6 +140,15 @@ class LicenseScanner(ScanPlugin):
139140
help_group=SCAN_OPTIONS_GROUP,
140141
),
141142

143+
PluggableCommandLineOption(
144+
('-dir', '--additional_directories'),
145+
required_options=['license'],
146+
multiple=True,
147+
type=click.Path(exists=True, readable=True, path_type=str),
148+
help='Include additional directories for license detection.',
149+
help_group=SCAN_OPTIONS_GROUP,
150+
),
151+
142152
PluggableCommandLineOption(
143153
('--reindex-licenses',),
144154
is_flag=True, is_eager=True,
@@ -167,7 +177,8 @@ def setup(self, **kwargs):
167177
loaded index.
168178
"""
169179
from licensedcode.cache import populate_cache
170-
populate_cache()
180+
additional_directories = kwargs.get('additional_directories')
181+
populate_cache(additional_directories=additional_directories)
171182

172183
def get_scanner(
173184
self,
@@ -176,6 +187,7 @@ def get_scanner(
176187
license_text_diagnostics=False,
177188
license_url_template=SCANCODE_LICENSEDB_URL,
178189
unknown_licenses=False,
190+
additional_directories=None,
179191
**kwargs
180192
):
181193

@@ -186,6 +198,7 @@ def get_scanner(
186198
license_text_diagnostics=license_text_diagnostics,
187199
license_url_template=license_url_template,
188200
unknown_licenses=unknown_licenses,
201+
additional_directories=additional_directories,
189202
)
190203

191204
def process_codebase(self, codebase, unknown_licenses, **kwargs):

src/scancode/api.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ def get_licenses(
142142
license_url_template=SCANCODE_LICENSEDB_URL,
143143
unknown_licenses=False,
144144
deadline=sys.maxsize,
145+
additional_directories=None,
145146
**kwargs,
146147
):
147148
"""
@@ -168,7 +169,7 @@ def get_licenses(
168169
from licensedcode import cache
169170
from licensedcode.spans import Span
170171

171-
idx = cache.get_index()
172+
idx = cache.get_index(additional_directories=additional_directories)
172173

173174
detected_licenses = []
174175
detected_expressions = []
@@ -252,6 +253,7 @@ def _licenses_data_from_match(
252253
result['homepage_url'] = lic.homepage_url
253254
result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
254255
result['reference_url'] = license_url_template.format(lic.key)
256+
# TODO: change this in the case of a private license?
255257
result['scancode_text_url'] = SCANCODE_LICENSE_TEXT_URL.format(lic.key)
256258
result['scancode_data_url'] = SCANCODE_LICENSE_DATA_URL.format(lic.key)
257259

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The quick brown fox jumps over the lazy dog.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
key: example1
2+
short_name: Example External License 1
3+
name: Example External License 1
4+
category: Permissive
5+
owner: NexB
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
The quick brown fox jumps over the lazy dog.
2+
The quick brown fox jumps over the lazy dog.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
license_expression: example1
2+
is_license_text: yes
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit,
2+
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
3+
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi
4+
ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit
5+
in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
6+
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia
7+
deserunt mollit anim id est laborum.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
key: example2
2+
short_name: Example External License 2
3+
name: Example External License 2
4+
category: Permissive
5+
owner: NexB

0 commit comments

Comments
 (0)