Skip to content

Commit f861310

Browse files
Add --licenses-reference option
Add new command line option to not inline license, licenseDB and license detection level information. Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent cb4ac9e commit f861310

File tree

9 files changed

+1309
-135
lines changed

9 files changed

+1309
-135
lines changed

src/licensedcode/detection.py

Lines changed: 74 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
import sys
1011
import os
1112
import logging
1213
from enum import Enum
@@ -95,6 +96,18 @@ class CombinationReason(Enum):
9596
UNVERSIONED_FOLLOWED_BY_VERSIONED = 'un-versioned-followed-by-versioned'
9697

9798

99+
@attr.s
class FileRegion:
    """
    Locate one region of a file that contains license information.

    A file has one or more such regions, separated from each other by code,
    text or other content. Each region is identified by the path of the file
    together with a start line and an end line.
    """
    # Path of the file that holds the region.
    path = attr.ib(type=str)
    # Line where the region starts.
    start_line = attr.ib(type=int)
    # Line where the region ends.
    end_line = attr.ib(type=int)
109+
110+
98111
@attr.s(slots=True, eq=False, order=False)
99112
class LicenseDetection:
100113
"""
@@ -135,6 +148,14 @@ class LicenseDetection:
135148
)
136149
)
137150

151+
# Only used in unique detection calculation and referencing
152+
file_region = attr.ib(
153+
default=attr.Factory(dict),
154+
metadata=dict(
155+
help='File path and start end lines to locate the detection.'
156+
)
157+
)
158+
138159
@classmethod
139160
def from_matches(cls, matches):
140161
"""
@@ -162,6 +183,19 @@ def from_matches(cls, matches):
162183
spdx_license_expression=str(spdx_license_expression),
163184
combination_reasons=reasons,
164185
)
186+
187+
@classmethod
def from_mapping(cls, detection):
    """
    Return a LicenseDetection built from the `detection` mapping.

    The `matches`, `license_expression`, `spdx_license_expression` and
    `combination_reasons` values are read from the mapping and passed
    unchanged to the constructor as keyword arguments. A missing key raises
    a KeyError.
    """
    field_names = (
        'matches',
        'license_expression',
        'spdx_license_expression',
        'combination_reasons',
    )
    return cls(**{name: detection[name] for name in field_names})
165199

166200
def __eq__(self, other):
167201
return (
@@ -181,36 +215,65 @@ def query(self):
181215
def qspans(self):
182216
return [match.qspan for match in self.matches]
183217

218+
def get_file_region(self, path):
    """
    Return a FileRegion locating this license detection in the file at
    `path`.

    The start and end lines of the region are the ones returned by
    `get_start_end_line()`.
    """
    first_line, last_line = self.get_start_end_line()
    return FileRegion(path=path, start_line=first_line, end_line=last_line)
229+
184230
@property
def identifier(self):
    """
    Return an integer identifier for this license detection, based on its
    underlying license matches.

    The identifier is computed from the `licensedb_identifier` and
    `match_coverage` of every match, in order. It is a non-negative integer
    smaller than 2**64.

    This is not guaranteed to be unique for a detection, as certain small and
    unknown matches could have the same value for this identifier, but will
    be unique for most other cases.
    """
    # Local import: only this identifier computation needs hashlib.
    import hashlib

    data = tuple(
        (match['licensedb_identifier'], match['match_coverage'])
        for match in self.matches
    )

    # Do not use the builtin hash() here: str hashes are salted per process
    # (see PYTHONHASHSEED), so the identifier would change from one run to
    # the next. A digest of the repr is stable across runs.
    digest = hashlib.sha256(repr(data).encode('utf-8')).digest()
    return int.from_bytes(digest[:8], byteorder='big')
196247

197248
@property
def identifier_with_text(self):
    """
    Return an integer identifier for this license detection, based on its
    underlying license matches with the tokenized matched_text.

    The identifier is computed from the `licensedb_identifier`, the
    `match_coverage` and the tokens of the `matched_text` of every match, in
    order. It is a non-negative integer smaller than 2**64.

    Because the matched text is included, two detections only share this
    identifier when their matches also agree on the tokenized text (barring
    a digest collision).
    """
    # Local import: only this identifier computation needs hashlib.
    import hashlib

    data = tuple(
        (
            match['licensedb_identifier'],
            match['match_coverage'],
            tuple(query_tokenizer(match['matched_text'])),
        )
        for match in self.matches
    )

    # Do not use the builtin hash() here: str hashes are salted per process
    # (see PYTHONHASHSEED), so the identifier would change from one run to
    # the next. A digest of the repr is stable across runs.
    digest = hashlib.sha256(repr(data).encode('utf-8')).digest()
    return int.from_bytes(digest[:8], byteorder='big')
268+
269+
def get_start_end_line(self):
    """
    Return a (start_line, end_line) tuple for this license detection.

    The start line is the smallest `start_line` and the end line is the
    largest `end_line` found across the license matches of the detection.
    """
    first_line = min(match['start_line'] for match in self.matches)
    last_line = max(match['end_line'] for match in self.matches)
    return first_line, last_line
214277

215278
def rules_length(self):
216279
"""
@@ -373,6 +436,7 @@ def to_dict(
373436

374437
detection = attr.asdict(self)
375438
detection["matches"] = data_matches
439+
_file_region = detection.pop('file_region')
376440

377441
return detection
378442

src/licensedcode/plugin_licenses_reference.py

Lines changed: 190 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,16 @@
88
#
99

1010
import attr
11+
from collections import Counter
12+
1113
from commoncode.cliutils import PluggableCommandLineOption
1214
from commoncode.cliutils import POST_SCAN_GROUP
1315
from license_expression import Licensing
1416
from plugincode.post_scan import PostScanPlugin
1517
from plugincode.post_scan import post_scan_impl
1618

19+
from licensedcode.detection import LicenseDetection
20+
1721
# Set to True to enable debug tracing
1822
TRACE = False
1923

@@ -39,7 +43,11 @@ class LicensesReference(PostScanPlugin):
3943
"""
4044
Add a reference list of all licenses data and text.
4145
"""
42-
codebase_attributes = dict(licenses_reference=attr.ib(default=attr.Factory(list)))
46+
codebase_attributes = dict(
47+
license_references=attr.ib(default=attr.Factory(list)),
48+
licensedb_references=attr.ib(default=attr.Factory(list)),
49+
license_detection_references=attr.ib(default=attr.Factory(list))
50+
)
4351

4452
sort_order = 500
4553

@@ -55,30 +63,187 @@ def is_enabled(self, licenses_reference, **kwargs):
5563
return licenses_reference
5664

5765
def process_codebase(self, codebase, licenses_reference, **kwargs):
    """
    Collect license, LicenseDB and license detection data from every
    resource of the codebase into codebase-level reference lists, so that
    resource-level detections only need to refer to them.

    The results are appended to the `license_references`,
    `licensedb_references` and `license_detection_references` codebase
    attributes.
    """
    all_expressions = []
    all_licensedb_data = []
    detections_by_path = {}

    for resource in codebase.walk():
        # License expressions come from both license and package detections.
        detected_expressions = getattr(resource, 'license_expressions', []) or []
        packages = getattr(resource, 'package_data', []) or []
        # TODO: license_expression attribute name is changing soon
        package_expressions = [package['license_expression'] for package in packages]
        all_expressions.extend(detected_expressions + package_expressions)

        # TODO: report license detections (with license matches) for packages
        resource_detections = getattr(resource, 'licenses', []) or []
        all_licensedb_data.extend(
            get_license_db_reference_data(licence_detections=resource_detections)
        )

        detections_by_path[resource.path] = resource_detections
        codebase.save_resource(resource)

    license_refs = get_license_references(license_expressions=all_expressions)
    codebase.attributes.license_references.extend(license_refs)

    licensedb_refs = get_licensedb_references(license_db_data=all_licensedb_data)
    codebase.attributes.licensedb_references.extend(licensedb_refs)

    detection_refs = get_license_detection_references(detections_by_path)
    codebase.attributes.license_detection_references.extend(detection_refs)
101+
102+
103+
def get_license_references(license_expressions, licensing=None):
    """
    Return a list of License data mappings for all the license keys found in
    the `license_expressions` list of license expression strings.

    Empty expressions are skipped. Each license key is reported once and the
    returned list is sorted by license key. `licensing` is the Licensing
    object used to extract the keys; a new one is created when it is not
    provided.
    """
    from licensedcode.cache import get_licenses_db

    # Create the default at call time rather than in the signature, where it
    # would be instantiated once at import time and shared by all calls.
    if licensing is None:
        licensing = Licensing()

    license_keys = set()
    for expression in license_expressions:
        if expression:
            license_keys.update(licensing.license_keys(expression))

    db = get_licenses_db()
    return [
        db[key].to_dict(include_ignorables=False, include_text=True)
        for key in sorted(license_keys)
    ]
123+
124+
125+
def get_licensedb_references(license_db_data):
    """
    Return a list of the `license_db_data` mappings without duplicates.

    Only the first mapping seen for each `licensedb_identifier` value is
    kept, and the input order is preserved.
    """
    seen_ids = set()
    licensedb_references = []

    for licdb_ref in license_db_data:
        licdb_id = licdb_ref['licensedb_identifier']
        if licdb_id in seen_ids:
            continue
        # Use add() and not update(): update() would add each character of
        # the identifier string and the membership test above would never
        # match the whole identifier.
        seen_ids.add(licdb_id)
        licensedb_references.append(licdb_ref)

    return licensedb_references
139+
140+
141+
def get_license_db_reference_data(licence_detections):
    """
    Return a list of LicenseDB reference data mappings, one for each unique
    `licensedb_identifier` found in the matches of the `licence_detections`
    list of detection mappings, in order of first appearance.

    As a side effect, the rule-level keys copied to the reference data (and
    the `matched_text` key when present) are removed from every match
    mapping, together with the `licenses` key. The `license_expression` and
    `licensedb_identifier` keys are kept in the match.
    """
    # Keys moved out of each match and into its reference data.
    moved_keys = (
        'referenced_filenames',
        'is_license_text',
        'is_license_notice',
        'is_license_reference',
        'is_license_tag',
        'is_license_intro',
        'rule_length',
        'rule_relevance',
    )

    seen_ids = set()
    license_db_reference_data = []

    for detection in licence_detections:
        for match in detection['matches']:
            licdb_id = match['licensedb_identifier']

            ref_data = {
                'license_expression': match['license_expression'],
                'licensedb_identifier': licdb_id,
            }
            for key in moved_keys:
                ref_data[key] = match.pop(key)

            if 'matched_text' in match:
                ref_data['matched_text'] = match.pop('matched_text')

            match.pop('licenses')

            # The keys above are stripped from every match, but the reference
            # data is only reported once per identifier. Use add() and not
            # update(): update() would add each character of the identifier.
            if licdb_id not in seen_ids:
                seen_ids.add(licdb_id)
                license_db_reference_data.append(ref_data)

    return license_db_reference_data
176+
177+
178+
def get_license_detection_references(license_detections_by_path):
    """
    Return a list of UniqueDetection built from the `license_detections_by_path`
    mapping of {resource path: list of detection mappings}.

    As a side effect, each detection mapping loses its `matches` and
    `combination_reasons` keys and gains an `id` key holding the identifier
    of the detection.
    """
    detection_objects = []

    for path, detections in license_detections_by_path.items():
        for detection in detections:
            # Build the object first: from_mapping reads the keys removed
            # from the mapping just below.
            detection_obj = LicenseDetection.from_mapping(detection=detection)
            del detection['matches']
            del detection['combination_reasons']

            detection_obj.file_region = detection_obj.get_file_region(path=path)
            detection["id"] = detection_obj.identifier
            detection_objects.append(detection_obj)

    return UniqueDetection.get_unique_detections(detection_objects)
196+
197+
198+
@attr.s
class UniqueDetection:
    """
    A unique license detection, with the file regions where it was found.
    """
    # Identifier shared by all the detections grouped in this object.
    unique_identifier = attr.ib(type=int)
    # Mapping form (attr.asdict) of the first detection with this identifier.
    license_detection = attr.ib()
    # File regions of all the detections with this identifier.
    files = attr.ib(factory=list)

    @classmethod
    def get_unique_detections(cls, license_detections):
        """
        Return a list of UniqueDetection, one for each distinct identifier
        found in the `license_detections` iterable of LicenseDetection, in
        order of first appearance.

        Each UniqueDetection carries the first detection seen with that
        identifier and the file regions of all detections sharing it.
        """
        # Group in a single pass: the identifier of each detection is
        # computed once instead of once per unique identifier.
        first_detection_by_id = {}
        file_regions_by_id = {}

        for detection in license_detections:
            identifier = detection.identifier
            if identifier not in first_detection_by_id:
                first_detection_by_id[identifier] = detection
                file_regions_by_id[identifier] = []
            file_regions_by_id[identifier].append(detection.file_region)

        return [
            cls(
                files=file_regions_by_id[identifier],
                license_detection=attr.asdict(detection),
                unique_identifier=identifier,
            )
            for identifier, detection in first_detection_by_id.items()
        ]
239+
240+
241+
def get_identifiers(license_detections):
    """
    Return a generator of the `identifier` of each license detection in
    `license_detections`, in the same order.
    """
    return (license_detection.identifier for license_detection in license_detections)

0 commit comments

Comments
 (0)