Skip to content

Commit 400a53d

Browse files
committed
[colic] Add category reference implementation and corresponding tests
Signed-off-by: inishchith <[email protected]>
1 parent e0430d1 commit 400a53d

File tree

7 files changed

+326
-25
lines changed

7 files changed

+326
-25
lines changed

grimoire_elk/enriched/cocom.py

Lines changed: 68 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
# GNU General Public License for more details.
1414
#
1515
# You should have received a copy of the GNU General Public License
16-
# along with this program; if not, write to the Free Software
17-
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
1817
#
1918
# Authors:
2019
# Valerio Cosentino <[email protected]>
@@ -32,6 +31,7 @@
3231
get_unique_repository,
3332
get_files_at_time)
3433
from .utils import fix_field_date
34+
from ..elastic_mapping import Mapping as BaseMapping
3535

3636
from grimoirelab_toolkit.datetime import datetime_utcnow
3737
from grimoire_elk.elastic import ElasticSearch
@@ -41,6 +41,71 @@
4141
logger = logging.getLogger(__name__)
4242

4343

44+
class Mapping(BaseMapping):
    """Elasticsearch mapping handed to `ElasticSearch(..., mappings=Mapping)`."""

    @staticmethod
    def get_elastic_mappings(es_major):
        """Get Elasticsearch mapping.

        Declare explicit types for the identifier, origin, interval and
        creation-date fields and for the aggregated `total_*` code metrics
        (counters as `long`, per-loc/per-function ratios as `float`).
        The mapping keeps `"dynamic": true`, so fields not listed here are
        not rejected.

        :param es_major: major version of Elasticsearch, as string
            (currently unused: the same mapping is returned for any version)
        :returns: dictionary with a key, 'items', with the mapping
            (as a JSON string)
        """

        mapping = '''
         {
            "dynamic":true,
            "properties": {
                "id" : {
                    "type" : "keyword"
                },
                "interval_months" : {
                    "type" : "long"
                },
                "origin" : {
                    "type" : "keyword"
                },
                "study_creation_date" : {
                    "type" : "date"
                },
                "total_blanks" : {
                    "type" : "long"
                },
                "total_blanks_per_loc" : {
                    "type" : "float"
                },
                "total_ccn" : {
                    "type" : "long"
                },
                "total_comments" : {
                    "type" : "long"
                },
                "total_comments_per_loc" : {
                    "type" : "float"
                },
                "total_files" : {
                    "type" : "long"
                },
                "total_loc" : {
                    "type" : "long"
                },
                "total_loc_per_function" : {
                    "type" : "float"
                },
                "total_num_funs" : {
                    "type" : "long"
                },
                "total_tokens" : {
                    "type" : "long"
                }
            }
        }
        '''

        return {"items": mapping}
107+
108+
44109
class CocomEnrich(Enrich):
45110
metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"]
46111

@@ -198,7 +263,7 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal
198263
ins_items = 0
199264

200265
for repository_url in repositories:
201-
es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
266+
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
202267
evolution_items = []
203268

204269
for interval in interval_months:

grimoire_elk/enriched/colic.py

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
# GNU General Public License for more details.
1414
#
1515
# You should have received a copy of the GNU General Public License
16-
# along with this program; if not, write to the Free Software
17-
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
1817
#
1918
# Authors:
2019
# Nishchith Shetty <[email protected]>
@@ -30,6 +29,7 @@
3029
get_unique_repository,
3130
get_files_at_time)
3231
from .utils import fix_field_date
32+
from ..elastic_mapping import Mapping as BaseMapping
3333

3434
from grimoirelab_toolkit.datetime import datetime_utcnow
3535
from grimoire_elk.elastic import ElasticSearch
@@ -39,6 +39,50 @@
3939
logger = logging.getLogger(__name__)
4040

4141

42+
class Mapping(BaseMapping):
    """Elasticsearch mapping handed to `ElasticSearch(..., mappings=Mapping)`."""

    @staticmethod
    def get_elastic_mappings(es_major):
        """Get Elasticsearch mapping.

        Declare explicit types for the identifier, origin, interval and
        creation-date fields and for the aggregated file counters
        (`total_files`, `licensed_files`, `copyrighted_files`).
        The mapping keeps `"dynamic": true`, so fields not listed here are
        not rejected.

        :param es_major: major version of Elasticsearch, as string
            (currently unused: the same mapping is returned for any version)
        :returns: dictionary with a key, 'items', with the mapping
            (as a JSON string)
        """

        mapping = '''
         {
            "dynamic":true,
            "properties": {
                "id" : {
                    "type" : "keyword"
                },
                "interval_months" : {
                    "type" : "long"
                },
                "origin" : {
                    "type" : "keyword"
                },
                "study_creation_date" : {
                    "type" : "date"
                },
                "total_files": {
                    "type": "long"
                },
                "licensed_files": {
                    "type": "long"
                },
                "copyrighted_files": {
                    "type": "long"
                }
            }
        }
        '''

        return {"items": mapping}
84+
85+
4286
class ColicEnrich(Enrich):
4387

4488
def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
@@ -167,8 +211,8 @@ def extract_modules(self, file_path):
167211
return modules
168212

169213
@metadata
170-
def get_rich_item(self, file_analysis):
171-
# TODO: requires adjustments regarding category of backend used
214+
def __get_rich_scancode(self, file_analysis):
215+
# Scancode and Scancode-CLI Implementation
172216

173217
eitem = {}
174218
eitem["file_path"] = file_analysis["file_path"]
@@ -192,14 +236,44 @@ def get_rich_item(self, file_analysis):
192236

193237
return eitem
194238

239+
@metadata
def __get_rich_nomossa(self, file_analysis):
    """Build the enriched item for a file analysed with NOMOS.

    :param file_analysis: dict with the keys 'file_path' and 'licenses'
    :returns: dict with the file path, its modules, the licenses found
        (under both 'licenses' and 'license_name'), the 'has_license'
        flag and empty copyright information
    """
    # Marker used in the analysis when no license was detected.
    no_license = "No_license_found"

    # NOTE(review): the original code compared 'licenses' against the bare
    # marker string but then iterated over it as a collection. Presumably
    # the analyzer emits a list of names (confirm against the raw items);
    # both shapes are accepted here so the marker is never reported as a
    # license and a lone string is not split into characters.
    found = file_analysis["licenses"] or []
    if isinstance(found, str):
        found = [found]
    found = [_license for _license in found if _license != no_license]

    eitem = {}
    eitem["file_path"] = file_analysis["file_path"]
    eitem["modules"] = self.extract_modules(eitem["file_path"])
    # Two independent lists, as before, so callers may mutate one safely.
    eitem["licenses"] = list(found)
    eitem["license_name"] = list(found)
    eitem["has_license"] = 1 if found else 0

    # NOMOS doesn't provide copyright information.
    eitem["copyrights"] = []
    eitem["has_copyright"] = 0

    return eitem
261+
195262
def get_rich_items(self, item):
196-
# The real data
197-
entry = item['data']
263+
"""
264+
:category: code_license_scancode_cli(default)
265+
"""
198266

267+
if item["category"] == "code_license_nomos":
268+
get_rich_item = self.__get_rich_nomossa
269+
else:
270+
get_rich_item = self.__get_rich_scancode
271+
272+
entry = item['data']
199273
enriched_items = []
200274

201275
for file_analysis in entry["analysis"]:
202-
eitem = self.get_rich_item(file_analysis)
276+
eitem = get_rich_item(file_analysis)
203277

204278
for f in self.RAW_FIELDS_COPY:
205279
if f in item:
@@ -208,13 +282,14 @@ def get_rich_items(self, item):
208282
eitem[f] = None
209283

210284
# common attributes
211-
eitem['commit_sha'] = entry['commit']
212285
eitem['author'] = entry['Author']
213-
eitem['committer'] = entry['Commit']
214-
eitem['commit'] = entry['commit']
215-
eitem['message'] = entry['message']
216286
eitem['author_date'] = fix_field_date(entry['AuthorDate'])
287+
eitem["category"] = item["category"]
288+
eitem['commit'] = entry['commit']
289+
eitem['committer'] = entry['Commit']
217290
eitem['commit_date'] = fix_field_date(entry['CommitDate'])
291+
eitem['commit_sha'] = entry['commit']
292+
eitem['message'] = entry['message']
218293

219294
if self.prjs_map:
220295
eitem.update(self.get_item_project(eitem))
@@ -280,7 +355,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
280355
ins_items = 0
281356

282357
for repository_url in repositories:
283-
es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
358+
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
284359
evolution_items = []
285360

286361
for interval in interval_months:

grimoire_elk/enriched/graal_study_evolution.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
# GNU General Public License for more details.
1414
#
1515
# You should have received a copy of the GNU General Public License
16-
# along with this program; if not, write to the Free Software
17-
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
1817
#
1918
# Authors:
2019
# Valerio Cosentino <[email protected]>
@@ -33,7 +32,8 @@ def get_unique_repository():
3332
"aggs": {
3433
"unique_repos": {
3534
"terms": {
36-
"field": "origin"
35+
"field": "origin",
36+
"size": 5000
3737
}
3838
}
3939
}

grimoire_elk/raw/graal.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
# GNU General Public License for more details.
1414
#
1515
# You should have received a copy of the GNU General Public License
16-
# along with this program; if not, write to the Free Software
17-
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
1817
#
1918
# Authors:
2019
# Nishchith Shetty <[email protected]>
@@ -37,14 +36,18 @@ def get_elastic_mappings(es_major):
3736
"""
3837

3938
mapping = '''
40-
{
39+
{
4140
"dynamic":true,
4241
"properties": {
4342
"data": {
4443
"properties": {
4544
"message": {
4645
"type": "text",
4746
"index": true
47+
},
48+
"analysis": {
49+
"dynamic":false,
50+
"properties": {}
4851
}
4952
}
5053
}

0 commit comments

Comments
 (0)