diff --git a/grimoire_elk/enriched/cocom.py b/grimoire_elk/enriched/cocom.py new file mode 100644 index 000000000..79a7883a6 --- /dev/null +++ b/grimoire_elk/enriched/cocom.py @@ -0,0 +1,338 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Authors: +# Valerio Cosentino +# Nishchith Shetty +# + +import logging +from dateutil.relativedelta import relativedelta + +from elasticsearch import Elasticsearch as ES, RequestsHttpConnection + +from .enrich import (Enrich, + metadata) +from .graal_study_evolution import (get_to_date, + get_unique_repository, + get_files_at_time) +from .utils import fix_field_date +from ..elastic_mapping import Mapping as BaseMapping + +from grimoirelab_toolkit.datetime import datetime_utcnow +from grimoire_elk.elastic import ElasticSearch + +MAX_SIZE_BULK_ENRICHED_ITEMS = 200 + +logger = logging.getLogger(__name__) + + +class Mapping(BaseMapping): + + @staticmethod + def get_elastic_mappings(es_major): + """Get Elasticsearch mapping. + + Ensure data.message is string, since it can be very large + + :param es_major: major version of Elasticsearch, as string + :returns: dictionary with a key, 'items', with the mapping + """ + + mapping = ''' + { + "dynamic":true, + "properties": { + "id" : { + "type" : "keyword" + }, + "interval_months" : { + "type" : "long" + }, + "origin" : { + "type" : "keyword" + }, + "study_creation_date" : { + "type" : "date" + }, + "total_blanks" : { + "type" : "long" + }, + "total_blanks_per_loc" : { + "type" : "float" + }, + "total_ccn" : { + "type" : "long" + }, + "total_comments" : { + "type" : "long" + }, + "total_comments_per_loc" : { + "type" : "float" + }, + "total_files" : { + "type" : "long" + }, + "total_loc" : { + "type" : "long" + }, + "total_loc_per_function" : { + "type" : "float" + }, + "total_num_funs" : { + "type" : "long" + }, + "total_tokens" : { + "type" : "long" + } + } + } + ''' + + return {"items": mapping} + + +class CocomEnrich(Enrich): + metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"] + + def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None, + db_user='', db_password='', db_host=''): + super().__init__(db_sortinghat, db_projects_map, json_projects_map, + db_user, db_password, db_host) + + self.studies = [] + self.studies.append(self.enrich_cocom_analysis) + + def get_identities(self, item): + """ Return the identities from an item """ + identities = [] + + return identities + + def has_identities(self): + """ Return whether the enriched items contains identities """ + + return False + + def get_field_unique_id(self): + return "id" + + def extract_modules(self, file_path): + """ Extracts module path from the given file path """ + path_chunks = file_path.split('/') + + modules = [] + for idx in range(len(path_chunks)): + sub_path = '/'.join(path_chunks[:idx]) + + if sub_path: + modules.append(sub_path) + + return modules + + @metadata + def 
get_rich_item(self, file_analysis): + + eitem = {} + for metric in self.metrics: + if file_analysis.get(metric, None) is not None: + eitem[metric] = file_analysis[metric] + else: + eitem[metric] = None + + eitem["file_path"] = file_analysis.get("file_path", None) + eitem["ext"] = file_analysis.get("ext", None) + eitem['modules'] = self.extract_modules(eitem['file_path']) + eitem = self.__add_derived_metrics(file_analysis, eitem) + + return eitem + + def get_rich_items(self, item): + # The real data + entry = item['data'] + + enriched_items = [] + + for file_analysis in entry["analysis"]: + eitem = self.get_rich_item(file_analysis) + + for f in self.RAW_FIELDS_COPY: + if f in item: + eitem[f] = item[f] + else: + eitem[f] = None + + # common attributes + eitem['commit_sha'] = entry['commit'] + eitem['author'] = entry['Author'] + eitem['committer'] = entry['Commit'] + eitem['message'] = entry['message'] + eitem['author_date'] = fix_field_date(entry['AuthorDate']) + eitem['commit_date'] = fix_field_date(entry['CommitDate']) + + if self.prjs_map: + eitem.update(self.get_item_project(eitem)) + + # uuid + eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path']) + + eitem.update(self.get_grimoire_fields(entry["AuthorDate"], "file")) + + self.add_repository_labels(eitem) + self.add_metadata_filter_raw(eitem) + + enriched_items.append(eitem) + + return enriched_items + + def __add_derived_metrics(self, file_analysis, eitem): + """ Add derived metrics fields """ + + # TODO: Fix Logic: None rather than 1 + if eitem["loc"] is not None and eitem["comments"] is not None and eitem["num_funs"] is not None: + eitem["comments_per_loc"] = round(eitem["comments"] / max(eitem["loc"], 1), 2) + eitem["blanks_per_loc"] = round(eitem["blanks"] / max(eitem["loc"], 1), 2) + eitem["loc_per_function"] = round(eitem["loc"] / max(eitem["num_funs"], 1), 2) + else: + eitem["comments_per_loc"] = None + eitem["blanks_per_loc"] = None + eitem["loc_per_function"] = None + + return eitem + + def enrich_items(self, ocean_backend, events=False): + items_to_enrich = [] + num_items = 0 + ins_items = 0 + + for item in ocean_backend.fetch(): + rich_items = self.get_rich_items(item) + + items_to_enrich.extend(rich_items) + if len(items_to_enrich) < MAX_SIZE_BULK_ENRICHED_ITEMS: + continue + + num_items += len(items_to_enrich) + ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id()) + items_to_enrich = [] + + if len(items_to_enrich) > 0: + num_items += len(items_to_enrich) + ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id()) + + if num_items != ins_items: + missing = num_items - ins_items + logger.error("%s/%s missing items for Cocom", str(missing), str(num_items)) + else: + logger.info("%s items inserted for Cocom", str(num_items)) + + return num_items + + def enrich_cocom_analysis(self, ocean_backend, enrich_backend, no_incremental=False, + out_index="cocom_enrich_graal_repo", interval_months=[3], + date_field="grimoire_creation_date"): + + logger.info("[enrich-cocom-analysis] Start enrich_cocom_analysis study") + + es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, + verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) + in_index = enrich_backend.elastic.index + interval_months = list(map(int, interval_months)) + + unique_repos = es_in.search( + index=in_index, + body=get_unique_repository()) + + repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])] + 
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) + + logger.info("[enrich-cocom-analysis] {} repositories to process".format(len(repositories))) + es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping) + es_out.add_alias("cocom_study") + + num_items = 0 + ins_items = 0 + + for repository_url in repositories: + logger.info("[enrich-cocom-analysis] Start analysis for {}".format(repository_url)) + evolution_items = [] + + for interval in interval_months: + + to_month = get_to_date(es_in, in_index, out_index, repository_url, interval) + to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0) + + while to_month < current_month: + files_at_time = es_in.search( + index=in_index, + body=get_files_at_time(repository_url, to_month.isoformat()) + )['aggregations']['file_stats'].get("buckets", []) + + if not len(files_at_time): + to_month = to_month + relativedelta(months=+interval) + continue + + repository_name = repository_url.split("/")[-1] + evolution_item = { + "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval), + "origin": repository_url, + "interval_months": interval, + "study_creation_date": to_month.isoformat(), + "total_files": len(files_at_time) + } + + for file_ in files_at_time: + file_details = file_["1"]["hits"]["hits"][0]["_source"] + + for metric in self.metrics: + total_metric = "total_" + metric + evolution_item[total_metric] = evolution_item.get(total_metric, 0) + evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0 + + # TODO: Fix Logic: None rather than 1 + evolution_item["total_comments_per_loc"] = round( + evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2) + evolution_item["total_blanks_per_loc"] = round( + evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2) + evolution_item["total_loc_per_function"] = round( + evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2) + + evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats")) + evolution_items.append(evolution_item) + + if len(evolution_items) >= self.elastic.max_items_bulk: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + evolution_items = [] + + to_month = to_month + relativedelta(months=+interval) + + if len(evolution_items) > 0: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + + if num_items != ins_items: + missing = num_items - ins_items + logger.error( + "[enrich-cocom-analysis] %s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items) + ) + else: + logger.info("[enrich-cocom-analysis] %s items inserted for Graal CoCom Analysis Study", str(num_items)) + + logger.info("[enrich-cocom-analysis] End analysis for {} with month interval".format(repository_url, interval)) + + logger.info("[enrich-cocom-analysis] End enrich_cocom_analysis study") diff --git a/grimoire_elk/enriched/colic.py b/grimoire_elk/enriched/colic.py new file mode 100644 index 000000000..68234e575 --- /dev/null +++ b/grimoire_elk/enriched/colic.py @@ -0,0 +1,454 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your 
option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Authors: +# Nishchith Shetty +# + +import logging +from dateutil.relativedelta import relativedelta + +from elasticsearch import Elasticsearch as ES, RequestsHttpConnection +from .enrich import (Enrich, + metadata) +from .graal_study_evolution import (get_to_date, + get_unique_repository) +from .utils import fix_field_date +from ..elastic_mapping import Mapping as BaseMapping + +from grimoirelab_toolkit.datetime import datetime_utcnow +from grimoire_elk.elastic import ElasticSearch + +MAX_SIZE_BULK_ENRICHED_ITEMS = 200 + +logger = logging.getLogger(__name__) + + +class Mapping(BaseMapping): + + @staticmethod + def get_elastic_mappings(es_major): + """Get Elasticsearch mapping. + + Ensure data.message is string, since it can be very large + + :param es_major: major version of Elasticsearch, as string + :returns: dictionary with a key, 'items', with the mapping + """ + + mapping = ''' + { + "dynamic":true, + "properties": { + "id" : { + "type" : "keyword" + }, + "interval_months" : { + "type" : "long" + }, + "origin" : { + "type" : "keyword" + }, + "study_creation_date" : { + "type" : "date" + }, + "total_files": { + "type": "long" + }, + "licensed_files": { + "type": "long" + }, + "copyrighted_files": { + "type": "long" + } + } + } + ''' + + return {"items": mapping} + + +class ColicEnrich(Enrich): + + def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None, + db_user='', db_password='', db_host=''): + super().__init__(db_sortinghat, db_projects_map, json_projects_map, + db_user, db_password, db_host) + + self.studies = [] + self.studies.append(self.enrich_colic_analysis) + + def get_identities(self, item): + """ Return the identities from an item """ + identities = [] + + return identities + + def has_identities(self): + """ Return whether the enriched items contains identities """ + + return False + + def get_field_unique_id(self): + return "id" + + def __get_total_files(self, repository_url, to_date): + """ Retrieve total number for files until to_date, corresponding + to the given repository + """ + + query_total_files = """ + { + "size": 0, + "aggs": { + "1": { + "cardinality": { + "field": "file_path" + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "origin": "%s" + } + }, + { + "range": { + "metadata__updated_on": { + "lte": "%s" + } + } + }] + } + } + } + """ % (repository_url, to_date) + + return query_total_files + + def __get_licensed_files(self, repository_url, to_date): + """ Retrieve all the licensed files until the to_date, corresponding + to the given repository. + """ + + query_licensed_files = """ + { + "size": 0, + "aggs": { + "1": { + "cardinality": { + "field": "file_path" + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "has_license": 1 + } + }, + { + "term": { + "origin": "%s" + } + }, + { + "range": { + "metadata__updated_on": { + "lte": "%s" + } + } + }] + } + } + } + """ % (repository_url, to_date) + + return query_licensed_files + + def __get_copyrighted_files(self, repository_url, to_date): + """ Retrieve all the copyrighted files until the to_date, corresponding + to the given repository. 
+ """ + + query_copyrighted_files = """ + { + "size": 0, + "aggs": { + "1": { + "cardinality": { + "field": "file_path" + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "has_copyright": 1 + } + }, + { + "term": { + "origin": "%s" + } + }, + { + "range": { + "metadata__updated_on": { + "lte": "%s" + } + } + }] + } + } + } + """ % (repository_url, to_date) + + return query_copyrighted_files + + def extract_modules(self, file_path): + """ Extracts module path from the given file path """ + path_chunks = file_path.split('/') + + modules = [] + for idx in range(len(path_chunks)): + sub_path = '/'.join(path_chunks[:idx]) + + if sub_path: + modules.append(sub_path) + + return modules + + @metadata + def __get_rich_scancode(self, file_analysis): + # Scancode and Scancode-CLI Implementation + + eitem = {} + eitem["file_path"] = file_analysis["file_path"] + eitem["modules"] = self.extract_modules(eitem["file_path"]) + eitem["copyrights"] = [] + eitem["licenses"] = [] + eitem["license_name"] = [] + eitem["has_license"] = 0 + eitem["has_copyright"] = 0 + + if file_analysis.get("licenses", False): + eitem["has_license"] = 1 + for _license in file_analysis["licenses"]: + eitem["licenses"].extend(_license["matched_rule"]["licenses"]) + eitem["license_name"].append(_license["name"]) + + if file_analysis.get("copyrights", False): + eitem["has_copyright"] = 1 + for _copyright in file_analysis["copyrights"]: + eitem["copyrights"].append(_copyright["value"]) + + return eitem + + @metadata + def __get_rich_nomossa(self, file_analysis): + # NOMOS analyzer implementation + + eitem = {} + eitem["file_path"] = file_analysis["file_path"] + eitem["modules"] = self.extract_modules(eitem["file_path"]) + eitem["licenses"] = [] + eitem["license_name"] = [] + eitem["has_license"] = 0 + + if file_analysis["licenses"] != "No_license_found": + eitem["has_license"] = 1 + for _license in file_analysis["licenses"]: + eitem["licenses"].append(_license) + eitem["license_name"].append(_license) + + # NOMOS doesn't provide copyright information. 
+ eitem["copyrights"] = [] + eitem["has_copyright"] = 0 + + return eitem + + def get_rich_items(self, item): + """ + :category: code_license_scancode_cli(default) + """ + + if item["category"] == "code_license_nomos": + get_rich_item = self.__get_rich_nomossa + else: + get_rich_item = self.__get_rich_scancode + + entry = item['data'] + enriched_items = [] + + for file_analysis in entry["analysis"]: + eitem = get_rich_item(file_analysis) + + for f in self.RAW_FIELDS_COPY: + if f in item: + eitem[f] = item[f] + else: + eitem[f] = None + + # common attributes + eitem['author'] = entry['Author'] + eitem['author_date'] = fix_field_date(entry['AuthorDate']) + eitem["category"] = item["category"] + eitem['commit'] = entry['commit'] + eitem['committer'] = entry['Commit'] + eitem['commit_date'] = fix_field_date(entry['CommitDate']) + eitem['commit_sha'] = entry['commit'] + eitem['message'] = entry['message'] + + if self.prjs_map: + eitem.update(self.get_item_project(eitem)) + + # uuid + eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path']) + + eitem.update(self.get_grimoire_fields(entry["AuthorDate"], "file")) + + self.add_repository_labels(eitem) + self.add_metadata_filter_raw(eitem) + + enriched_items.append(eitem) + + return enriched_items + + def enrich_items(self, ocean_backend, events=False): + items_to_enrich = [] + num_items = 0 + ins_items = 0 + + for item in ocean_backend.fetch(): + rich_items = self.get_rich_items(item) + + items_to_enrich.extend(rich_items) + if len(items_to_enrich) < MAX_SIZE_BULK_ENRICHED_ITEMS: + continue + + num_items += len(items_to_enrich) + ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id()) + items_to_enrich = [] + + if len(items_to_enrich) > 0: + num_items += len(items_to_enrich) + ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id()) + + if num_items != ins_items: + missing = num_items - ins_items + logger.error("%s/%s missing items for CoLic", str(missing), str(num_items)) + else: + logger.info("%s items inserted for CoLic", str(num_items)) + + return num_items + + def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=False, + out_index="colic_enrich_graal_repo", interval_months=[3], + date_field="grimoire_creation_date"): + + logger.info("[enrich-colic-analysis] Start enrich_colic_analysis study") + + es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100, + verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection) + in_index = enrich_backend.elastic.index + interval_months = list(map(int, interval_months)) + + unique_repos = es_in.search( + index=in_index, + body=get_unique_repository()) + + repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])] + + logger.info("[enrich-colic-analysis] {} repositories to process".format(len(repositories))) + es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping) + es_out.add_alias("colic_study") + + current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0) + num_items = 0 + ins_items = 0 + + for repository_url in repositories: + logger.info("[enrich-colic-analysis] Start analysis for {}".format(repository_url)) + evolution_items = [] + + for interval in interval_months: + + to_month = get_to_date(es_in, in_index, out_index, repository_url, interval) + to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0) + + while to_month < current_month: + 
copyrighted_files_at_time = es_in.search( + index=in_index, + body=self.__get_copyrighted_files(repository_url, to_month.isoformat())) + + licensed_files_at_time = es_in.search( + index=in_index, + body=self.__get_licensed_files(repository_url, to_month.isoformat())) + + files_at_time = es_in.search( + index=in_index, + body=self.__get_total_files(repository_url, to_month.isoformat())) + + licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"]) + copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"]) + total_files = int(files_at_time["aggregations"]["1"]["value"]) + + if not total_files: + to_month = to_month + relativedelta(months=+interval) + continue + + repository_name = repository_url.split("/")[-1] + evolution_item = { + "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval), + "origin": repository_url, + "interval_months": interval, + "study_creation_date": to_month.isoformat(), + "licensed_files": licensed_files, + "copyrighted_files": copyrighted_files, + "total_files": total_files + } + + evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats")) + evolution_items.append(evolution_item) + + if len(evolution_items) >= self.elastic.max_items_bulk: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + evolution_items = [] + + to_month = to_month + relativedelta(months=+interval) + + if len(evolution_items) > 0: + num_items += len(evolution_items) + ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id()) + + if num_items != ins_items: + missing = num_items - ins_items + logger.error( + "[enrich-colic-analysis] %s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items) + ) + else: + logger.info("[enrich-colic-analysis] %s items inserted for Graal CoLic Analysis Study", str(num_items)) + + logger.info("[enrich-colic-analysis] End analysis for {} with month interval {}".format(repository_url, interval)) + + logger.info("[enrich-colic-analysis] End enrich_colic_analysis study") diff --git a/grimoire_elk/enriched/graal_study_evolution.py b/grimoire_elk/enriched/graal_study_evolution.py new file mode 100644 index 000000000..c840a15f5 --- /dev/null +++ b/grimoire_elk/enriched/graal_study_evolution.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Authors: +# Valerio Cosentino +# Nishchith Shetty +# + +from grimoirelab_toolkit.datetime import str_to_datetime + + +def get_unique_repository(): + """ Retrieve all the repository names from the index.
""" + + query_unique_repository = """ + { + "size": 0, + "aggs": { + "unique_repos": { + "terms": { + "field": "origin", + "size": 5000 + } + } + } + } + """ + + return query_unique_repository + + +def get_last_study_date(repository_url, interval): + """ Retrieve the last study_creation_date of the item corresponding + to given repository from the study index. + """ + + query_last_study_date = """ + { + "size": 0, + "aggs": { + "1": { + "max": { + "field": "study_creation_date" + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "origin.keyword": "%s" + } + },{ + "term":{ + "interval_months": "%s" + } + }] + } + } + } + """ % (repository_url, interval) + + return query_last_study_date + + +def get_first_enriched_date(repository_url): + """ Retrieve the first/oldest metadata__updated_on of the item + corresponding to given repository. + """ + + query_first_enriched_date = """ + { + "size": 0, + "aggs": { + "1": { + "top_hits": { + "docvalue_fields": [ + "metadata__updated_on" + ], + "_source": "metadata__updated_on", + "size": 1, + "sort": [{ + "commit_date": { + "order": "asc" + } + }] + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "origin": "%s" + } + }] + } + } + } + """ % (repository_url) + + return query_first_enriched_date + + +def get_files_at_time(repository_url, to_date): + """ Retrieve all the latest changes wrt files until the to_date, + corresponding to the given repository. + """ + + query_files_at_time = """ + { + "size": 0, + "aggs": { + "file_stats": { + "terms": { + "field": "file_path", + "size": 2147483647, + "order": { + "_key": "desc" + } + }, + "aggs": { + "1": { + "top_hits": { + "size": 1, + "sort": [{ + "metadata__updated_on": { + "order": "desc" + } + }] + } + } + } + } + }, + "query": { + "bool": { + "filter": [{ + "term": { + "origin": "%s" + } + }, + { + "range": { + "metadata__updated_on": { + "lte": "%s" + } + } + }] + } + } + } + """ % (repository_url, to_date) + + return query_files_at_time + + +def get_to_date(es_in, in_index, out_index, repository_url, interval): + """ Get the appropriate to_date value for incremental insertion. 
""" + study_data_available = False + + if es_in.indices.exists(index=out_index): + last_study_date = es_in.search( + index=out_index, + body=get_last_study_date(repository_url, interval))["aggregations"]["1"] + + if last_study_date["value"] is not None: + study_data_available = True + to_date = str_to_datetime(last_study_date["value_as_string"]) + + if not study_data_available: + first_item_date = es_in.search( + index=in_index, + body=get_first_enriched_date(repository_url))["aggregations"]["1"]["hits"]["hits"][0]["_source"] + + to_date = str_to_datetime(first_item_date["metadata__updated_on"]) + + return to_date diff --git a/grimoire_elk/enriched/utils.py b/grimoire_elk/enriched/utils.py index 7b9f5cf78..8de4a8ef0 100755 --- a/grimoire_elk/enriched/utils.py +++ b/grimoire_elk/enriched/utils.py @@ -29,7 +29,8 @@ import urllib3 -from grimoirelab_toolkit.datetime import datetime_utcnow +from grimoirelab_toolkit.datetime import (datetime_utcnow, + str_to_datetime) BACKOFF_FACTOR = 0.2 @@ -212,3 +213,16 @@ def get_diff_current_date(days=0, hours=0, minutes=0): before_date = datetime_utcnow() - datetime.timedelta(days=days, hours=hours, minutes=minutes) return before_date + + +def fix_field_date(date_value): + """Fix possible errors in the field date""" + + field_date = str_to_datetime(date_value) + + try: + _ = int(field_date.strftime("%z")[0:3]) + except ValueError: + field_date = field_date.replace(tzinfo=None) + + return field_date.isoformat() diff --git a/grimoire_elk/raw/graal.py b/grimoire_elk/raw/graal.py new file mode 100644 index 000000000..df998de38 --- /dev/null +++ b/grimoire_elk/raw/graal.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Authors: +# Nishchith Shetty +# + +from .elastic import ElasticOcean +from ..elastic_mapping import Mapping as BaseMapping + + +class Mapping(BaseMapping): + + @staticmethod + def get_elastic_mappings(es_major): + """Get Elasticsearch mapping. 
+ + Ensure data.message is string, since it can be very large + + :param es_major: major version of Elasticsearch, as string + :returns: dictionary with a key, 'items', with the mapping + """ + + mapping = ''' + { + "dynamic":true, + "properties": { + "data": { + "properties": { + "message": { + "type": "text", + "index": true + }, + "analysis": { + "dynamic":false, + "properties": {} + } + } + } + } + } + ''' + + return {"items": mapping} + + +class GraalOcean(ElasticOcean): + """Graal Ocean feeder""" + + mapping = Mapping + + @classmethod + def get_perceval_params_from_url(cls, url): + params = [] + tokens = url.split(' ', 1) # Just split the URL not the filter + url = tokens[0] + params.append(url) + + return params diff --git a/grimoire_elk/utils.py b/grimoire_elk/utils.py index 8aba1ab02..72b487082 100755 --- a/grimoire_elk/utils.py +++ b/grimoire_elk/utils.py @@ -28,6 +28,9 @@ from grimoire_elk.elastic import ElasticConnectException from grimoire_elk.elastic import ElasticSearch +# Connectors for Graal +from graal.backends.core.cocom import CoCom, CoComCommand +from graal.backends.core.colic import CoLic, CoLicCommand # Connectors for Perceval from grimoire_elk.raw.hyperkitty import HyperKittyOcean from perceval.backends.core.askbot import Askbot, AskbotCommand @@ -70,6 +73,8 @@ from .enriched.askbot import AskbotEnrich from .enriched.bugzilla import BugzillaEnrich from .enriched.bugzillarest import BugzillaRESTEnrich +from .enriched.cocom import CocomEnrich +from .enriched.colic import ColicEnrich from .enriched.confluence import ConfluenceEnrich from .enriched.crates import CratesEnrich from .enriched.discourse import DiscourseEnrich @@ -119,6 +124,7 @@ from .raw.github import GitHubOcean from .raw.gitlab import GitLabOcean from .raw.google_hits import GoogleHitsOcean +from .raw.graal import GraalOcean from .raw.groupsio import GroupsioOcean from .raw.jenkins import JenkinsOcean from .raw.jira import JiraOcean @@ -199,6 +205,8 @@ def get_connectors(): return {"askbot": [Askbot, AskbotOcean, AskbotEnrich, AskbotCommand], "bugzilla": [Bugzilla, BugzillaOcean, BugzillaEnrich, BugzillaCommand], "bugzillarest": [BugzillaREST, BugzillaRESTOcean, BugzillaRESTEnrich, BugzillaRESTCommand], + "cocom": [CoCom, GraalOcean, CocomEnrich, CoComCommand], + "colic": [CoLic, GraalOcean, ColicEnrich, CoLicCommand], "confluence": [Confluence, ConfluenceOcean, ConfluenceEnrich, ConfluenceCommand], "crates": [Crates, CratesOcean, CratesEnrich, CratesCommand], "discourse": [Discourse, DiscourseOcean, DiscourseEnrich, DiscourseCommand], diff --git a/requirements.txt b/requirements.txt index 484e0a24d..5b3d433f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ pandas==0.18.1 -e git+https://github.com/chaoss/grimoirelab-toolkit/#egg=grimoirelab-toolkit -e git+https://github.com/chaoss/grimoirelab-cereslib/#egg=grimoirelab-cereslib -e git+https://github.com/chaoss/grimoirelab-kingarthur/#egg=grimoirelab-kingarthur +-e git+https://github.com/chaoss/grimoirelab-graal/#egg=grimoirelab-graal -e git+https://github.com/chaoss/grimoirelab-perceval/#egg=grimoirelab-perceval -e git+https://github.com/chaoss/grimoirelab-perceval-mozilla/#egg=grimoirelab-perceval-mozilla -e git+https://github.com/chaoss/grimoirelab-perceval-opnfv/#egg=grimoirelab-perceval-opnfv diff --git a/schema/graal_cocom.csv b/schema/graal_cocom.csv new file mode 100644 index 
000000000..b4e2acb9a --- /dev/null +++ b/schema/graal_cocom.csv @@ -0,0 +1,31 @@ +name,type,aggregatable,description +author,string,true,"Author name." +author_date,date,true,"Author date (when the original author made the commit)." +blanks,number,true,"Number of blank lines in a file." +blanks_per_loc,number,true,"Number of blank lines per line of code." +ccn,number,true,"Cyclomatic complexity (CCN) of a file." +comments,number,true,"Number of comments in a file." +comments_per_loc,number,true,"Number of comment lines per line of code." +commit_date,date,true,"Date when committer made this commit." +commit_sha,string,true,"Commit hash." +committer,string,true,"Committer name." +ext,string,true,"File extension" +file_path,string,true,"File Path" +grimoire_creation_date,date,true,"Commit date (when the original author made the commit)." +id,string,true,"Graal Item Id." +loc,number,true,"Lines of code in a file." +loc_per_function,number,true,"Number of lines of code per function definition." +message,string,true,"Commit message as a single String." +metadata__enriched_on,date,true,"Date when the item was enriched." +metadata__gelk_backend_name,keyword,true,"Name of the backend used to enrich information." +metadata__gelk_version,keyword,true,"Version of the backend used to enrich information." +metadata__timestamp,date,true,"Date when the item was stored in RAW index." +metadata__updated_on,date,true,"Date when the item was updated in its original data source." +modules,string,true,"Modules which the file is part of" +num_funs,number,true,"Number of function definitions in the file" +origin,keyword,true,"Original URL where the repository was retrieved from." +project_1,keyword,true,"Project (if more than one level is allowed in project hierarchy)" +project,keyword,true,"Project." +tag,keyword,true,"Graal tag." +tokens,number,true,"Number of tokens in a file" +uuid,keyword,true,"Graal UUID." diff --git a/schema/graal_colic.csv b/schema/graal_colic.csv new file mode 100644 index 000000000..676844644 --- /dev/null +++ b/schema/graal_colic.csv @@ -0,0 +1,26 @@ +name,type,aggregatable,description +author,string,true,"Author name." +author_date,date,true,"Author date (when the original author made the commit)." +commit_date,date,true,"Date when committer made this commit." +commit_sha,string,true,"Commit hash." +committer,string,true,"Committer name." +copyrights,string,true,"Copyright definitions found in file by analyzer." +file_path,string,true,"File Path" +grimoire_creation_date,date,true,"Commit date (when the original author made the commit)." +has_copyright,number,true,"1 if copyright definition exists else 0" +has_license,number,true,"1 if license definition exists else 0" +id,string,true,"Graal Item Id." +license_name,string,true,"License definitions found in file by analyzer" +licenses,string,true,"License keyword/tag." +message,string,true,"Commit message as a single String." +metadata__enriched_on,date,true,"Date when the item was enriched." +metadata__gelk_backend_name,keyword,true,"Name of the backend used to enrich information." +metadata__gelk_version,keyword,true,"Version of the backend used to enrich information." +metadata__timestamp,date,true,"Date when the item was stored in RAW index." +metadata__updated_on,date,true,"Date when the item was updated in its original data source." +modules,string,true,"Modules which the file is part of." +origin,keyword,true,"Original URL where the repository was retrieved from."
+project_1,keyword,true,"Project (if more than one level is allowed in project hierarchy)" +project,keyword,true,"Project." +tag,keyword,true,"Graal tag." +uuid,keyword,true,"Graal UUID." diff --git a/tests/data/cocom.json b/tests/data/cocom.json new file mode 100644 index 000000000..2d65c6700 --- /dev/null +++ b/tests/data/cocom.json @@ -0,0 +1,221 @@ +[{ + "backend_name": "CoCom", + "backend_version": "0.2.5", + "category": "code_complexity_lizard_file", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun Jun 2 18:34:23 2019 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun Jun 2 18:34:23 2019 +0200", + "Signed-off-by": ["Valerio Cosentino "], + "analysis": [{ + "avg_ccn": 2.4193548387096775, + "avg_loc": 8.419354838709678, + "avg_tokens": 60.96774193548387, + "blanks": 158, + "ccn": 75, + "comments": 193, + "ext": "py", + "file_path": "graal/graal.py", + "loc": 372, + "num_funs": 31, + "tokens": 2207 + }], + "analyzer": "lizard_file", + "commit": "692ed86f888d2e7a5ce81a5b8a90f47d05cc5588", + "message": "[graal] Derive `git_path` from `uri`\n \n This code derives the `git_path` of a target repository\n based on its `uri`. This change is needed to allow the\n execution from mordred/ELK, as done with Perceval.\n\n Signed-off-by: Valerio Cosentino " + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562053790.544543, + "updated_on": 1559493263.0, + "uuid": "f86b37d493386ec7467976ff5a707d9c72c54cf9" + }, + { + "backend_name": "CoCom", + "backend_version": "0.2.5", + "category": "code_complexity_lizard_file", + "data": { + "Author": "inishchith ", + "AuthorDate": "Mon Jun 3 22:44:15 2019 +0530", + "Commit": "inishchith ", + "CommitDate": "Mon Jun 3 22:47:27 2019 +0530", + "Signed-off-by": ["inishchith "], + "analysis": [{ + "avg_ccn": 1.3461538461538463, + "avg_loc": 8.826923076923077, + "avg_tokens": 81.92307692307692, + "blanks": 204, + "ccn": 70, + "comments": 77, + "ext": "py", + "file_path": "tests/test_graal.py", + "loc": 527, + "num_funs": 52, + "tokens": 4623 + }], + "analyzer": "lizard_file", + "commit": "41f207a9349ae497055ac03157d9915ae81031e0", + "message": "[tests] Add test for deriving `git_path` from `uri`\n \n Signed-off-by: inishchith " + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562053790.902134, + "updated_on": 1559582247.0, + "uuid": "fc17ad9f41767d66c4d2aed6d4b0ba5d072c9980" + }, + { + "backend_name": "CoCom", + "backend_version": "0.2.5", + "category": "code_complexity_lizard_file", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Thu Jun 27 09:25:50 2019 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Thu Jun 27 09:25:50 2019 +0200", + "Merge": "5a526a6 26921fe", + "analysis": [{ + "avg_ccn": 2.6666666666666665, + "avg_loc": 19.333333333333332, + "avg_tokens": 129.66666666666666, + "blanks": 26, + "ccn": 8, + "comments": 63, + "ext": "py", + "file_path": "graal/backends/core/analyzers/lizard.py", + "loc": 80, + "num_funs": 3, + "tokens": 421 + }, + { + "avg_ccn": 2.6363636363636362, + "avg_loc": 8.818181818181818, + "avg_tokens": 57.63636363636363, + "blanks": 58, + "ccn": 29, + "comments": 107, + "ext": "py", + "file_path": "graal/backends/core/cocom.py", + "loc": 178, + "num_funs": 11, + "tokens": 938 + }, + 
{ + "avg_ccn": 1.4, + "avg_loc": 13.533333333333333, + "avg_tokens": 127.26666666666667, + "blanks": 71, + "ccn": 21, + "comments": 39, + "ext": "py", + "file_path": "tests/test_cocom.py", + "loc": 234, + "num_funs": 15, + "tokens": 2056 + }, + { + "avg_ccn": 1.3333333333333333, + "avg_loc": 26.666666666666668, + "avg_tokens": 269.3333333333333, + "blanks": 17, + "ccn": 4, + "comments": 25, + "ext": "py", + "file_path": "tests/test_lizard.py", + "loc": 89, + "num_funs": 3, + "tokens": 852 + } + ], + "analyzer": "lizard_file", + "commit": "bfe91c3f9ca046084143f15e117bdd691e0fe12f", + "message": "Merge branch repository_level_cocom_lizard of https: //github.com/inishchith/graal\n \nMerges #39" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562053800.728394, + "updated_on": 1561620350.0, + "uuid": "49a416e4ab44e6f3b02eb96b08a026abdb6afa96" + }, + { + "backend_name": "CoCom", + "backend_version": "0.2.5", + "category": "code_complexity_lizard_file", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Thu May 17 17:26:14 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Thu May 17 17:26:14 2018 +0200", + "analysis": [{ + "avg_ccn": null, + "avg_loc": null, + "avg_tokens": null, + "blanks": null, + "ccn": null, + "comments": null, + "file_path": "tests/data/analyzers/sample_code.py", + "loc": null, + "num_funs": null, + "tokens": null + }, + { + "avg_ccn": 2.259259259259259, + "avg_loc": 7.851851851851852, + "avg_tokens": 55.81481481481482, + "blanks": 135, + "ccn": 61, + "comments": 169, + "ext": "py", + "file_path": "graal/graal.py", + "loc": 315, + "num_funs": 27, + "tokens": 1837 + }, + { + "avg_ccn": 2.3333333333333335, + "avg_loc": 6.555555555555555, + "avg_tokens": 55.0, + "blanks": 27, + "ccn": 21, + "comments": 31, + "ext": "py", + "file_path": "tests/data/sample_code.py", + "loc": 72, + "num_funs": 9, + "tokens": 535 + }, + { + "avg_ccn": null, + "avg_loc": null, + "avg_tokens": null, + "blanks": null, + "ccn": null, + "comments": null, + "file_path": "tests/data/graal/graaltest.zip", + "loc": null, + "num_funs": null, + "tokens": null + }, + { + "blanks": 62, + "comments": 39, + "ext": "zip", + "file_path": "tests/data/graaltest.zip", + "loc": 145 + } + ], + "analyzer": "lizard_file", + "commit": "f858376fdb3232417c8de196e04ce9db0e05c3e4", + "message": "[graal] Modify git_path parameterThis code replaces the parameter `git_path` to `gitpath` to ease\nthe integration with arthur. Thus git and graal tasks share somecommon parameters." 
+ }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1564575285.20279, + "updated_on": 1526570774.0, + "uuid": "0387fc9162b87ae8ad06f626be921d796e32c687" + } +] diff --git a/tests/data/colic.json b/tests/data/colic.json new file mode 100644 index 000000000..f559075de --- /dev/null +++ b/tests/data/colic.json @@ -0,0 +1,319 @@ +[{ + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode_cli", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun May 6 13:11:43 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun May 6 13:11:43 2018 +0200", + "analysis": [{ + "authors": [], + "base_name": "LICENSE", + "copyrights": [{ + "end_line": 6, + "start_line": 4, + "value": "Copyright (c) 2007 Free Software Foundation, Inc. " + }], + "date": "2019-07-08", + "dirs_count": 0, + "extension": "", + "file_path": "LICENSE", + "file_type": "ASCII text", + "files_count": 0, + "holders": [{ + "end_line": 6, + "start_line": 4, + "value": "Free Software Foundation, Inc." + }], + "is_archive": false, + "is_binary": false, + "is_media": false, + "is_script": false, + "is_source": false, + "is_text": true, + "license_expressions": [ + "gpl-3.0" + ], + "licenses": [{ + "category": "Copyleft", + "end_line": 674, + "homepage_url": "http://www.gnu.org/licenses/gpl-3.0.html", + "is_exception": false, + "key": "gpl-3.0", + "matched_rule": { + "identifier": "gpl-3.0.LICENSE", + "is_license_notice": false, + "is_license_reference": false, + "is_license_tag": false, + "is_license_text": true, + "license_expression": "gpl-3.0", + "licenses": [ + "gpl-3.0" + ], + "match_coverage": 100.0, + "matched_length": 5700, + "matcher": "1-hash", + "rule_length": 5700, + "rule_relevance": 100 + }, + "matched_text": "GNU GENERAL PUBLIC LICENSE\n Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. 
\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n", + "name": "GNU General Public License 3.0", + "owner": "Free Software Foundation (FSF)", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-3.0", + "score": 100.0, + "short_name": "GPL 3.0", + "spdx_license_key": "GPL-3.0-only", + "spdx_url": "https://spdx.org/licenses/GPL-3.0-only", + "start_line": 1, + "text_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html" + }], + "md5": "d32239bcb673463ab874e80d47fae504", + "mime_type": "text/plain", + "name": "LICENSE", + "path": "LICENSE", + "programming_language": null, + "scan_errors": [], + "sha1": "8624bcdae55baeef00cd11d5dfcfa60f68710a02", + "size": 35147, + "size_count": 0, + "type": "file" + }], + "analyzer": "scancode_cli", + "commit": "2fb9a49363021922eb0fcc9874baabfc252a827c", + "message": "[graal] Initial commit" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562563540.615095, + "updated_on": 1525605103.0, + "uuid": "29d7a294d2316825de824f1084a783f8479073e0" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode_cli", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun May 6 13:56:51 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun May 6 13:56:51 2018 +0200", + "analysis": [{ + "authors": [{ + "end_line": 20, + "start_line": 19, + "value": "Valerio Cosentino " + }], + "base_name": "codecomplexity", + "copyrights": [{ + "end_line": 3, + "start_line": 3, + "value": "Copyright (c) 2015-2018 Bitergia" + }], + "date": "2019-07-08", + "dirs_count": 0, + "extension": ".py", + "file_path": "graal/codecomplexity.py", + "file_type": "Python script, ASCII text executable", + "files_count": 0, + "holders": [{ + "end_line": 3, + "start_line": 3, + "value": "Bitergia" + }], + "is_archive": false, + "is_binary": false, + "is_media": false, + "is_script": true, + "is_source": true, + "is_text": true, + "license_expressions": [ + "gpl-3.0-plus" + ], + "licenses": [{ + "category": "Copyleft", + "end_line": 17, + "homepage_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html", + "is_exception": false, + "key": "gpl-3.0-plus", + "matched_rule": { + "identifier": "gpl-3.0-plus_12.RULE", + "is_license_notice": true, + "is_license_reference": false, + "is_license_tag": false, + "is_license_text": false, + "license_expression": "gpl-3.0-plus", + "licenses": [ + "gpl-3.0-plus" + ], + "match_coverage": 98.2, + "matched_length": 109, + "matcher": "3-seq", + "rule_length": 111, + "rule_relevance": 100 + }, + "matched_text": "This program is free software; you can redistribute it and/or modify\n# it under the terms of the GNU General Public License as published by\n# the Free Software Foundation; either version 3 [of] [the] [License], or\n# (at your option) any later version.\n#\n# This program is distributed in the hope that it will be useful,\n# but WITHOUT ANY WARRANTY; without even the implied warranty of\n# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the\n# GNU General Public License for more details.\n#\n# You should have received a copy of the GNU General Public License\n# along with this program; if not, write to the Free Software\n# Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-", + "name": "GNU General Public License 3.0 or later", + "owner": "Free Software Foundation (FSF)", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-3.0-plus", + "score": 98.2, + "short_name": "GPL 3.0 or later", + "spdx_license_key": "GPL-3.0-or-later", + "spdx_url": "https://spdx.org/licenses/GPL-3.0-or-later", + "start_line": 5, + "text_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html" + }], + "md5": "aa66e700b06ead2a28c2dc29633ebc00", + "mime_type": "text/x-python", + "name": "codecomplexity.py", + "path": "codecomplexity.py", + "programming_language": "Python", + "scan_errors": [], + "sha1": "124e07ae6c850eb232aaf07f43cdb2b2ad2a1db1", + "size": 7817, + "size_count": 0, + "type": "file" + }], + "analyzer": "scancode_cli", + "commit": "a957488c9bd95e3b72a30611edc61496ee152430", + "message": "[codecomplexity] Enable analysis with no file filtering\n\nThis patch allows to handle analysis without file filtering." + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1562563562.34835, + "updated_on": 1525607811.0, + "uuid": "ce7c47568fd87100aff497dd7677b0736d85db1e" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode", + "data": { + "Author": "Valerio Cosentino ", + "AuthorDate": "Sun May 6 13:56:51 2018 +0200", + "Commit": "Valerio Cosentino ", + "CommitDate": "Sun May 6 13:56:51 2018 +0200", + "analysis": [{ + "copyrights": [{ + "end_line": 3, + "start_line": 3, + "value": "Copyright (c) 2015-2018 Bitergia" + }], + "file_path": "graal/codecomplexity.py", + "licenses": [{ + "category": "Copyleft", + "end_line": 17, + "homepage_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html", + "is_exception": false, + "key": "gpl-3.0-plus", + "matched_rule": { + "identifier": "gpl-3.0-plus_117.RULE", + "is_license_notice": true, + "is_license_reference": false, + "is_license_tag": false, + "is_license_text": false, + "license_expression": "gpl-3.0-plus", + "licenses": [ + "gpl-3.0-plus" + ], + "match_coverage": 97.35, + "matched_length": 110, + "matcher": "3-seq", + "rule_length": 113, + "rule_relevance": 100.0 + }, + "name": "GNU General Public License 3.0 or later", + "owner": "Free Software Foundation (FSF)", + "reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-3.0-plus", + "score": 97.35, + "short_name": "GPL 3.0 or later", + "spdx_license_key": "GPL-3.0-or-later", + "spdx_url": "https://spdx.org/licenses/GPL-3.0-or-later", + "start_line": 5, + "text_url": "http://www.gnu.org/licenses/gpl-3.0-standalone.html" + }] + }], + "analyzer": "scancode", + "commit": "a957488c9bd95e3b72a30611edc61496ee152430", + "message": "[codecomplexity] Enable analysis with no file filtering\n\nThis patch allows to handle analysis without file filtering." 
+ }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1565108843.015344, + "updated_on": 1525607811.0, + "uuid": "ce7c47568fd87100aff497dd7677b0736d85db1e" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_scancode", + "data": { + "Author": "valerio ", + "AuthorDate": "Sun May 6 14:02:36 2018 +0200", + "Commit": "GitHub ", + "CommitDate": "Sun May 6 14:02:36 2018 +0200", + "analysis": [{ + "copyrights": [], + "file_path": "README.md", + "licenses": [] + }], + "analyzer": "scancode", + "commit": "8aedf09e36008fee19192985c0eb51879c6c61e4", + "message": "Create README.md" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1565108866.965087, + "updated_on": 1525608156.0, + "uuid": "856beb87d6b324b136e718295cc6ad69343a1066" + }, + { + "backend_name": "CoLic", + "backend_version": "0.5.0", + "category": "code_license_nomos", + "data": { + "Author": "inishchith ", + "AuthorDate": "Mon Feb 25 21:44:23 2019 +0530", + "Commit": "Valerio Cosentino ", + "CommitDate": "Tue Feb 26 16:24:43 2019 +0100", + "analysis": [{ + "file_path": "tests/test_colic.py", + "licenses": [ + "GPL-3.0" + ] + }, + { + "file_path": "tests/test_nomos.py", + "licenses": [ + "GPL-3.0" + ] + }, + { + "file_path": "tests/test_scancode.py", + "licenses": [ + "GPL-3.0" + ] + }, + { + "file_path": "tests/utils.py", + "licenses": [ + "GPL-3.0" + ] + } + ], + "analyzer": "nomos", + "commit": "dda651a12eb05b2d604522b4fbdbf07d3e213eff", + "message": "[tests] Move executable path to utils.py\n\nMove executable paths - NOMOS_PATH and SCANCODE_PATH to utils.py" + }, + "graal_version": "0.2.1", + "origin": "https://github.com/chaoss/grimoirelab-graal", + "tag": "https://github.com/chaoss/grimoirelab-graal", + "timestamp": 1565204679.302685, + "updated_on": 1551194683.0, + "uuid": "79a561015d5d49c3ec6754a05db24735f957814e" + } +] diff --git a/tests/test_cocom.py b/tests/test_cocom.py new file mode 100644 index 000000000..16f4129d9 --- /dev/null +++ b/tests/test_cocom.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# Authors: +# Nishchith Shetty +# +import logging +import unittest + +from base import TestBaseBackend +from grimoire_elk.enriched.cocom import logger + + +HEADER_JSON = {"Content-Type": "application/json"} + + +class TestCoCom(TestBaseBackend): + """Test CoCom backend""" + + connector = "cocom" + ocean_index = "test_" + connector + enrich_index = "test_" + connector + "_enrich" + + def test_has_identites(self): + """Test value of has_identities method""" + + enrich_backend = self.connectors[self.connector][2]() + self.assertFalse(enrich_backend.has_identities()) + + def test_items_to_raw(self): + """Test whether JSON items are properly inserted into ES""" + + result = self._test_items_to_raw() + + self.assertGreater(result['items'], 0) + self.assertGreater(result['raw'], 0) + self.assertGreaterEqual(result['items'], result['raw']) + + def test_raw_to_enrich(self): + """Test whether the raw index is properly enriched""" + + result = self._test_raw_to_enrich() + + self.assertGreater(result['raw'], 0) + self.assertGreater(result['enrich'], 0) + self.assertGreaterEqual(result['enrich'], result['raw']) + + enrich_backend = self.connectors[self.connector][2]() + + item = self.items[0] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['ccn'], 75) + self.assertEqual(eitem['num_funs'], 31) + self.assertEqual(eitem['tokens'], 2207) + self.assertEqual(eitem['loc'], 372) + self.assertEqual(eitem['ext'], "py") + self.assertEqual(eitem['blanks'], 158) + self.assertEqual(eitem['comments'], 193) + self.assertEqual(eitem['file_path'], "graal/graal.py") + self.assertEqual(eitem['modules'], ["graal"]) + self.assertEqual(eitem["comments_per_loc"], 0.52) + self.assertEqual(eitem["blanks_per_loc"], 0.42) + self.assertEqual(eitem["loc_per_function"], 12.0) + + item = self.items[1] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['ccn'], 70) + self.assertEqual(eitem['num_funs'], 52) + self.assertEqual(eitem['tokens'], 4623) + self.assertEqual(eitem['loc'], 527) + self.assertEqual(eitem['ext'], "py") + self.assertEqual(eitem['blanks'], 204) + self.assertEqual(eitem['comments'], 77) + self.assertEqual(eitem['file_path'], "tests/test_graal.py") + self.assertEqual(eitem['modules'], ["tests"]) + self.assertEqual(eitem["comments_per_loc"], 0.15) + self.assertEqual(eitem["blanks_per_loc"], 0.39) + self.assertEqual(eitem["loc_per_function"], 10.13) + + item = self.items[2] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['ccn'], 8) + self.assertEqual(eitem['num_funs'], 3) + self.assertEqual(eitem['tokens'], 421) + self.assertEqual(eitem['loc'], 80) + self.assertEqual(eitem['ext'], "py") + self.assertEqual(eitem['blanks'], 26) + self.assertEqual(eitem['comments'], 63) + self.assertEqual(eitem['file_path'], "graal/backends/core/analyzers/lizard.py") + self.assertEqual(eitem['modules'], ["graal", "graal/backends", "graal/backends/core", "graal/backends/core/analyzers"]) + self.assertEqual(eitem["comments_per_loc"], 0.79) + self.assertEqual(eitem["blanks_per_loc"], 0.33) + self.assertEqual(eitem["loc_per_function"], 26.67) + + item = self.items[3] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['ccn'], None) + self.assertEqual(eitem['num_funs'], None) + self.assertEqual(eitem['tokens'], None) + self.assertEqual(eitem['loc'], None) + self.assertEqual(eitem['ext'], None) + self.assertEqual(eitem['blanks'], None) + self.assertEqual(eitem['comments'], None) + self.assertEqual(eitem['file_path'], 
"tests/data/analyzers/sample_code.py") + self.assertEqual(eitem['modules'], ["tests", "tests/data", "tests/data/analyzers"]) + self.assertEqual(eitem["comments_per_loc"], None) + self.assertEqual(eitem["blanks_per_loc"], None) + self.assertEqual(eitem["loc_per_function"], None) + + def test_cocom_analysis_study(self): + """ Test that the cocom analysis study works correctly """ + + study, ocean_backend, enrich_backend = self._test_study('enrich_cocom_analysis') + + with self.assertLogs(logger, level='INFO') as cm: + + if study.__name__ == "enrich_cocom_analysis": + study(ocean_backend, enrich_backend) + self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.cocom:[enrich-cocom-analysis] Start ' + 'enrich_cocom_analysis study') + self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.cocom:[enrich-cocom-analysis] End ' + 'enrich_cocom_analysis study') + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logging.getLogger("urllib3").setLevel(logging.WARNING) + logging.getLogger("requests").setLevel(logging.WARNING) + unittest.main(warnings='ignore') diff --git a/tests/test_colic.py b/tests/test_colic.py new file mode 100644 index 000000000..b7c81dfa0 --- /dev/null +++ b/tests/test_colic.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2019 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Authors: +# Nishchith Shetty +# +import logging +import unittest + +from base import TestBaseBackend +from grimoire_elk.enriched.colic import logger + + +HEADER_JSON = {"Content-Type": "application/json"} + + +class TestCoLic(TestBaseBackend): + """Test CoLic backend""" + + connector = "colic" + ocean_index = "test_" + connector + enrich_index = "test_" + connector + "_enrich" + + def test_has_identites(self): + """Test value of has_identities method""" + + enrich_backend = self.connectors[self.connector][2]() + self.assertFalse(enrich_backend.has_identities()) + + def test_items_to_raw(self): + """Test whether JSON items are properly inserted into ES""" + + result = self._test_items_to_raw() + + self.assertGreater(result['items'], 0) + self.assertGreater(result['raw'], 0) + self.assertGreaterEqual(result['items'], result['raw']) + + def test_raw_to_enrich(self): + """Test whether the raw index is properly enriched""" + + result = self._test_raw_to_enrich() + + self.assertGreater(result['raw'], 0) + self.assertGreater(result['enrich'], 0) + self.assertGreaterEqual(result['enrich'], result['raw']) + + enrich_backend = self.connectors[self.connector][2]() + + item = self.items[0] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], ["gpl-3.0"]) + self.assertEqual(eitem['has_license'], 1) + self.assertEqual(eitem['license_name'], ["GNU General Public License 3.0"]) + self.assertEqual(eitem['copyrights'], ["Copyright (c) 2007 Free Software Foundation, Inc. 
"]) + self.assertEqual(eitem['has_copyright'], 1) + self.assertEqual(eitem['modules'], []) + self.assertEqual(eitem['file_path'], "LICENSE") + + item = self.items[1] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], ["gpl-3.0-plus"]) + self.assertEqual(eitem['has_license'], 1) + self.assertEqual(eitem['license_name'], ["GNU General Public License 3.0 or later"]) + self.assertEqual(eitem['copyrights'], ["Copyright (c) 2015-2018 Bitergia"]) + self.assertEqual(eitem['has_copyright'], 1) + self.assertEqual(eitem['modules'], ["graal"]) + self.assertEqual(eitem['file_path'], "graal/codecomplexity.py") + + item = self.items[2] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], ["gpl-3.0-plus"]) + self.assertEqual(eitem['has_license'], 1) + self.assertEqual(eitem['license_name'], ["GNU General Public License 3.0 or later"]) + self.assertEqual(eitem['copyrights'], ["Copyright (c) 2015-2018 Bitergia"]) + self.assertEqual(eitem['has_copyright'], 1) + self.assertEqual(eitem['modules'], ["graal"]) + self.assertEqual(eitem['file_path'], "graal/codecomplexity.py") + + item = self.items[3] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], []) + self.assertEqual(eitem['has_license'], 0) + self.assertEqual(eitem['license_name'], []) + self.assertEqual(eitem['copyrights'], []) + self.assertEqual(eitem['has_copyright'], 0) + self.assertEqual(eitem['modules'], []) + self.assertEqual(eitem['file_path'], "README.md") + + item = self.items[4] + eitem = enrich_backend.get_rich_items(item)[0] + self.assertEqual(eitem['licenses'], ["GPL-3.0"]) + self.assertEqual(eitem['has_license'], 1) + self.assertEqual(eitem['license_name'], ["GPL-3.0"]) + self.assertEqual(eitem['copyrights'], []) + self.assertEqual(eitem['has_copyright'], 0) + self.assertEqual(eitem['modules'], ["tests"]) + self.assertEqual(eitem['file_path'], "tests/test_colic.py") + + def test_colic_analysis_study(self): + """ Test that the colic analysis study works correctly """ + + study, ocean_backend, enrich_backend = self._test_study('enrich_colic_analysis') + + with self.assertLogs(logger, level='INFO') as cm: + + if study.__name__ == "enrich_colic_analysis": + study(ocean_backend, enrich_backend) + self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.colic:[enrich-colic-analysis] Start ' + 'enrich_colic_analysis study') + self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.colic:[enrich-colic-analysis] End ' + 'enrich_colic_analysis study') + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') + logging.getLogger("urllib3").setLevel(logging.WARNING) + logging.getLogger("requests").setLevel(logging.WARNING) + unittest.main(warnings='ignore')