[graal] Add support of Graal CoCom & CoLic Backend #672
Merged: valeriocos merged 8 commits into chaoss:master from inishchith:gsoc-graal-2019-integration on Aug 23, 2019
Commits (8):
7afa5c5  [requirements.txt] Add Graal module dependency for integration (inishchith)
82b2a11  [graal] Add CoCom & CoLic enricher along with study implementation (inishchith)
1308300  [graal] Add connector for Graal Backends and Segregate common methods (inishchith)
7e79734  [graal:tests] Add appropriate tests for Graal integration (WIP) (inishchith)
4b4e088  [schema] Add CoCom and CoLic schema (inishchith)
e0430d1  [tests:study] Add tests for CoCom & CoLic study implementation (inishchith)
400a53d  [colic] Add category reference implementation and corresponding tests (inishchith)
effc887  [logger] Add logs for study and Fix CoLic query (inishchith)
@@ -0,0 +1,338 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2019 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
#   Valerio Cosentino <[email protected]>
#   Nishchith Shetty <[email protected]>
#

import logging
from dateutil.relativedelta import relativedelta

from elasticsearch import Elasticsearch as ES, RequestsHttpConnection

from .enrich import (Enrich,
                     metadata)
from .graal_study_evolution import (get_to_date,
                                    get_unique_repository,
                                    get_files_at_time)
from .utils import fix_field_date
from ..elastic_mapping import Mapping as BaseMapping

from grimoirelab_toolkit.datetime import datetime_utcnow
from grimoire_elk.elastic import ElasticSearch

MAX_SIZE_BULK_ENRICHED_ITEMS = 200

logger = logging.getLogger(__name__)


class Mapping(BaseMapping):

    @staticmethod
    def get_elastic_mappings(es_major):
        """Get Elasticsearch mapping.

        Declare explicit types for the study fields: keyword for the
        identifiers, date for the study creation date and numeric types
        for the metric totals.

        :param es_major: major version of Elasticsearch, as string
        :returns: dictionary with a key, 'items', with the mapping
        """

        mapping = '''
        {
            "dynamic": true,
            "properties": {
                "id": {
                    "type": "keyword"
                },
                "interval_months": {
                    "type": "long"
                },
                "origin": {
                    "type": "keyword"
                },
                "study_creation_date": {
                    "type": "date"
                },
                "total_blanks": {
                    "type": "long"
                },
                "total_blanks_per_loc": {
                    "type": "float"
                },
                "total_ccn": {
                    "type": "long"
                },
                "total_comments": {
                    "type": "long"
                },
                "total_comments_per_loc": {
                    "type": "float"
                },
                "total_files": {
                    "type": "long"
                },
                "total_loc": {
                    "type": "long"
                },
                "total_loc_per_function": {
                    "type": "float"
                },
                "total_num_funs": {
                    "type": "long"
                },
                "total_tokens": {
                    "type": "long"
                }
            }
        }
        '''

        return {"items": mapping}


class CocomEnrich(Enrich):
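    # Per-file source code measures produced by the Graal CoCom backend:
    # cyclomatic complexity (ccn), number of functions, token count,
    # lines of code, comment lines and blank lines.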
    metrics = ["ccn", "num_funs", "tokens", "loc", "comments", "blanks"]

    def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
                 db_user='', db_password='', db_host=''):
        super().__init__(db_sortinghat, db_projects_map, json_projects_map,
                         db_user, db_password, db_host)

        self.studies = []
        self.studies.append(self.enrich_cocom_analysis)

    def get_identities(self, item):
        """Return the identities from an item"""
        identities = []

        return identities

    def has_identities(self):
        """Return whether the enriched items contain identities"""

        return False

    def get_field_unique_id(self):
        return "id"

    def extract_modules(self, file_path):
        """Extract the module paths from the given file path"""
        path_chunks = file_path.split('/')

        modules = []
        for idx in range(len(path_chunks)):
            sub_path = '/'.join(path_chunks[:idx])

            if sub_path:
                modules.append(sub_path)

        return modules
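    # Example: extract_modules("grimoire_elk/enriched/cocom.py") returns
    # ["grimoire_elk", "grimoire_elk/enriched"]; the file name itself is
    # never included, since the sub paths stop before the last chunk.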

    @metadata
    def get_rich_item(self, file_analysis):

        eitem = {}
        for metric in self.metrics:
            if file_analysis.get(metric, None) is not None:
                eitem[metric] = file_analysis[metric]
            else:
                eitem[metric] = None

        eitem["file_path"] = file_analysis.get("file_path", None)
        eitem["ext"] = file_analysis.get("ext", None)
        eitem['modules'] = self.extract_modules(eitem['file_path'])
        eitem = self.__add_derived_metrics(file_analysis, eitem)

        return eitem
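    # Illustrative (hypothetical values): given
    #   file_analysis = {"file_path": "graal/graal.py", "ext": "py",
    #                    "loc": 400, "comments": 80, "blanks": 40, ...}
    # the rich item carries the six metrics plus file_path, ext, the parent
    # modules list and the derived *_per_loc / loc_per_function fields.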

    def get_rich_items(self, item):
        # The real data
        entry = item['data']

        enriched_items = []

        for file_analysis in entry["analysis"]:
            eitem = self.get_rich_item(file_analysis)

            for f in self.RAW_FIELDS_COPY:
                if f in item:
                    eitem[f] = item[f]
                else:
                    eitem[f] = None

            # common attributes
            eitem['commit_sha'] = entry['commit']
            eitem['author'] = entry['Author']
            eitem['committer'] = entry['Commit']
            eitem['message'] = entry['message']
            eitem['author_date'] = fix_field_date(entry['AuthorDate'])
            eitem['commit_date'] = fix_field_date(entry['CommitDate'])

            if self.prjs_map:
                eitem.update(self.get_item_project(eitem))

            # uuid
            eitem['id'] = "{}_{}".format(eitem['commit_sha'], eitem['file_path'])

            eitem.update(self.get_grimoire_fields(entry["AuthorDate"], "file"))

            self.add_repository_labels(eitem)
            self.add_metadata_filter_raw(eitem)

            enriched_items.append(eitem)

        return enriched_items
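    # One enriched item is produced per analyzed file per commit; its "id"
    # (commit_sha + file_path) is the unique field used when bulk uploading.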

    def __add_derived_metrics(self, file_analysis, eitem):
        """Add derived metric fields"""

        # TODO: set the ratios to None rather than clamping with 1
        if eitem["loc"] is not None and eitem["comments"] is not None and eitem["num_funs"] is not None:
            eitem["comments_per_loc"] = round(eitem["comments"] / max(eitem["loc"], 1), 2)
            eitem["blanks_per_loc"] = round(eitem["blanks"] / max(eitem["loc"], 1), 2)
            eitem["loc_per_function"] = round(eitem["loc"] / max(eitem["num_funs"], 1), 2)
        else:
            eitem["comments_per_loc"] = None
            eitem["blanks_per_loc"] = None
            eitem["loc_per_function"] = None

        return eitem
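    # Worked example: loc=100, comments=20, blanks=10, num_funs=5 gives
    # comments_per_loc=0.2, blanks_per_loc=0.1, loc_per_function=20.0;
    # max(..., 1) guards against division by zero for empty files.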

    def enrich_items(self, ocean_backend, events=False):
        items_to_enrich = []
        num_items = 0
        ins_items = 0

        for item in ocean_backend.fetch():
            rich_items = self.get_rich_items(item)

            items_to_enrich.extend(rich_items)
            if len(items_to_enrich) < MAX_SIZE_BULK_ENRICHED_ITEMS:
                continue

            num_items += len(items_to_enrich)
            ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id())
            items_to_enrich = []

        if len(items_to_enrich) > 0:
            num_items += len(items_to_enrich)
            ins_items += self.elastic.bulk_upload(items_to_enrich, self.get_field_unique_id())

        if num_items != ins_items:
            missing = num_items - ins_items
            logger.error("%s/%s missing items for Cocom", str(missing), str(num_items))
        else:
            logger.info("%s items inserted for Cocom", str(num_items))

        return num_items
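    # Items are flushed in batches of MAX_SIZE_BULK_ENRICHED_ITEMS (200)
    # so a single bulk request to Elasticsearch stays bounded in size.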

    def enrich_cocom_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
                              out_index="cocom_enrich_graal_repo", interval_months=[3],
                              date_field="grimoire_creation_date"):

        logger.info("[enrich-cocom-analysis] Start enrich_cocom_analysis study")

        es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
                   verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
        in_index = enrich_backend.elastic.index
        interval_months = list(map(int, interval_months))

        unique_repos = es_in.search(
            index=in_index,
            body=get_unique_repository())

        repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
        current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)

        logger.info("[enrich-cocom-analysis] {} repositories to process".format(len(repositories)))
        es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
        es_out.add_alias("cocom_study")
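        # The study writes to its own index (out_index), exposed under the
        # "cocom_study" alias, instead of the enriched items index.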

        num_items = 0
        ins_items = 0

        for repository_url in repositories:
            logger.info("[enrich-cocom-analysis] Start analysis for {}".format(repository_url))
            evolution_items = []

            for interval in interval_months:

                to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
                to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)

                while to_month < current_month:
                    files_at_time = es_in.search(
                        index=in_index,
                        body=get_files_at_time(repository_url, to_month.isoformat())
                    )['aggregations']['file_stats'].get("buckets", [])

                    if not len(files_at_time):
                        to_month = to_month + relativedelta(months=+interval)
                        continue

                    repository_name = repository_url.split("/")[-1]
                    evolution_item = {
                        "id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
                        "origin": repository_url,
                        "interval_months": interval,
                        "study_creation_date": to_month.isoformat(),
                        "total_files": len(files_at_time)
                    }
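                    # Sum the per-file metrics into repository-level totals
                    # for this time slice.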

                    for file_ in files_at_time:
                        file_details = file_["1"]["hits"]["hits"][0]["_source"]

                        for metric in self.metrics:
                            total_metric = "total_" + metric
                            evolution_item[total_metric] = evolution_item.get(total_metric, 0)
                            evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0

                    # TODO: set the ratios to None rather than clamping with 1
                    evolution_item["total_comments_per_loc"] = round(
                        evolution_item["total_comments"] / max(evolution_item["total_loc"], 1), 2)
                    evolution_item["total_blanks_per_loc"] = round(
                        evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1), 2)
                    evolution_item["total_loc_per_function"] = round(
                        evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1), 2)

                    evolution_item.update(self.get_grimoire_fields(evolution_item["study_creation_date"], "stats"))
                    evolution_items.append(evolution_item)

                    if len(evolution_items) >= self.elastic.max_items_bulk:
                        num_items += len(evolution_items)
                        ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
                        evolution_items = []

                    to_month = to_month + relativedelta(months=+interval)

                if len(evolution_items) > 0:
                    num_items += len(evolution_items)
                    ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())

                if num_items != ins_items:
                    missing = num_items - ins_items
                    logger.error(
                        "[enrich-cocom-analysis] %s/%s missing items for Graal CoCom Analysis Study",
                        str(missing), str(num_items)
                    )
                else:
                    logger.info("[enrich-cocom-analysis] %s items inserted for Graal CoCom Analysis Study",
                                str(num_items))

                logger.info("[enrich-cocom-analysis] End analysis for {} with {} month interval".format(
                    repository_url, interval))

        logger.info("[enrich-cocom-analysis] End enrich_cocom_analysis study")
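
For reference, each document the study writes has the shape below (a minimal sketch with hypothetical values; the field names come from the mapping and the evolution_item built above, and get_grimoire_fields adds a few extra grimoire fields on top):

    {
        "id": "2019-06-01T00:00:00_grimoirelab-elk_3",
        "origin": "https://github.com/chaoss/grimoirelab-elk",
        "interval_months": 3,
        "study_creation_date": "2019-06-01T00:00:00",
        "total_files": 120,
        "total_ccn": 2300,
        "total_num_funs": 2600,
        "total_tokens": 410000,
        "total_loc": 52000,
        "total_comments": 10400,
        "total_blanks": 5200,
        "total_comments_per_loc": 0.2,
        "total_blanks_per_loc": 0.1,
        "total_loc_per_function": 20.0
    }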