Skip to content

Commit 5373bfd

Browse files
committed
[logger] Add logs for study and Fix CoLic query
Add appropriate logs for the enrichers. Update the CoCom study method name. Fix tests. Signed-off-by: inishchith <[email protected]>
1 parent 400a53d commit 5373bfd

File tree

4 files changed

+93
-49
lines changed

4 files changed

+93
-49
lines changed

grimoire_elk/enriched/cocom.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=N
115115
db_user, db_password, db_host)
116116

117117
self.studies = []
118-
self.studies.append(self.enrich_repo_analysis)
118+
self.studies.append(self.enrich_cocom_analysis)
119119

120120
def get_identities(self, item):
121121
""" Return the identities from an item """
@@ -242,11 +242,11 @@ def enrich_items(self, ocean_backend, events=False):
242242

243243
return num_items
244244

245-
def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
246-
out_index="cocom_enrich_graal_repo", interval_months=[3],
247-
date_field="grimoire_creation_date"):
245+
def enrich_cocom_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
246+
out_index="cocom_enrich_graal_repo", interval_months=[3],
247+
date_field="grimoire_creation_date"):
248248

249-
logger.info("[cocom] Starting enrich_repository_analysis study")
249+
logger.info("[enrich-cocom-analysis] Start enrich_cocom_analysis study")
250250

251251
es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
252252
verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
@@ -259,11 +259,16 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal
259259

260260
repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
261261
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
262+
263+
logger.info("[enrich-cocom-analysis] {} repositories to process".format(len(repositories)))
264+
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
265+
es_out.add_alias("cocom_study")
266+
262267
num_items = 0
263268
ins_items = 0
264269

265270
for repository_url in repositories:
266-
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
271+
logger.info("[enrich-cocom-analysis] Start analysis for {}".format(repository_url))
267272
evolution_items = []
268273

269274
for interval in interval_months:
@@ -321,8 +326,12 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal
321326

322327
if num_items != ins_items:
323328
missing = num_items - ins_items
324-
logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items))
329+
logger.error(
330+
"[enrich-cocom-analysis] %s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items)
331+
)
325332
else:
326-
logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items))
333+
logger.info("[enrich-cocom-analysis] %s items inserted for Graal CoCom Analysis Study", str(num_items))
334+
335+
logger.info("[enrich-cocom-analysis] End analysis for {} with month interval".format(repository_url, interval))
327336

328-
logger.info("[cocom] Ending enrich_repository_analysis study")
337+
logger.info("[enrich-cocom-analysis] End enrich_cocom_analysis study")

grimoire_elk/enriched/colic.py

Lines changed: 67 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@
2626
from .enrich import (Enrich,
2727
metadata)
2828
from .graal_study_evolution import (get_to_date,
29-
get_unique_repository,
30-
get_files_at_time)
29+
get_unique_repository)
3130
from .utils import fix_field_date
3231
from ..elastic_mapping import Mapping as BaseMapping
3332

@@ -107,7 +106,43 @@ def has_identities(self):
107106
def get_field_unique_id(self):
108107
return "id"
109108

110-
def get_licensed_files(self, repository_url, to_date):
109+
def __get_total_files(self, repository_url, to_date):
110+
""" Retrieve the total number of files until to_date, corresponding
111+
to the given repository
112+
"""
113+
114+
query_total_files = """
115+
{
116+
"size": 0,
117+
"aggs": {
118+
"1": {
119+
"cardinality": {
120+
"field": "file_path"
121+
}
122+
}
123+
},
124+
"query": {
125+
"bool": {
126+
"filter": [{
127+
"term": {
128+
"origin": "%s"
129+
}
130+
},
131+
{
132+
"range": {
133+
"metadata__updated_on": {
134+
"lte": "%s"
135+
}
136+
}
137+
}]
138+
}
139+
}
140+
}
141+
""" % (repository_url, to_date)
142+
143+
return query_total_files
144+
145+
def __get_licensed_files(self, repository_url, to_date):
111146
""" Retrieve all the licensed files until the to_date, corresponding
112147
to the given repository.
113148
"""
@@ -124,18 +159,14 @@ def get_licensed_files(self, repository_url, to_date):
124159
},
125160
"query": {
126161
"bool": {
127-
"must": [{
128-
"match_phrase": {
129-
"has_license": {
130-
"query": 1
131-
}
162+
"filter": [{
163+
"term": {
164+
"has_license": 1
132165
}
133166
},
134167
{
135-
"match_phrase": {
136-
"origin": {
137-
"query": "%s"
138-
}
168+
"term": {
169+
"origin": "%s"
139170
}
140171
},
141172
{
@@ -152,7 +183,7 @@ def get_licensed_files(self, repository_url, to_date):
152183

153184
return query_licensed_files
154185

155-
def get_copyrighted_files(self, repository_url, to_date):
186+
def __get_copyrighted_files(self, repository_url, to_date):
156187
""" Retrieve all the copyrighted files until the to_date, corresponding
157188
to the given repository.
158189
"""
@@ -169,18 +200,14 @@ def get_copyrighted_files(self, repository_url, to_date):
169200
},
170201
"query": {
171202
"bool": {
172-
"must": [{
173-
"match_phrase": {
174-
"has_copyright": {
175-
"query": 1
176-
}
203+
"filter": [{
204+
"term": {
205+
"has_copyright": 1
177206
}
178207
},
179208
{
180-
"match_phrase": {
181-
"origin": {
182-
"query": "%s"
183-
}
209+
"term": {
210+
"origin": "%s"
184211
}
185212
},
186213
{
@@ -338,7 +365,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
338365
out_index="colic_enrich_graal_repo", interval_months=[3],
339366
date_field="grimoire_creation_date"):
340367

341-
logger.info("[colic] Starting enrich_colic_analysis study")
368+
logger.info("[enrich-colic-analysis] Start enrich_colic_analysis study")
342369

343370
es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
344371
verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
@@ -350,12 +377,17 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
350377
body=get_unique_repository())
351378

352379
repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
380+
381+
logger.info("[enrich-colic-analysis] {} repositories to process".format(len(repositories)))
382+
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
383+
es_out.add_alias("colic_study")
384+
353385
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
354386
num_items = 0
355387
ins_items = 0
356388

357389
for repository_url in repositories:
358-
es_out = ElasticSearch(enrich_backend.elastic.url, out_index, mappings=Mapping)
390+
logger.info("[enrich-colic-analysis] Start analysis for {}".format(repository_url))
359391
evolution_items = []
360392

361393
for interval in interval_months:
@@ -366,20 +398,19 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
366398
while to_month < current_month:
367399
copyrighted_files_at_time = es_in.search(
368400
index=in_index,
369-
body=self.get_copyrighted_files(repository_url, to_month.isoformat()))
401+
body=self.__get_copyrighted_files(repository_url, to_month.isoformat()))
370402

371403
licensed_files_at_time = es_in.search(
372404
index=in_index,
373-
body=self.get_licensed_files(repository_url, to_month.isoformat()))
405+
body=self.__get_licensed_files(repository_url, to_month.isoformat()))
374406

375407
files_at_time = es_in.search(
376408
index=in_index,
377-
body=get_files_at_time(repository_url, to_month.isoformat()))
409+
body=self.__get_total_files(repository_url, to_month.isoformat()))
378410

379411
licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
380412
copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
381-
# TODO: Fix - need more efficient query
382-
total_files = len(files_at_time['aggregations']['file_stats'].get("buckets", []))
413+
total_files = int(files_at_time["aggregations"]["1"]["value"])
383414

384415
if not total_files:
385416
to_month = to_month + relativedelta(months=+interval)
@@ -411,8 +442,12 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
411442

412443
if num_items != ins_items:
413444
missing = num_items - ins_items
414-
logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items))
445+
logger.error(
446+
"[enrich-colic-analysis] %s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items)
447+
)
415448
else:
416-
logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items))
449+
logger.info("[enrich-colic-analysis] %s items inserted for Graal CoLic Analysis Study", str(num_items))
450+
451+
logger.info("[enrich-colic-analysis] End analysis for {} with month interval".format(repository_url, interval))
417452

418-
logger.info("[colic] Ending enrich_colic_analysis study")
453+
logger.info("[enrich-colic-analysis] End enrich_colic_analysis study")

tests/test_cocom.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -124,16 +124,16 @@ def test_raw_to_enrich(self):
124124
def test_cocom_analysis_study(self):
125125
""" Test that the cocom analysis study works correctly """
126126

127-
study, ocean_backend, enrich_backend = self._test_study('enrich_repo_analysis')
127+
study, ocean_backend, enrich_backend = self._test_study('enrich_cocom_analysis')
128128

129129
with self.assertLogs(logger, level='INFO') as cm:
130130

131-
if study.__name__ == "enrich_repo_analysis":
131+
if study.__name__ == "enrich_cocom_analysis":
132132
study(ocean_backend, enrich_backend)
133-
self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.cocom:[cocom] Starting '
134-
'enrich_repository_analysis study')
135-
self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.cocom:[cocom] Ending '
136-
'enrich_repository_analysis study')
133+
self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.cocom:[enrich-cocom-analysis] Start '
134+
'enrich_cocom_analysis study')
135+
self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.cocom:[enrich-cocom-analysis] End '
136+
'enrich_cocom_analysis study')
137137

138138

139139
if __name__ == "__main__":

tests/test_colic.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ def test_colic_analysis_study(self):
120120

121121
if study.__name__ == "enrich_colic_analysis":
122122
study(ocean_backend, enrich_backend)
123-
self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.colic:[colic] Starting '
123+
self.assertEqual(cm.output[0], 'INFO:grimoire_elk.enriched.colic:[enrich-colic-analysis] Start '
124124
'enrich_colic_analysis study')
125-
self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.colic:[colic] Ending '
125+
self.assertEqual(cm.output[-1], 'INFO:grimoire_elk.enriched.colic:[enrich-colic-analysis] End '
126126
'enrich_colic_analysis study')
127127

128128

0 commit comments

Comments
 (0)