Skip to content

Commit 8c64017

Browse files
committed
[graal:tests] Add appropriate tests for Graal integration (WIP)
Signed-off-by: inishchith <[email protected]>
1 parent 1308300 commit 8c64017

File tree

8 files changed

+659
-110
lines changed

8 files changed

+659
-110
lines changed

grimoire_elk/enriched/cocom.py

Lines changed: 59 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,8 @@ def __add_derived_metrics(self, file_analysis, eitem):
139139

140140
# TODO: Fix Logic: None rather than 1
141141
if None not in [eitem["loc"], eitem["comments"], eitem["num_funs"]]:
142-
eitem["loc_per_comment_lines"] = eitem["loc"] / max(eitem["comments"], 1)
143-
eitem["loc_per_blank_lines"] = eitem["loc"] / max(eitem["blanks"], 1)
142+
eitem["comments_per_loc"] = eitem["comments"] / max(eitem["loc"], 1)
143+
eitem["blanks_per_loc"] = eitem["blanks"] / max(eitem["loc"], 1)
144144
eitem["loc_per_function"] = eitem["loc"] / max(eitem["num_funs"], 1)
145145
else:
146146
eitem["loc_per_comment_lines"] = eitem["loc_per_blank_lines"] = eitem["loc_per_function"] = None
@@ -176,7 +176,7 @@ def enrich_items(self, ocean_backend, events=False):
176176
return num_items
177177

178178
def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
179-
out_index="cocom_enrich_graal_repo", interval_months=3,
179+
out_index="cocom_enrich_graal_repo", interval_months=[3],
180180
date_field="grimoire_creation_date"):
181181

182182
logger.info("Doing enrich_repository_analysis study for index {}"
@@ -185,71 +185,75 @@ def enrich_repo_analysis(self, ocean_backend, enrich_backend, no_incremental=Fal
185185
es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
186186
verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
187187
in_index = enrich_backend.elastic.index
188+
interval_months = list(map(int, interval_months))
188189

189190
unique_repos = es_in.search(
190191
index=in_index,
191192
body=get_unique_repository())
192193

193194
repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
195+
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
194196
num_items = 0
195197
ins_items = 0
196198

197199
for repository_url in repositories:
198200
es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
199201
evolution_items = []
200202

201-
to_month = get_to_date(es_in, in_index, out_index, repository_url)
202-
to_month = to_month.replace(day=1, hour=0, minute=0, second=0)
203-
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
204-
205-
while to_month < current_month:
206-
files_at_time = es_in.search(
207-
index=in_index,
208-
body=get_files_at_time(repository_url, to_month.isoformat())
209-
)['aggregations']['file_stats'].get("buckets", [])
210-
211-
if not len(files_at_time):
212-
to_month = to_month + relativedelta(months=+interval_months)
213-
continue
214-
215-
repository_name = repository_url.split("/")[-1]
216-
evolution_item = {
217-
"id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months),
218-
"origin": repository_url,
219-
"interval_months": interval_months,
220-
"study_creation_date": to_month.isoformat(),
221-
"total_files": len(files_at_time)
222-
}
223-
224-
for file_ in files_at_time:
225-
file_details = file_["1"]["hits"]["hits"][0]["_source"]
226-
227-
for metric in self.metrics:
228-
total_metric = "total_" + metric
229-
evolution_item[total_metric] = evolution_item.get(total_metric, 0)
230-
evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0
231-
232-
# TODO: Fix Logic: None rather than 1
233-
evolution_item["total_loc_per_comment_lines"] = evolution_item["total_loc"] / \
234-
max(evolution_item["total_comments"], 1)
235-
evolution_item["total_loc_per_blank_lines"] = evolution_item["total_loc"] / max(evolution_item["total_blanks"], 1)
236-
evolution_item["total_loc_per_function"] = evolution_item["total_loc"] / max(evolution_item["total_num_funs"], 1)
237-
238-
evolution_items.append(evolution_item)
239-
240-
if len(evolution_items) >= self.elastic.max_items_bulk:
241-
num_items += len(evolution_items)
242-
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
243-
evolution_items = []
203+
for interval in interval_months:
244204

245-
to_month = to_month + relativedelta(months=+interval_months)
205+
to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
206+
to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)
246207

247-
if len(evolution_items) > 0:
248-
num_items += len(evolution_items)
249-
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
208+
while to_month < current_month:
209+
files_at_time = es_in.search(
210+
index=in_index,
211+
body=get_files_at_time(repository_url, to_month.isoformat())
212+
)['aggregations']['file_stats'].get("buckets", [])
250213

251-
if num_items != ins_items:
252-
missing = num_items - ins_items
253-
logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items))
254-
else:
255-
logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items))
214+
if not len(files_at_time):
215+
to_month = to_month + relativedelta(months=+interval)
216+
continue
217+
218+
repository_name = repository_url.split("/")[-1]
219+
evolution_item = {
220+
"id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
221+
"origin": repository_url,
222+
"interval_months": interval,
223+
"study_creation_date": to_month.isoformat(),
224+
"total_files": len(files_at_time)
225+
}
226+
227+
for file_ in files_at_time:
228+
file_details = file_["1"]["hits"]["hits"][0]["_source"]
229+
230+
for metric in self.metrics:
231+
total_metric = "total_" + metric
232+
evolution_item[total_metric] = evolution_item.get(total_metric, 0)
233+
evolution_item[total_metric] += file_details[metric] if file_details[metric] is not None else 0
234+
235+
# TODO: Fix Logic: None rather than 1
236+
evolution_item["total_comments_per_loc"] = evolution_item["total_comments"] / \
237+
max(evolution_item["total_loc"], 1)
238+
evolution_item["total_blanks_per_loc"] = evolution_item["total_blanks"] / max(evolution_item["total_loc"], 1)
239+
evolution_item["total_loc_per_function"] = evolution_item["total_loc"] / \
240+
max(evolution_item["total_num_funs"], 1)
241+
242+
evolution_items.append(evolution_item)
243+
244+
if len(evolution_items) >= self.elastic.max_items_bulk:
245+
num_items += len(evolution_items)
246+
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
247+
evolution_items = []
248+
249+
to_month = to_month + relativedelta(months=+interval)
250+
251+
if len(evolution_items) > 0:
252+
num_items += len(evolution_items)
253+
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
254+
255+
if num_items != ins_items:
256+
missing = num_items - ins_items
257+
logger.error("%s/%s missing items for Graal CoCom Analysis Study", str(missing), str(num_items))
258+
else:
259+
logger.info("%s items inserted for Graal CoCom Analysis Study", str(num_items))

grimoire_elk/enriched/colic.py

Lines changed: 56 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def has_identities(self):
6363
def get_field_unique_id(self):
6464
return "id"
6565

66-
def get_licensed_files(repository_url, to_date):
66+
def get_licensed_files(self, repository_url, to_date):
6767
""" Retrieve all the licensed files until the to_date, corresponding
6868
to the given repository.
6969
"""
@@ -108,7 +108,7 @@ def get_licensed_files(repository_url, to_date):
108108

109109
return query_licensed_files
110110

111-
def get_copyrighted_files(repository_url, to_date):
111+
def get_copyrighted_files(self, repository_url, to_date):
112112
""" Retrieve all the copyrighted files until the to_date, corresponding
113113
to the given repository.
114114
"""
@@ -260,7 +260,7 @@ def enrich_items(self, ocean_backend, events=False):
260260
return num_items
261261

262262
def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=False,
263-
out_index="colic_enrich_graal_repo", interval_months=3,
263+
out_index="colic_enrich_graal_repo", interval_months=[3],
264264
date_field="grimoire_creation_date"):
265265

266266
logger.info("Doing enrich_colic_analysis study for index {}"
@@ -269,66 +269,74 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
269269
es_in = ES([enrich_backend.elastic_url], retry_on_timeout=True, timeout=100,
270270
verify_certs=self.elastic.requests.verify, connection_class=RequestsHttpConnection)
271271
in_index = enrich_backend.elastic.index
272+
interval_months = list(map(int, interval_months))
272273

273274
unique_repos = es_in.search(
274275
index=in_index,
275276
body=get_unique_repository())
276277

277278
repositories = [repo['key'] for repo in unique_repos['aggregations']['unique_repos'].get('buckets', [])]
279+
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
278280
num_items = 0
279281
ins_items = 0
280282

281283
for repository_url in repositories:
282284
es_out = ElasticSearch(enrich_backend.elastic.url, out_index)
283285
evolution_items = []
284286

285-
to_month = get_to_date(es_in, in_index, out_index, repository_url)
286-
to_month = to_month.replace(day=1, hour=0, minute=0, second=0)
287-
current_month = datetime_utcnow().replace(day=1, hour=0, minute=0, second=0)
288-
289-
while to_month < current_month:
290-
copyrighted_files_at_time = es_in.search(
291-
index=in_index,
292-
body=self.get_copyrighted_files(repository_url, to_month.isoformat()))
293-
294-
licensed_files_at_time = es_in.search(
295-
index=in_index,
296-
body=self.get_licensed_files(repository_url, to_month.isoformat()))
297-
298-
files_at_time = es_in.search(
299-
index=in_index,
300-
body=get_files_at_time(repository_url, to_month.isoformat()))
301-
302-
licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
303-
copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
304-
total_files = int(files_at_time["aggregations"]["1"]["value"])
305-
306-
repository_name = repository_url.split("/")[-1]
307-
evolution_item = {
308-
"id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval_months),
309-
"origin": repository_url,
310-
"interval_months": interval_months,
311-
"study_creation_date": to_month.isoformat(),
312-
"licensed_files": licensed_files,
313-
"copyrighted_files": copyrighted_files,
314-
"total_files": total_files
315-
}
287+
for interval in interval_months:
288+
289+
to_month = get_to_date(es_in, in_index, out_index, repository_url, interval)
290+
to_month = to_month.replace(month=int(interval), day=1, hour=0, minute=0, second=0)
291+
292+
while to_month < current_month:
293+
copyrighted_files_at_time = es_in.search(
294+
index=in_index,
295+
body=self.get_copyrighted_files(repository_url, to_month.isoformat()))
296+
297+
licensed_files_at_time = es_in.search(
298+
index=in_index,
299+
body=self.get_licensed_files(repository_url, to_month.isoformat()))
300+
301+
files_at_time = es_in.search(
302+
index=in_index,
303+
body=get_files_at_time(repository_url, to_month.isoformat()))
304+
305+
licensed_files = int(licensed_files_at_time["aggregations"]["1"]["value"])
306+
copyrighted_files = int(copyrighted_files_at_time["aggregations"]["1"]["value"])
307+
# TODO: Fix - need more efficient query
308+
total_files = len(files_at_time['aggregations']['file_stats'].get("buckets", []))
309+
310+
if not total_files:
311+
to_month = to_month + relativedelta(months=+interval)
312+
continue
313+
314+
repository_name = repository_url.split("/")[-1]
315+
evolution_item = {
316+
"id": "{}_{}_{}".format(to_month.isoformat(), repository_name, interval),
317+
"origin": repository_url,
318+
"interval_months": interval,
319+
"study_creation_date": to_month.isoformat(),
320+
"licensed_files": licensed_files,
321+
"copyrighted_files": copyrighted_files,
322+
"total_files": total_files
323+
}
316324

317-
evolution_items.append(evolution_item)
325+
evolution_items.append(evolution_item)
318326

319-
if len(evolution_items) >= self.elastic.max_items_bulk:
320-
num_items += len(evolution_items)
321-
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
322-
evolution_items = []
327+
if len(evolution_items) >= self.elastic.max_items_bulk:
328+
num_items += len(evolution_items)
329+
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
330+
evolution_items = []
323331

324-
to_month = to_month + relativedelta(months=+interval_months)
332+
to_month = to_month + relativedelta(months=+interval)
325333

326-
if len(evolution_items) > 0:
327-
num_items += len(evolution_items)
328-
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
334+
if len(evolution_items) > 0:
335+
num_items += len(evolution_items)
336+
ins_items += es_out.bulk_upload(evolution_items, self.get_field_unique_id())
329337

330-
if num_items != ins_items:
331-
missing = num_items - ins_items
332-
logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items))
333-
else:
334-
logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items))
338+
if num_items != ins_items:
339+
missing = num_items - ins_items
340+
logger.error("%s/%s missing items for Graal CoLic Analysis Study", str(missing), str(num_items))
341+
else:
342+
logger.info("%s items inserted for Graal CoLic Analysis Study", str(num_items))

grimoire_elk/enriched/graal_study_evolution.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def get_unique_repository():
4343
return query_unique_repository
4444

4545

46-
def get_last_study_date(repository_url):
46+
def get_last_study_date(repository_url, interval):
4747
""" Retrieve the last study_creation_date of the item corresponding
4848
to given repository from the study index.
4949
"""
@@ -64,11 +64,15 @@ def get_last_study_date(repository_url):
6464
"term": {
6565
"origin.keyword": "%s"
6666
}
67+
},{
68+
"term":{
69+
"interval_months": "%s"
70+
}
6771
}]
6872
}
6973
}
7074
}
71-
""" % (repository_url)
75+
""" % (repository_url, interval)
7276

7377
return query_last_study_date
7478

@@ -117,8 +121,6 @@ def get_files_at_time(repository_url, to_date):
117121
corresponding to the given repository.
118122
"""
119123

120-
# TODO: Fix for interval month matching
121-
122124
query_files_at_time = """
123125
{
124126
"size": 0,
@@ -167,14 +169,14 @@ def get_files_at_time(repository_url, to_date):
167169
return query_files_at_time
168170

169171

170-
def get_to_date(es_in, in_index, out_index, repository_url):
172+
def get_to_date(es_in, in_index, out_index, repository_url, interval):
171173
""" Get the appropriate to_date value for incremental insertion. """
172174
study_data_available = False
173175

174176
if es_in.indices.exists(index=out_index):
175177
last_study_date = es_in.search(
176178
index=out_index,
177-
body=get_last_study_date(repository_url))["aggregations"]["1"]
179+
body=get_last_study_date(repository_url, interval))["aggregations"]["1"]
178180

179181
if last_study_date["value"] is not None:
180182
study_data_available = True

grimoire_elk/raw/graal.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def get_elastic_mappings(es_major):
5656

5757

5858
class GraalOcean(ElasticOcean):
59-
"""CoLic Ocean feeder"""
59+
"""Graal Ocean feeder"""
6060

6161
mapping = Mapping
6262

0 commit comments

Comments
 (0)