26
26
from .enrich import (Enrich ,
27
27
metadata )
28
28
from .graal_study_evolution import (get_to_date ,
29
- get_unique_repository ,
30
- get_files_at_time )
29
+ get_unique_repository )
31
30
from .utils import fix_field_date
32
31
from ..elastic_mapping import Mapping as BaseMapping
33
32
@@ -107,7 +106,43 @@ def has_identities(self):
107
106
def get_field_unique_id (self ):
108
107
return "id"
109
108
110
- def get_licensed_files (self , repository_url , to_date ):
109
def __get_total_files(self, repository_url, to_date):
    """Build the query counting the distinct files of a repository.

    The returned query asks for no hits ("size": 0) and a single
    cardinality aggregation, named "1", over the `file_path` field.
    Only documents whose `origin` equals `repository_url` and whose
    `metadata__updated_on` is lower than or equal to `to_date` are
    taken into account.

    :param repository_url: value matched against the `origin` field
    :param to_date: inclusive upper bound for `metadata__updated_on`

    :returns: the query body, as a JSON string
    """
    # Imported locally so the method does not depend on a module-level import.
    import json

    # The values are serialized with json.dumps (which also adds the
    # surrounding double quotes) instead of being pasted between quotes in
    # the template: a quote or backslash in the URL would otherwise produce
    # an invalid or altered query.
    origin = json.dumps(str(repository_url))
    until = json.dumps(str(to_date))

    query_total_files = """
    {
        "size": 0,
        "aggs": {
            "1": {
                "cardinality": {
                    "field": "file_path"
                }
            }
        },
        "query": {
            "bool": {
                "filter": [{
                    "term": {
                        "origin": %s
                    }
                },
                {
                    "range": {
                        "metadata__updated_on": {
                            "lte": %s
                        }
                    }
                }]
            }
        }
    }
    """ % (origin, until)

    return query_total_files
144
+
145
+ def __get_licensed_files (self , repository_url , to_date ):
111
146
""" Retrieve all the licensed files until the to_date, corresponding
112
147
to the given repository.
113
148
"""
@@ -124,18 +159,14 @@ def get_licensed_files(self, repository_url, to_date):
124
159
},
125
160
"query": {
126
161
"bool": {
127
- "must": [{
128
- "match_phrase": {
129
- "has_license": {
130
- "query": 1
131
- }
162
+ "filter": [{
163
+ "term": {
164
+ "has_license": 1
132
165
}
133
166
},
134
167
{
135
- "match_phrase": {
136
- "origin": {
137
- "query": "%s"
138
- }
168
+ "term": {
169
+ "origin": "%s"
139
170
}
140
171
},
141
172
{
@@ -152,7 +183,7 @@ def get_licensed_files(self, repository_url, to_date):
152
183
153
184
return query_licensed_files
154
185
155
- def get_copyrighted_files (self , repository_url , to_date ):
186
+ def __get_copyrighted_files (self , repository_url , to_date ):
156
187
""" Retrieve all the copyrighted files until the to_date, corresponding
157
188
to the given repository.
158
189
"""
@@ -169,18 +200,14 @@ def get_copyrighted_files(self, repository_url, to_date):
169
200
},
170
201
"query": {
171
202
"bool": {
172
- "must": [{
173
- "match_phrase": {
174
- "has_copyright": {
175
- "query": 1
176
- }
203
+ "filter": [{
204
+ "term": {
205
+ "has_copyright": 1
177
206
}
178
207
},
179
208
{
180
- "match_phrase": {
181
- "origin": {
182
- "query": "%s"
183
- }
209
+ "term": {
210
+ "origin": "%s"
184
211
}
185
212
},
186
213
{
@@ -338,7 +365,7 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
338
365
out_index = "colic_enrich_graal_repo" , interval_months = [3 ],
339
366
date_field = "grimoire_creation_date" ):
340
367
341
- logger .info ("[colic] Starting enrich_colic_analysis study" )
368
+ logger .info ("[enrich- colic-analysis] Start enrich_colic_analysis study" )
342
369
343
370
es_in = ES ([enrich_backend .elastic_url ], retry_on_timeout = True , timeout = 100 ,
344
371
verify_certs = self .elastic .requests .verify , connection_class = RequestsHttpConnection )
@@ -350,12 +377,17 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
350
377
body = get_unique_repository ())
351
378
352
379
repositories = [repo ['key' ] for repo in unique_repos ['aggregations' ]['unique_repos' ].get ('buckets' , [])]
380
+
381
+ logger .info ("[enrich-colic-analysis] {} repositories to process" .format (len (repositories )))
382
+ es_out = ElasticSearch (enrich_backend .elastic .url , out_index , mappings = Mapping )
383
+ es_out .add_alias ("colic_study" )
384
+
353
385
current_month = datetime_utcnow ().replace (day = 1 , hour = 0 , minute = 0 , second = 0 )
354
386
num_items = 0
355
387
ins_items = 0
356
388
357
389
for repository_url in repositories :
358
- es_out = ElasticSearch ( enrich_backend . elastic . url , out_index , mappings = Mapping )
390
+ logger . info ( "[enrich-colic-analysis] Start analysis for {}" . format ( repository_url ) )
359
391
evolution_items = []
360
392
361
393
for interval in interval_months :
@@ -366,20 +398,19 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
366
398
while to_month < current_month :
367
399
copyrighted_files_at_time = es_in .search (
368
400
index = in_index ,
369
- body = self .get_copyrighted_files (repository_url , to_month .isoformat ()))
401
+ body = self .__get_copyrighted_files (repository_url , to_month .isoformat ()))
370
402
371
403
licensed_files_at_time = es_in .search (
372
404
index = in_index ,
373
- body = self .get_licensed_files (repository_url , to_month .isoformat ()))
405
+ body = self .__get_licensed_files (repository_url , to_month .isoformat ()))
374
406
375
407
files_at_time = es_in .search (
376
408
index = in_index ,
377
- body = get_files_at_time (repository_url , to_month .isoformat ()))
409
+ body = self . __get_total_files (repository_url , to_month .isoformat ()))
378
410
379
411
licensed_files = int (licensed_files_at_time ["aggregations" ]["1" ]["value" ])
380
412
copyrighted_files = int (copyrighted_files_at_time ["aggregations" ]["1" ]["value" ])
381
- # TODO: Fix - need more efficient query
382
- total_files = len (files_at_time ['aggregations' ]['file_stats' ].get ("buckets" , []))
413
+ total_files = int (files_at_time ["aggregations" ]["1" ]["value" ])
383
414
384
415
if not total_files :
385
416
to_month = to_month + relativedelta (months = + interval )
@@ -411,8 +442,12 @@ def enrich_colic_analysis(self, ocean_backend, enrich_backend, no_incremental=Fa
411
442
412
443
if num_items != ins_items :
413
444
missing = num_items - ins_items
414
- logger .error ("%s/%s missing items for Graal CoLic Analysis Study" , str (missing ), str (num_items ))
445
+ logger .error (
446
+ "[enrich-colic-analysis] %s/%s missing items for Graal CoLic Analysis Study" , str (missing ), str (num_items )
447
+ )
415
448
else :
416
- logger .info ("%s items inserted for Graal CoLic Analysis Study" , str (num_items ))
449
+ logger .info ("[enrich-colic-analysis] %s items inserted for Graal CoLic Analysis Study" , str (num_items ))
450
+
451
+ logger .info ("[enrich-colic-analysis] End analysis for {} with month interval" .format (repository_url , interval ))
417
452
418
- logger .info ("[colic] Ending enrich_colic_analysis study" )
453
+ logger .info ("[enrich- colic-analysis] End enrich_colic_analysis study" )
0 commit comments