Skip to content

Commit bae82fb

Browse files
committed
preparing ecco_access kitchen-sink package
1 parent df80e06 commit bae82fb

File tree

4 files changed

+614
-132
lines changed

4 files changed

+614
-132
lines changed

ECCO-ACCESS/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Package initialiser: re-export the user-facing query/download/S3 helpers so
# they can be imported directly from the package.
from .ecco_access import ecco_podaac_access

from .ecco_download import ecco_podaac_query
from .ecco_download import ecco_podaac_download
from .ecco_download import ecco_podaac_download_diskaware
from .ecco_download import ecco_podaac_download_subset

from .ecco_s3_retrieve import ecco_podaac_s3_query
from .ecco_s3_retrieve import ecco_podaac_s3_open
from .ecco_s3_retrieve import ecco_podaac_s3_get
from .ecco_s3_retrieve import ecco_podaac_s3_get_diskaware

# The submodule names are kept for backward compatibility; the function names
# are added so that a star-import of the package also exposes the functions
# imported above, not only the submodules.
__all__ = [
    'ecco_access',
    'ecco_download',
    'ecco_s3_retrieve',
    'ecco_podaac_access',
    'ecco_podaac_query',
    'ecco_podaac_download',
    'ecco_podaac_download_diskaware',
    'ecco_podaac_download_subset',
    'ecco_podaac_s3_query',
    'ecco_podaac_s3_open',
    'ecco_podaac_s3_get',
    'ecco_podaac_s3_get_diskaware',
]

ECCO-ACCESS/ecco_access.py

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
### This function lets users query ECCO variables and datasets, and then access them either by direct download or by opening the files remotely on S3
2+
3+
4+
def ecco_podaac_access(query, version='v4r4', grid=None, time_res='all',
                       StartDate=None, EndDate=None,
                       mode='download_ifspace', download_root_dir=None, **kwargs):
    """
    Query ECCO datasets from PO.DAAC and access them in the requested mode.

    The core query and download functions are adapted from Jupyter notebooks
    created by Jack McNelis and Ian Fenty
    (https://github.com/ECCO-GROUP/ECCO-ACCESS/blob/master/PODAAC/Downloading_ECCO_datasets_from_PODAAC/README.md)
    and modified by Andrew Delman (https://ecco-v4-python-tutorial.readthedocs.io).

    Parameters
    ----------
    query: str, list, or dict, defines datasets or variables to access.
           If query is str, it specifies either a dataset ShortName (which is
           assumed if the string contains 'ECCO_'), or a text string that
           can be used to search the ShortNames, variable names, and descriptions.
           A query may also be a list of multiple ShortNames and/or text searches,
           or a dict that contains grid,time_res specifiers as keys and ShortNames
           or text searches as values, e.g.,
           {'native,monthly':['ECCO_L4_SSH_LLC0090GRID_MONTHLY_V4R4',
                              'THETA']}
           will query the native grid monthly SSH datasets, and all native grid
           monthly datasets with variables or descriptions matching 'THETA'.
           NOTE: text searches are not implemented in this function yet; only
           ShortNames are currently resolved (see Raises).

    version: ('v4r4'), specifies ECCO version to query

    grid: ('native','latlon',None), specifies whether to query datasets with output
          on the native grid or the interpolated lat/lon grid.
          The default None will query both types of grids, unless specified
          otherwise in a query dict (e.g., the example above).

    time_res: ('monthly','daily','snapshot','all'), specifies which time resolution
              to include in query and downloads. 'all' includes all time resolutions,
              and datasets that have no time dimension, such as the grid parameter
              and mixing coefficient datasets.

    StartDate,EndDate: str, in 'YYYY', 'YYYY-MM', or 'YYYY-MM-DD' format,
                       define date range [StartDate,EndDate] for download.
                       EndDate is included in the time range (unlike typical Python ranges).
                       ECCOv4r4 date range is '1992-01-01' to '2017-12-31'.
                       For 'SNAPSHOT' datasets, an additional day is added to EndDate to enable closed budgets
                       within the specified date range.

    mode: str, one of the following:
          'ls' or 'query': Query dataset ShortNames and variable names/
                           descriptions only; no downloads.
          's3_ls' or 's3_query': Query dataset ShortNames and variable names/
                                 descriptions only; return paths on S3.
          'download': Download datasets using NASA Earthdata URLs
          'download_ifspace': Check storage availability before downloading.
                              Download only if storage footprint of downloads
                              <= max_avail_frac*(available storage)
          'download_subset': Download spatial and temporal subsets of datasets
                             via Opendap; query help(ecco_podaac_download_subset)
                             to see keyword arguments that can be used in this mode.
          The following modes work within the AWS cloud only:
          's3_open': Access datasets on S3 without downloading.
          's3_get': Download from S3 (to AWS EC2 instance).
          's3_get_ifspace': Check storage availability before downloading;
                            download if storage footprint
                            <= max_avail_frac*(available storage).
                            Otherwise data are opened "remotely" from S3 bucket.
          's3_fsspec': Use `fsspec` json files (generated with `kerchunk`)
                       for expedited loading of datasets.
                       (Not implemented yet; see Raises.)

    download_root_dir: str, defines parent directory to download files to.
                       Files will be downloaded to directory download_root_dir/ShortName/.
                       If not specified, parent directory defaults to '~/Downloads/ECCO_V4r4_PODAAC/'.

    Additional keyword arguments*:
    *This is not an exhaustive list, especially for
    'download_subset' mode; use help(ecco_podaac_download_subset) to display
    options specific to that mode

    max_avail_frac: float, maximum fraction of remaining available disk space to
                    use in storing ECCO datasets.
                    If storing the datasets exceeds this fraction, an error is returned.
                    Valid range is [0,0.9]. If number provided is outside this range, it is replaced by the closer
                    endpoint of the range.

    n_workers: int, number of workers to use in concurrent downloads. Benefits typically taper off above 5-6.

    force_redownload: bool, if True, existing files will be redownloaded and replaced;
                            if False (default), existing files will not be replaced.

    return_granules: bool, if True (default), str or list of queried or
                           downloaded granules/files (including ones that
                           were already on disk and not replaced) is returned.
                           if False, the function returns nothing.

    Returns
    -------
    download_files: str, list, or dict, queried or downloaded file(s)
                    with either URLs (if in 'query' mode), or paths that can be
                    passed directly to xarray (open_dataset or open_mfdataset).
                    A str is returned if query finds only one granule/file.
                    A list is returned if query finds multiple granules in the
                    same dataset.
                    A dict (with ShortNames as keys) is returned if the query
                    finds granules in multiple datasets (an empty dict if the
                    query resolves to no datasets).
                    Only returned if return_granules=True (default).

    Raises
    ------
    ValueError: if mode is not one of the modes listed above, or if a key of a
                query dict is not of the form 'grid,time_res'.
    NotImplementedError: if mode is 's3_fsspec', or if a query item is a text
                         search rather than a dataset ShortName.
    """

    valid_modes = ('ls', 'query', 's3_ls', 's3_query', 'download',
                   'download_ifspace', 'download_subset',
                   's3_open', 's3_get', 's3_get_ifspace', 's3_fsspec')
    possible_mode_list = "['ls','query','s3_ls','s3_query','download',\n"\
                         + "'download_ifspace','download_subset',\n"\
                         + "'s3_open','s3_get','s3_get_ifspace','s3_fsspec']"

    # Validate the mode before any query/download work, so that a typo is
    # reported even when the query resolves to no datasets.
    if mode not in valid_modes:
        raise ValueError('Invalid mode specified; please specify one of the following:'
                         + '\n' + possible_mode_list)
    if mode == 's3_fsspec':
        raise NotImplementedError("mode 's3_fsspec' is not implemented yet")

    # NOTE(review): the ecco_podaac_* helpers called below are not imported in
    # the visible part of this module -- confirm they are brought into scope.

    ## query varlists as needed to obtain shortnames

    def shortnames_find(query_list, grid, time_res):
        """Resolve each query item to a dataset ShortName."""
        shortnames_list = []
        for query_item in query_list:
            if 'ECCO_' in query_item:
                shortnames_list.append(query_item)
            else:
                # Text searches of variable names/descriptions were left
                # unfinished; fail loudly rather than silently dropping them.
                raise NotImplementedError(
                    'Text-search queries are not implemented yet (got '
                    + repr(query_item) + ' with grid=' + repr(grid)
                    + ', time_res=' + repr(time_res)
                    + "); please pass a dataset ShortName containing 'ECCO_'")
        return shortnames_list

    if isinstance(query, str):
        query = [query]
    if isinstance(query, dict):
        shortnames = []
        for gridtime_spec, curr_query in query.items():
            if isinstance(curr_query, str):
                curr_query = [curr_query]
            # Dict keys carry per-entry 'grid,time_res' specifiers, e.g.
            # 'native,monthly'; an empty part falls back to the function-level
            # grid/time_res argument.
            spec_parts = [part.strip() for part in str(gridtime_spec).split(',')]
            if len(spec_parts) != 2:
                raise ValueError("Query dict keys must have the form 'grid,time_res'; got "
                                 + repr(gridtime_spec))
            curr_grid = spec_parts[0] or grid
            curr_time_res = spec_parts[1] or time_res
            shortnames += shortnames_find(curr_query,
                                          grid=curr_grid,
                                          time_res=curr_time_res)
    else:
        shortnames = shortnames_find(query, grid=grid, time_res=time_res)

    ## query NASA Earthdata CMR and download granules
    ## (see the `mode` entry of the docstring for what each mode does)

    # Start from the caller's keyword arguments and only fill in defaults for
    # the ones that were not supplied.
    kwargs_dict = dict(kwargs)
    # return_granules is consumed here and not forwarded to the helpers.
    return_granules = kwargs_dict.pop('return_granules', True)
    if mode == 'download_subset':
        kwargs_dict.setdefault('n_workers', 4)
    else:
        kwargs_dict.setdefault('n_workers', 6)
    kwargs_dict.setdefault('force_redownload', False)

    # download or otherwise access granules, depending on mode

    if mode == 'download_ifspace':
        kwargs_dict.setdefault('max_avail_frac', 0.5)
        # NOTE(review): download_root_dir is not forwarded in the *_ifspace
        # modes, unlike 'download'/'s3_get' -- confirm against the helper.
        granule_files = ecco_podaac_download_diskaware(
            shortnames, StartDate, EndDate, **kwargs_dict)
    elif mode == 's3_get_ifspace':
        kwargs_dict.setdefault('max_avail_frac', 0.5)
        granule_files = ecco_podaac_s3_get_diskaware(
            shortnames, StartDate, EndDate, **kwargs_dict)
    else:
        if mode in ('download', 'download_subset', 's3_get'):
            kwargs_dict['return_downloaded_files'] = True
        granule_files = {}
        for shortname in shortnames:
            if mode in ('ls', 'query'):
                granule_files[shortname] = ecco_podaac_query(
                    shortname, StartDate, EndDate)
            elif mode in ('s3_ls', 's3_query'):
                granule_files[shortname] = ecco_podaac_s3_query(
                    shortname, StartDate, EndDate)
            elif mode == 'download':
                granule_files[shortname] = ecco_podaac_download(
                    shortname, StartDate, EndDate,
                    download_root_dir=download_root_dir,
                    **kwargs_dict)
            elif mode == 'download_subset':
                granule_files[shortname] = ecco_podaac_download_subset(
                    shortname, StartDate, EndDate,
                    **kwargs_dict)
            elif mode == 's3_open':
                granule_files[shortname] = ecco_podaac_s3_open(
                    shortname, StartDate, EndDate)
            elif mode == 's3_get':
                granule_files[shortname] = ecco_podaac_s3_get(
                    shortname, StartDate, EndDate,
                    download_root_dir=download_root_dir,
                    **kwargs_dict)

    # return granule/file list

    if not return_granules:
        return None

    def single_or_all(files):
        # if only 1 file is found, return it directly instead of a 1-item list
        if isinstance(files, (list, tuple)) and len(files) == 1:
            return files[0]
        return files

    if isinstance(granule_files, dict):
        granule_files = {name: single_or_all(files)
                         for name, files in granule_files.items()}
        if len(granule_files) == 1:
            # a single dataset: return its str/list rather than a 1-key dict
            return next(iter(granule_files.values()))
        return granule_files
    return single_or_all(granule_files)

0 commit comments

Comments
 (0)