Accessing remote files with earthaccess
When we search for data using earthaccess, we get back a list of results from NASA's Common Metadata Repository, or CMR for short. These results contain all the information
we need to access the files represented by the metadata. earthaccess
offers two access methods that operate on these results. The first method is the well-known download()
, where we copy the files from their remote location to our local disk. If we are running the code in AWS, say on a JupyterHub, the files will be copied to the local VM disk.
The other method is open(): earthaccess uses fsspec to open remote files as if they were local. open() has both advantages and disadvantages that we should understand before using it.
The main advantage of open() is that we don't have to download the file; we can stream it into memory. However, depending on how we do it, we may run into network performance issues. Again, if we run the code next to the data it will be fast; if we run it locally on our laptops it will be slow.
import earthaccess
auth = earthaccess.login()
results = earthaccess.search_data(
short_name="ATL06",
cloud_hosted=False,
temporal=("2019-01", "2019-02"),
polygon=[(-100, 40), (-110, 40), (-105, 38), (-100, 40)],
)
results[0]
nsidc_url = "https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL06.005/2019.02.21/ATL06_20190221121851_08410203_005_01.h5"
lpcloud_url = "https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/EMITL2ARFL.001/EMIT_L2A_RFL_001_20220903T163129_2224611_012/EMIT_L2A_RFL_001_20220903T163129_2224611_012.nc"
session = earthaccess.get_requests_https_session()
headers = {"Range": "bytes=0-100"}
r = session.get(lpcloud_url, headers=headers)
r
<Response [206]>
fs = earthaccess.get_fsspec_https_session()
with fs.open(lpcloud_url) as f:
data = f.read(100)
data
b'\x89HDF\r\n\x1a\n\x00\x00\x00\x00\x00\x08\x08\x00\x04\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xd7HUn\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00`\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00OHDR'
%%time
import xarray as xr
files = earthaccess.open(results[0:2])
ds = xr.open_dataset(files[0], group="/gt1r/land_ice_segments")
ds
An exception occurred while trying to access remote files via HTTPS Traceback (most recent call last): File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/implementations/http.py", line 437, in _info await _file_info( File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/implementations/http.py", line 853, in _file_info r.raise_for_status() File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/aiohttp/client_reqrep.py", line 629, in raise_for_status raise ClientResponseError( aiohttp.client_exceptions.ClientResponseError: 404, message='Not Found', url='https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL06.007/2019.01.03/ATL06_20190103064855_00890206_007_01.h5' The above exception was the direct cause of the following exception: Traceback (most recent call last): File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/store.py", line 903, in _open_urls_https return _open_files( ^^^^^^^^^^^^ File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/store.py", line 136, in _open_files return pqdm( ^^^^^ File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/pqdm/threads.py", line 22, in pqdm return _parallel_process( ^^^^^^^^^^^^^^^^^^ File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/pqdm/_base.py", line 79, in _parallel_process raise e File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/pqdm/_base.py", line 76, in _parallel_process results.append(future.result()) ^^^^^^^^^^^^^^^ File "/home/docs/.asdf/installs/python/3.11.12/lib/python3.11/concurrent/futures/_base.py", line 449, in result return self.__get_result() 
^^^^^^^^^^^^^^^^^^^ File "/home/docs/.asdf/installs/python/3.11.12/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result raise self._exception File "/home/docs/.asdf/installs/python/3.11.12/lib/python3.11/concurrent/futures/thread.py", line 58, in run result = self.fn(*self.args, **self.kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/store.py", line 123, in multi_thread_open f_size = fs.info(url)["size"] ^^^^^^^^^^^^ File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/asyn.py", line 118, in wrapper return sync(self.loop, func, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/asyn.py", line 103, in sync raise return_result File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/asyn.py", line 56, in _runner result[0] = await coro ^^^^^^^^^^ File "/home/docs/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/implementations/http.py", line 450, in _info raise FileNotFoundError(url) from exc FileNotFoundError: https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL06.007/2019.01.03/ATL06_20190103064855_00890206_007_01.h5
CPU times: user 324 ms, sys: 74.7 ms, total: 398 ms Wall time: 485 ms
--------------------------------------------------------------------------- ClientResponseError Traceback (most recent call last) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/implementations/http.py:437, in HTTPFileSystem._info(self, url, **kwargs) 435 try: 436 info.update( --> 437 await _file_info( 438 self.encode_url(url), 439 size_policy=policy, 440 session=session, 441 **self.kwargs, 442 **kwargs, 443 ) 444 ) 445 if info.get("size") is not None: File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/implementations/http.py:853, in _file_info(url, session, size_policy, **kwargs) 852 async with r: --> 853 r.raise_for_status() 855 if "Content-Length" in r.headers: 856 # Some servers may choose to ignore Accept-Encoding and return 857 # compressed content, in which case the returned size is unreliable. File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/aiohttp/client_reqrep.py:629, in ClientResponse.raise_for_status(self) 627 self.release() --> 629 raise ClientResponseError( 630 self.request_info, 631 self.history, 632 status=self.status, 633 message=self.reason, 634 headers=self.headers, 635 ) ClientResponseError: 404, message='Not Found', url='https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL06.007/2019.01.03/ATL06_20190103064855_00890206_007_01.h5' The above exception was the direct cause of the following exception: FileNotFoundError Traceback (most recent call last) Cell In[7], line 1 ----> 1 get_ipython().run_cell_magic('time', '', '\nimport xarray as xr\n\nfiles = earthaccess.open(results[0:2])\n\nds = xr.open_dataset(files[0], group="/gt1r/land_ice_segments")\nds\n') File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/IPython/core/interactiveshell.py:2565, in InteractiveShell.run_cell_magic(self, magic_name, line, cell) 2563 with self.builtin_trap: 2564 args = 
(magic_arg_s, cell) -> 2565 result = fn(*args, **kwargs) 2567 # The code below prevents the output from being displayed 2568 # when using magics with decorator @output_can_be_silenced 2569 # when the last Python token in the expression is a ';'. 2570 if getattr(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, False): File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/IPython/core/magics/execution.py:1470, in ExecutionMagics.time(self, line, cell, local_ns) 1468 if interrupt_occured: 1469 if exit_on_interrupt and captured_exception: -> 1470 raise captured_exception 1471 return 1472 return out File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/IPython/core/magics/execution.py:1434, in ExecutionMagics.time(self, line, cell, local_ns) 1432 st = clock2() 1433 try: -> 1434 exec(code, glob, local_ns) 1435 out = None 1436 # multi-line %%time case File <timed exec>:3 File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/api.py:416, in open(granules, provider, credentials_endpoint, show_progress, pqdm_kwargs, open_kwargs) 389 def open( 390 granules: Union[List[str], List[DataGranule]], 391 provider: Optional[str] = None, (...) 396 open_kwargs: Optional[Dict[str, Any]] = None, 397 ) -> List[AbstractFileSystem]: 398 """Returns a list of file-like objects that can be used to access files 399 hosted on S3 or HTTPS by third party libraries like xarray. 400 (...) 414 A list of "file pointers" to remote (i.e. s3 or https) files. 
415 """ --> 416 return earthaccess.__store__.open( 417 granules=granules, 418 provider=_normalize_location(provider), 419 credentials_endpoint=credentials_endpoint, 420 show_progress=show_progress, 421 pqdm_kwargs=pqdm_kwargs, 422 open_kwargs=open_kwargs, 423 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/store.py:439, in Store.open(self, granules, provider, show_progress, credentials_endpoint, pqdm_kwargs, open_kwargs) 432 pqdm_kwargs = { 433 "exception_behaviour": "immediate", 434 "n_jobs": 8, 435 "disable": not show_progress, 436 **(pqdm_kwargs or {}), 437 } 438 if len(granules): --> 439 return self._open( 440 granules, 441 provider, 442 credentials_endpoint=credentials_endpoint, 443 pqdm_kwargs=pqdm_kwargs, 444 open_kwargs=open_kwargs, 445 ) 446 return [] File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/multimethod/__init__.py:350, in multimethod.__call__(self, *args, **kwargs) 348 func = self.dispatch(*args) 349 try: --> 350 return func(*args, **kwargs) 351 except TypeError as ex: 352 raise DispatchError(f"Function {func.__code__}") from ex File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/store.py:517, in Store._open_granules(self, granules, provider, credentials_endpoint, pqdm_kwargs, open_kwargs) 515 else: 516 url_mapping = _get_url_granule_mapping(granules, access="on_prem") --> 517 fileset = self._open_urls_https( 518 url_mapping, pqdm_kwargs=pqdm_kwargs, open_kwargs=open_kwargs 519 ) 521 return fileset File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/store.py:903, in Store._open_urls_https(self, url_mapping, pqdm_kwargs, open_kwargs) 900 https_fs = self.get_fsspec_session() 902 try: --> 903 return _open_files( 904 url_mapping, https_fs, pqdm_kwargs=pqdm_kwargs, open_kwargs=open_kwargs 905 ) 906 except Exception: 907 
logger.exception( 908 "An exception occurred while trying to access remote files via HTTPS" 909 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/store.py:136, in _open_files(url_mapping, fs, pqdm_kwargs, open_kwargs) 133 return EarthAccessFile(f, granule) # type: ignore 135 # this {#n_jobs} is for the unittests as this method is not public and pqdm will have values at this point --> 136 return pqdm( 137 url_mapping.items(), multi_thread_open, **(pqdm_kwargs or {"n_jobs": 8}) 138 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/pqdm/threads.py:22, in pqdm(array, function, n_jobs, argument_type, bounded, exception_behaviour, tqdm_class, **kwargs) 12 def pqdm( 13 array: Iterable[Any], 14 function: Callable[[Any], Any], (...) 20 **kwargs 21 ): ---> 22 return _parallel_process( 23 iterable=array, 24 function=function, 25 argument_type=argument_type, 26 n_jobs=n_jobs, 27 executor=BoundedThreadPoolExecutor if bounded else ThreadPoolExecutor, 28 exception_behaviour=exception_behaviour, 29 tqdm_class=tqdm_class, 30 **kwargs 31 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/pqdm/_base.py:79, in _parallel_process(iterable, function, n_jobs, executor, argument_type, exception_behaviour, tqdm_class, **kwargs) 77 except Exception as e: 78 if exception_behaviour == ExceptionBehaviour.IMMEDIATE: ---> 79 raise e 80 if exception_behaviour == ExceptionBehaviour.IGNORE: 81 results.append(e) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/pqdm/_base.py:76, in _parallel_process(iterable, function, n_jobs, executor, argument_type, exception_behaviour, tqdm_class, **kwargs) 74 for i, future in tqdm_class(enumerate(futures), **collecting_opts): 75 try: ---> 76 results.append(future.result()) 77 except Exception as e: 78 if exception_behaviour == ExceptionBehaviour.IMMEDIATE: File 
~/.asdf/installs/python/3.11.12/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout) 447 raise CancelledError() 448 elif self._state == FINISHED: --> 449 return self.__get_result() 451 self._condition.wait(timeout) 453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]: File ~/.asdf/installs/python/3.11.12/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self) 399 if self._exception: 400 try: --> 401 raise self._exception 402 finally: 403 # Break a reference cycle with the exception in self._exception 404 self = None File ~/.asdf/installs/python/3.11.12/lib/python3.11/concurrent/futures/thread.py:58, in _WorkItem.run(self) 55 return 57 try: ---> 58 result = self.fn(*self.args, **self.kwargs) 59 except BaseException as exc: 60 self.future.set_exception(exc) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/earthaccess/store.py:123, in _open_files.<locals>.multi_thread_open(data) 121 def multi_thread_open(data: tuple[str, Optional[DataGranule]]) -> EarthAccessFile: 122 url, granule = data --> 123 f_size = fs.info(url)["size"] 124 default_cache_type = "background" # block cache with background fetching 125 default_block_size = _optimal_fsspec_block_size(f_size) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/asyn.py:118, in sync_wrapper.<locals>.wrapper(*args, **kwargs) 115 @functools.wraps(func) 116 def wrapper(*args, **kwargs): 117 self = obj or args[0] --> 118 return sync(self.loop, func, *args, **kwargs) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/asyn.py:103, in sync(loop, func, timeout, *args, **kwargs) 101 raise FSTimeoutError from return_result 102 elif isinstance(return_result, BaseException): --> 103 raise return_result 104 else: 105 return return_result File 
~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/asyn.py:56, in _runner(event, coro, result, timeout) 54 coro = asyncio.wait_for(coro, timeout=timeout) 55 try: ---> 56 result[0] = await coro 57 except Exception as ex: 58 result[0] = ex File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1035/lib/python3.11/site-packages/fsspec/implementations/http.py:450, in HTTPFileSystem._info(self, url, **kwargs) 447 except Exception as exc: 448 if policy == "get": 449 # If get failed, then raise a FileNotFoundError --> 450 raise FileNotFoundError(url) from exc 451 logger.debug("", exc_info=exc) 453 return {"name": url, "size": None, **info, "type": "file"} FileNotFoundError: https://n5eil01u.ecs.nsidc.org/DP5/ATLAS/ATL06.007/2019.01.03/ATL06_20190103064855_00890206_007_01.h5