Download browse #2

Merged · 6 commits · Aug 1, 2024
12 changes: 8 additions & 4 deletions README.md
@@ -33,6 +33,9 @@ The package can be installed using pip.

```bash
pip install usgsxplore

# or with pipx
pipx install usgsxplore
```

# Usage
@@ -56,6 +59,7 @@ Options:

Commands:
download Download scenes with their entity ids provided in the textfile.
download-browse Download browse images of a vector data file locally.
search Search scenes in a dataset with filters.
```
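The new `download-browse` command can be pointed at a vector file produced by `search`. A minimal usage sketch, assuming the `usgsxplore` entry point and a `scenes.gpkg` file produced earlier:

```bash
usgsxplore download-browse scenes.gpkg --output-dir ./browse_images/
```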

@@ -143,7 +147,7 @@ Options:
--overwrite Overwrite existing files
--help Show this message and exit.
```
This command downloads scenes from their entity ids in the `TEXTFILE` and saves the results in `--output-dir`. It can display different types of progress bars depending on the `--pbar` value:
- **0** : no progress bar is displayed.
- **1** : one progress bar for all scene downloads.
- **2** : one progress bar for each scene download, with state information.
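A usage sketch, assuming the `usgsxplore` entry point and an `ids.txt` file of entity ids:

```bash
usgsxplore download ids.txt --output-dir ./scenes/ --pbar 2
```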
5 changes: 3 additions & 2 deletions tests/test_api.py
@@ -66,10 +66,11 @@ def test_get_entity_id(self):

def test_scene_search(self):
"Test the scene search method"
result = self.api.scene_search("landsat_tm_c2_l1", max_results=1, metadata_type=None)
scene_filter = filt.SceneFilter.from_args(date_interval=("1900-01-01", "2024-08-01"))
result = self.api.scene_search("landsat_tm_c2_l1", scene_filter, max_results=1, metadata_type=None)

assert result["recordsReturned"] == 1
assert result["totalHits"] == 2940421
assert result["totalHits"] == 2940410
assert result["startingNumber"] == 1
assert result["results"][0]["metadata"] == []

51 changes: 44 additions & 7 deletions tests/test_utils.py
@@ -11,7 +11,14 @@
import pytest

from usgsxplore.api import API
from usgsxplore.utils import read_textfile, sort_strings_by_similarity, to_gpkg
from usgsxplore.utils import (
download_browse_img,
read_textfile,
save_in_gfile,
sort_strings_by_similarity,
to_gdf,
update_gdf_browse,
)


@pytest.fixture(scope="module")
@@ -24,20 +31,29 @@ def scenes_metadata() -> list[dict]:
return scenes


def test_to_gpkg(scenes_metadata: list[dict]):
"Test the to_gpkg functions"
def test_to_gdf(scenes_metadata: list[dict]) -> None:
"Test the to_gdf function"
gdf = to_gdf(scenes_metadata)
assert gdf.shape[0] == 10
assert gdf.shape[1] == 35


def test_save_in_gfile(scenes_metadata: list[dict]):
"Test the save_in_gfile functions"
gdf = to_gdf(scenes_metadata)

with TemporaryDirectory() as tmpdir:
gpkg_file = os.path.join(tmpdir, "tmp.gpkg")
shapefile = os.path.join(tmpdir, "tmp.shp")
geojson = os.path.join(tmpdir, "tmp.geojson")
invalid_file = os.path.join(tmpdir, "tmp.invalid")

to_gpkg(scenes_metadata, gpkg_file)
save_in_gfile(gdf, gpkg_file)
with pytest.warns(UserWarning):
to_gpkg(scenes_metadata, shapefile)
to_gpkg(scenes_metadata, geojson)
save_in_gfile(gdf, shapefile)
save_in_gfile(gdf, geojson)
with pytest.raises(ValueError):
to_gpkg(scenes_metadata, invalid_file)
save_in_gfile(gdf, invalid_file)

assert os.path.exists(gpkg_file)
assert os.path.exists(shapefile)
@@ -70,4 +86,25 @@ def test_read_textfile() -> None:
assert len(list_id) == 2


def test_download_browse_img(scenes_metadata: list[dict]) -> None:
"Test the download_browse_img function"
gdf = to_gdf(scenes_metadata)
url_list = gdf["browse_url"].tolist()

with TemporaryDirectory() as tmpdir:
dl_recap = download_browse_img(url_list, tmpdir, False)
assert dl_recap.shape == (10, 2)
assert len(os.listdir(tmpdir)) == 10


def test_update_gdf_browse(scenes_metadata: list[dict]) -> None:
"Test the update_gdf_browse function"
gdf = to_gdf(scenes_metadata)

gdf = update_gdf_browse(gdf, "images")

# test that the browse_path column exists
gdf["browse_path"]


59 changes: 52 additions & 7 deletions usgsxplore/cli.py
@@ -7,13 +7,22 @@
Author: Luc Godin
"""
import json
import os

import click
import geopandas as gpd

from usgsxplore.api import API
from usgsxplore.errors import FilterFieldError, FilterValueError, USGSInvalidDataset
from usgsxplore.filter import SceneFilter
from usgsxplore.utils import read_textfile, sort_strings_by_similarity, to_gpkg
from usgsxplore.utils import (
download_browse_img,
read_textfile,
save_in_gfile,
sort_strings_by_similarity,
to_gdf,
update_gdf_browse,
)


# ----------------------------------------------------------------------------------------------------
@@ -68,6 +77,13 @@ def is_text_file(ctx: click.Context, param: click.Parameter, value: str) -> str:
return value


def is_vector_file(ctx: click.Context, param: click.Parameter, value: str) -> str:
"callback for verify the validity of the vector file"
if not value.endswith((".shp", ".gpkg", ".geojson")):
raise click.BadParameter(f"'{value}' must be a vector data file (.gpkg, .shp, .geojson)", ctx=ctx, param=param)
return value


# ----------------------------------------------------------------------------------------------------
# COMMAND LINE INTERFACE
# ----------------------------------------------------------------------------------------------------
@@ -176,7 +192,8 @@ def search(
scenes = []
for batch_scenes in api.batch_search(dataset, scene_filter, limit, "full", pbar):
scenes += batch_scenes
to_gpkg(scenes, output)
gdf = to_gdf(scenes)
save_in_gfile(gdf, output)

# if the dataset is invalid, print a list of similar datasets for the user
except USGSInvalidDataset:
@@ -229,13 +246,41 @@ def download(
api.logout()


cli.add_command(search)
cli.add_command(download)
@click.command("download-browse")
@click.argument("vector-file", type=click.Path(exists=True, file_okay=True), callback=is_vector_file)
@click.option(
"--output-dir",
"-o",
type=click.Path(dir_okay=True, resolve_path=True),
default="./browse_images/",
help="Output directory",
)
@click.option("--pbar", is_flag=True, default=True, help="Display a progress bar.")
def download_browse(vector_file: str, output_dir: str, pbar: bool) -> None:
"""
Download browse images of a vector data file locally.
"""
# create the directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# read the vector file
gdf = gpd.read_file(vector_file)

@click.command()
def cli_gpkg():
click.echo("hello")
# get the list of browse_url
url_list = gdf["browse_url"].tolist()

# download the list of urls with download_browse_img
download_browse_img(url_list, output_dir, pbar)

# update the vector file with browse_path added
gdf = update_gdf_browse(gdf, output_dir)
save_in_gfile(gdf, vector_file)


cli.add_command(search)
cli.add_command(download)
cli.add_command(download_browse)


if __name__ == "__main__":
107 changes: 91 additions & 16 deletions usgsxplore/utils.py
@@ -9,27 +9,31 @@
from difflib import SequenceMatcher

import geopandas as gpd
from shapely import MultiPolygon, Polygon
import pandas as pd
import requests
from shapely import MultiPolygon, Point, Polygon
from tqdm import tqdm


def to_gpkg(scenes_metadata: list[dict], geo_file: str = "scenes.gpkg") -> None:
def to_gdf(scenes_metadata: list[dict]) -> gpd.GeoDataFrame:
"""
This function converts the scenes metadata into a geodataframe, using the spatialCoverage for the geometry

:param scenes_metadata: result of the search
:return: GeoDataFrame built from the scenes metadata
"""
geometries = []
attributes = {}

img_dir = os.path.join(os.path.dirname(geo_file), "browse-images")

# loop in every line of the scenes file
for scene in scenes_metadata:
geom_type = scene["spatialCoverage"]["type"]
if geom_type == "Polygon":
geometries.append(Polygon(scene["spatialCoverage"]["coordinates"][0]))
elif geom_type == "MultiPolygon":
geometries.append(MultiPolygon(scene["spatialCoverage"]["coordinates"]))
elif geom_type == "Point":
geometries.append(Point(scene["spatialCoverage"]["coordinates"]))
else:
continue

@@ -38,29 +42,33 @@ def to_gpkg(scenes_metadata: list[dict], geo_file: str = "scenes.gpkg") -> None:
attributes.setdefault(field.get("fieldName"), []).append(field.get("value"))

if len(scene["browse"]) > 0:
attributes.setdefault("browse_path", []).append(
os.path.join(os.path.abspath(img_dir), os.path.basename(scene["browse"][0]["browsePath"]))
)
attributes.setdefault("browse_url", []).append(scene["browse"][0]["browsePath"])
else:
attributes.setdefault("browse_path", []).append(None)
attributes.setdefault("browse_url", []).append(None)

# create geodataframe with attributes and geometries
gdf = gpd.GeoDataFrame(data=attributes, geometry=geometries, crs="EPSG:4326")
return gpd.GeoDataFrame(data=attributes, geometry=geometries, crs="EPSG:4326")


def save_in_gfile(gdf: gpd.GeoDataFrame, vector_file: str = "scenes.gpkg") -> None:
"""
This function saves the geodataframe into the given vector_file

:param gdf: geodataframe that will be saved
:param vector_file: output vector file
"""
# save the geodataframe in a geospatial file
if geo_file.endswith(".shp"):
if vector_file.endswith(".shp"):
# here we ignore warnings that tell us field names were truncated
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message=r"Normalized/laundered field name: '.+' to '.+'")
gdf.to_file(geo_file)
elif geo_file.endswith(".gpkg"):
gdf.to_file(geo_file, driver="GPKG")
elif geo_file.endswith(".geojson"):
gdf.to_file(geo_file, driver="GeoJSON")
gdf.to_file(vector_file)
elif vector_file.endswith(".gpkg"):
gdf.to_file(vector_file, driver="GPKG")
elif vector_file.endswith(".geojson"):
gdf.to_file(vector_file, driver="GeoJSON")
else:
raise ValueError(f"The file '{geo_file}' need to end with : .shp|.gpkg|.geojson")
raise ValueError(f"The file '{vector_file}' need to end with : .shp|.gpkg|.geojson")


def read_textfile(textfile: str) -> list[str]:
@@ -97,4 +105,71 @@ def sort_strings_by_similarity(ref_str: str, list_str: list[str]) -> list[str]:
return sorted_list_str


def download_browse_img(url_list: list[str], output_dir: str, pbar: bool = True) -> pd.DataFrame:
"""
Download all browse images from the url_list and put them into the output_dir.
Return a recap of the download.

:param url_list: list of all browse image urls
:param output_dir: output directory
:param pbar: if True, display a progress bar during the download
:return: dataframe recapping the download
"""
df = pd.DataFrame({"url": url_list})
df.set_index("url", inplace=True)
df = df.assign(already_download=False, status=None)

# Create a set of already downloaded files for faster lookup
already_dl_files = {file.split(".", maxsplit=1)[0] for file in os.listdir(output_dir) if file.endswith(".jpg")}

# Mark already downloaded files in the DataFrame
for url in url_list:
filename = os.path.basename(url).split(".", maxsplit=1)[0]
if filename in already_dl_files:
df.loc[url, "already_download"] = True

# create a progress_bar if pbar
if pbar:
progress_bar = tqdm(desc="Downloading images", total=len(url_list), initial=df["already_download"].sum())

# loop over urls not already downloaded; download each one and
# save its status_code in the dataframe
session = requests.Session()
# flake8: noqa E712
for url, row in df[df["already_download"] == False].iterrows():
response = session.get(url)
if response.status_code == 200:
# get the name of the image
filename = os.path.basename(url)

with open(os.path.join(output_dir, filename), "wb") as f:
f.write(response.content)
df.loc[url, "status"] = response.status_code

if pbar:
progress_bar.update()
# close the progress bar at the end of the download
if pbar:
progress_bar.close()

# return the recap
return df


def update_gdf_browse(gdf: gpd.GeoDataFrame, output_dir: str) -> gpd.GeoDataFrame:
"""
Update the given gdf by adding a new "browse_path" column pointing to the downloaded browse images.

:param gdf: the geodataframe that will be modified
:param output_dir: browse images output directory
:return: the updated gdf
"""
gdf = gdf.assign(browse_path=gdf["browse_url"])
gdf["browse_path"] = gdf["browse_path"].apply(os.path.basename)
gdf["browse_path"] = gdf["browse_path"].apply(lambda x: os.path.join(output_dir, x))

return gdf
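Taken together, the new helpers form a small pipeline. A minimal sketch of how they chain, mirroring the `download_browse` command (file names are hypothetical):

```python
import os

import geopandas as gpd

from usgsxplore.utils import download_browse_img, save_in_gfile, update_gdf_browse

vector_file = "scenes.gpkg"  # hypothetical output of the `search` command
output_dir = "browse_images"
os.makedirs(output_dir, exist_ok=True)  # download_browse_img lists this directory

gdf = gpd.read_file(vector_file)
download_browse_img(gdf["browse_url"].tolist(), output_dir, pbar=True)
gdf = update_gdf_browse(gdf, output_dir)  # adds the browse_path column
save_in_gfile(gdf, vector_file)  # overwrite the file with the new column
```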

