Download browse #2

Merged · 6 commits · Aug 1, 2024
12 changes: 8 additions & 4 deletions README.md
@@ -33,6 +33,9 @@ The package can be installed using pip.

```bash
pip install usgsxplore

# or with pipx
pipx install usgsxplore
```

# Usage
@@ -56,6 +59,7 @@ Options:

Commands:
download Download scenes with their entity ids provided in the textfile.
download-browse Download browse images of a vector data file locally.
search Search scenes in a dataset with filters.
```
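The new `download-browse` command can be pointed at a vector file produced by `search`. A minimal usage sketch, assuming the `usgsxplore` entry point and a `scenes.gpkg` file produced earlier:

```bash
usgsxplore download-browse scenes.gpkg --output-dir ./browse_images/
```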

@@ -143,7 +147,7 @@ Options:
--overwrite Overwrite existing files
--help Show this message and exit.
```
This command downloads scenes from their entity ids in the `TEXTFILE` and saves the results in `--output-dir`. It can display different types of progress bars depending on the `--pbar` value:
- **0** : no progress bar is displayed.
- **1** : one progress bar for all scene downloads.
- **2** : one progress bar for each scene download, with state information.
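A usage sketch, assuming the `usgsxplore` entry point and an `ids.txt` file of entity ids:

```bash
usgsxplore download ids.txt --output-dir ./scenes/ --pbar 2
```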
5 changes: 3 additions & 2 deletions tests/test_api.py
@@ -66,10 +66,11 @@ def test_get_entity_id(self):

def test_scene_search(self):
"Test the scene search method"
result = self.api.scene_search("landsat_tm_c2_l1", max_results=1, metadata_type=None)
scene_filter = filt.SceneFilter.from_args(date_interval=("1900-01-01", "2024-08-01"))
result = self.api.scene_search("landsat_tm_c2_l1", scene_filter, max_results=1, metadata_type=None)

assert result["recordsReturned"] == 1
assert result["totalHits"] == 2940421
assert result["totalHits"] == 2940410
assert result["startingNumber"] == 1
assert result["results"][0]["metadata"] == []

51 changes: 44 additions & 7 deletions tests/test_utils.py
@@ -11,7 +11,14 @@
import pytest

from usgsxplore.api import API
from usgsxplore.utils import read_textfile, sort_strings_by_similarity, to_gpkg
from usgsxplore.utils import (
download_browse_img,
read_textfile,
save_in_gfile,
sort_strings_by_similarity,
to_gdf,
update_gdf_browse,
)


@pytest.fixture(scope="module")
@@ -24,20 +31,29 @@ def scenes_metadata() -> list[dict]:
return scenes


def test_to_gpkg(scenes_metadata: list[dict]):
"Test the to_gpkg functions"
def test_to_gdf(scenes_metadata: list[dict]) -> None:
"Test the to_gdf function"
gdf = to_gdf(scenes_metadata)
assert gdf.shape[0] == 10
assert gdf.shape[1] == 35


def test_save_in_gfile(scenes_metadata: list[dict]):
"Test the save_in_gfile functions"
gdf = to_gdf(scenes_metadata)

with TemporaryDirectory() as tmpdir:
gpkg_file = os.path.join(tmpdir, "tmp.gpkg")
shapefile = os.path.join(tmpdir, "tmp.shp")
geojson = os.path.join(tmpdir, "tmp.geojson")
invalid_file = os.path.join(tmpdir, "tmp.invalid")

to_gpkg(scenes_metadata, gpkg_file)
save_in_gfile(gdf, gpkg_file)
with pytest.warns(UserWarning):
to_gpkg(scenes_metadata, shapefile)
to_gpkg(scenes_metadata, geojson)
save_in_gfile(gdf, shapefile)
save_in_gfile(gdf, geojson)
with pytest.raises(ValueError):
to_gpkg(scenes_metadata, invalid_file)
save_in_gfile(gdf, invalid_file)

assert os.path.exists(gpkg_file)
assert os.path.exists(shapefile)
@@ -70,4 +86,25 @@ def test_read_textfile() -> None:
assert len(list_id) == 2


def test_download_browse_img(scenes_metadata: list[dict]) -> None:
"Test the download_browse_img function"
gdf = to_gdf(scenes_metadata)
url_list = gdf["browse_url"].tolist()

with TemporaryDirectory() as tmpdir:
dl_recap = download_browse_img(url_list, tmpdir, False)
assert dl_recap.shape == (10, 2)
assert len(os.listdir(tmpdir)) == 10


def test_update_gdf_browse(scenes_metadata: list[dict]) -> None:
"Test the update_gdf_browse function"
gdf = to_gdf(scenes_metadata)

gdf = update_gdf_browse(gdf, "images")

# test that the browse_path column exists
gdf["browse_path"]


59 changes: 52 additions & 7 deletions usgsxplore/cli.py
@@ -7,13 +7,22 @@
Author: Luc Godin
"""
import json
import os

import click
import geopandas as gpd

from usgsxplore.api import API
from usgsxplore.errors import FilterFieldError, FilterValueError, USGSInvalidDataset
from usgsxplore.filter import SceneFilter
from usgsxplore.utils import read_textfile, sort_strings_by_similarity, to_gpkg
from usgsxplore.utils import (
download_browse_img,
read_textfile,
save_in_gfile,
sort_strings_by_similarity,
to_gdf,
update_gdf_browse,
)


# ----------------------------------------------------------------------------------------------------
@@ -68,6 +77,13 @@ def is_text_file(ctx: click.Context, param: click.Parameter, value: str) -> str:
return value


def is_vector_file(ctx: click.Context, param: click.Parameter, value: str) -> str:
"callback for verify the validity of the vector file"
if not value.endswith((".shp", ".gpkg", ".geojson")):
raise click.BadParameter(f"'{value}' must be a vector data file (.gpkg, .shp, .geojson)", ctx=ctx, param=param)
return value


# ----------------------------------------------------------------------------------------------------
# COMMAND LINE INTERFACE
# ----------------------------------------------------------------------------------------------------
@@ -176,7 +192,8 @@ def search(
scenes = []
for batch_scenes in api.batch_search(dataset, scene_filter, limit, "full", pbar):
scenes += batch_scenes
to_gpkg(scenes, output)
gdf = to_gdf(scenes)
save_in_gfile(gdf, output)

# if the dataset is invalid, print a list of similar datasets for the user
except USGSInvalidDataset:
@@ -229,13 +246,41 @@ def download(
api.logout()


cli.add_command(search)
cli.add_command(download)
@click.command("download-browse")
@click.argument("vector-file", type=click.Path(exists=True, file_okay=True), callback=is_vector_file)
@click.option(
"--output-dir",
"-o",
type=click.Path(dir_okay=True, resolve_path=True),
default="./browse_images/",
help="Output directory",
)
@click.option("--pbar", is_flag=True, default=True, help="Display a progress bar.")
def download_browse(vector_file: str, output_dir: str, pbar: bool) -> None:
"""
Download browse images of a vector data file locally.
"""
# create the directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# read the vector file
gdf = gpd.read_file(vector_file)

@click.command()
def cli_gpkg():
click.echo("hello")
# get the list of browse_url
url_list = gdf["browse_url"].tolist()

# download the list of urls with download_browse_img
download_browse_img(url_list, output_dir, pbar)

# update the vector file with browse_path added
gdf = update_gdf_browse(gdf, output_dir)
save_in_gfile(gdf, vector_file)


cli.add_command(search)
cli.add_command(download)
cli.add_command(download_browse)


if __name__ == "__main__":
107 changes: 91 additions & 16 deletions usgsxplore/utils.py
@@ -9,27 +9,31 @@
from difflib import SequenceMatcher

import geopandas as gpd
from shapely import MultiPolygon, Polygon
import pandas as pd
import requests
from shapely import MultiPolygon, Point, Polygon
from tqdm import tqdm


def to_gpkg(scenes_metadata: list[dict], geo_file: str = "scenes.gpkg") -> None:
def to_gdf(scenes_metadata: list[dict]) -> gpd.GeoDataFrame:
"""
This function converts the scenes metadata into a geodataframe, using the spatialCoverage for the geometry

:param scenes_metadata: result of the search
:return: GeoDataFrame built from the scenes metadata
"""
geometries = []
attributes = {}

img_dir = os.path.join(os.path.dirname(geo_file), "browse-images")

# loop in every line of the scenes file
for scene in scenes_metadata:
geom_type = scene["spatialCoverage"]["type"]
if geom_type == "Polygon":
geometries.append(Polygon(scene["spatialCoverage"]["coordinates"][0]))
elif geom_type == "MultiPolygon":
geometries.append(MultiPolygon(scene["spatialCoverage"]["coordinates"]))
elif geom_type == "Point":
geometries.append(Point(scene["spatialCoverage"]["coordinates"]))
else:
continue

@@ -38,29 +42,33 @@ def to_gpkg(scenes_metadata: list[dict], geo_file: str = "scenes.gpkg") -> None:
attributes.setdefault(field.get("fieldName"), []).append(field.get("value"))

if len(scene["browse"]) > 0:
attributes.setdefault("browse_path", []).append(
os.path.join(os.path.abspath(img_dir), os.path.basename(scene["browse"][0]["browsePath"]))
)
attributes.setdefault("browse_url", []).append(scene["browse"][0]["browsePath"])
else:
attributes.setdefault("browse_path", []).append(None)
attributes.setdefault("browse_url", []).append(None)

# create geodataframe with attributes and geometries
gdf = gpd.GeoDataFrame(data=attributes, geometry=geometries, crs="EPSG:4326")
return gpd.GeoDataFrame(data=attributes, geometry=geometries, crs="EPSG:4326")


def save_in_gfile(gdf: gpd.GeoDataFrame, vector_file: str = "scenes.gpkg") -> None:
"""
This function saves the geodataframe into the given vector_file

:param gdf: geodataframe that will be saved
:param vector_file: output vector file
"""
# save the geodataframe in a geospatial file
if geo_file.endswith(".shp"):
if vector_file.endswith(".shp"):
# here we ignore warnings that tell us field names were truncated
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message=r"Normalized/laundered field name: '.+' to '.+'")
gdf.to_file(geo_file)
elif geo_file.endswith(".gpkg"):
gdf.to_file(geo_file, driver="GPKG")
elif geo_file.endswith(".geojson"):
gdf.to_file(geo_file, driver="GeoJSON")
gdf.to_file(vector_file)
elif vector_file.endswith(".gpkg"):
gdf.to_file(vector_file, driver="GPKG")
elif vector_file.endswith(".geojson"):
gdf.to_file(vector_file, driver="GeoJSON")
else:
raise ValueError(f"The file '{geo_file}' need to end with : .shp|.gpkg|.geojson")
raise ValueError(f"The file '{vector_file}' need to end with : .shp|.gpkg|.geojson")


def read_textfile(textfile: str) -> list[str]:
@@ -97,4 +105,71 @@ def sort_strings_by_similarity(ref_str: str, list_str: list[str]) -> list[str]:
return sorted_list_str


def download_browse_img(url_list: list[str], output_dir: str, pbar: bool = True) -> pd.DataFrame:
"""
Download all browse images from the url_list and put them into the output_dir.
Return a recap of the download.

:param url_list: list of all browse image urls
:param output_dir: output directory
:param pbar: if True, display a progress bar during the download
:return: dataframe recapping the download
"""
df = pd.DataFrame({"url": url_list})
df.set_index("url", inplace=True)
df = df.assign(already_download=False, status=None)

# Create a set of already downloaded files for faster lookup
already_dl_files = {file.split(".", maxsplit=1)[0] for file in os.listdir(output_dir) if file.endswith(".jpg")}

# Mark already downloaded files in the DataFrame
for url in url_list:
filename = os.path.basename(url).split(".", maxsplit=1)[0]
if filename in already_dl_files:
df.loc[url, "already_download"] = True

# create a progress_bar if pbar
if pbar:
progress_bar = tqdm(desc="Downloading images", total=len(url_list), initial=df["already_download"].sum())

# loop over urls not already downloaded; download each one and
# save its status_code in the dataframe
session = requests.Session()
# flake8: noqa E712
for url, row in df[df["already_download"] == False].iterrows():
response = session.get(url)
if response.status_code == 200:
# get the name of the image
filename = os.path.basename(url)

with open(os.path.join(output_dir, filename), "wb") as f:
f.write(response.content)
df.loc[url, "status"] = response.status_code

if pbar:
progress_bar.update()
# close the progress bar at the end of the download
if pbar:
progress_bar.close()

# return the recap
return df


def update_gdf_browse(gdf: gpd.GeoDataFrame, output_dir: str) -> gpd.GeoDataFrame:
"""
Update the given gdf by adding a new "browse_path" column pointing to the downloaded browse images.

:param gdf: the geodataframe that will be modified
:param output_dir: browse images output directory
:return: the updated gdf
"""
gdf = gdf.assign(browse_path=gdf["browse_url"])
gdf["browse_path"] = gdf["browse_path"].apply(os.path.basename)
gdf["browse_path"] = gdf["browse_path"].apply(lambda x: os.path.join(output_dir, x))

return gdf
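Taken together, the new helpers form a small pipeline. A minimal sketch of how they chain, mirroring the `download_browse` command (file names are hypothetical):

```python
import os

import geopandas as gpd

from usgsxplore.utils import download_browse_img, save_in_gfile, update_gdf_browse

vector_file = "scenes.gpkg"  # hypothetical output of the `search` command
output_dir = "browse_images"
os.makedirs(output_dir, exist_ok=True)  # download_browse_img lists this directory

gdf = gpd.read_file(vector_file)
download_browse_img(gdf["browse_url"].tolist(), output_dir, pbar=True)
gdf = update_gdf_browse(gdf, output_dir)  # adds the browse_path column
save_in_gfile(gdf, vector_file)  # overwrite the file with the new column
```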

