Download browse #2

Merged
merged 6 commits into from Aug 1, 2024
Changes from 3 commits
20 changes: 20 additions & 0 deletions README.md
@@ -33,6 +33,9 @@ The package can be installed using pip.

```bash
pip install usgsxplore

# or with pipx
pipx install usgsxplore
```

# Usage
@@ -56,6 +59,7 @@ Options:

Commands:
download Download scenes with their entity ids provided in the textfile.
download-browse Download browse images of a vector data file locally.
search Search scenes in a dataset with filters.
```

@@ -146,3 +150,19 @@ This command downloads scenes from their entity ids in the `TEXTFILE` and save th
- **0** : display nothing.
- **1** : display one progress bar for the whole download.
- **2** : display one progress bar per scene, with state information.
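
For example, assuming a text file `ids.txt` with one entity id per line (any authentication options required by the CLI are omitted here):

```bash
usgsxplore download ids.txt
```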

### Download-browse

```bash
usgsxplore download-browse --help
```
```
Usage: usgsxplore download-browse [OPTIONS] VECTOR_FILE

Download browse images of a vector data file locally.

Options:
-o, --output-dir PATH Output directory
--pbar Display a progress bar.
--help Show this message and exit.
```
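
For example, to fetch the browse images referenced in a GeoPackage produced by the `search` command (the file name here is illustrative):

```bash
usgsxplore download-browse scenes.gpkg -o browse_images/
```

The command also writes a `browse_path` column back into the vector file, pointing at the downloaded images.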
2 changes: 1 addition & 1 deletion tests/test_api.py
@@ -69,7 +69,7 @@ def test_scene_search(self):
result = self.api.scene_search("landsat_tm_c2_l1", max_results=1, metadata_type=None)

assert result["recordsReturned"] == 1
assert result["totalHits"] == 2940421
assert result["totalHits"] > 2940000
assert result["startingNumber"] == 1
assert result["results"][0]["metadata"] == []

28 changes: 21 additions & 7 deletions tests/test_utils.py
@@ -11,7 +11,12 @@
import pytest

from usgsxplore.api import API
from usgsxplore.utils import read_textfile, sort_strings_by_similarity, to_gpkg
from usgsxplore.utils import (
read_textfile,
save_in_gfile,
sort_strings_by_similarity,
to_gdf,
)


@pytest.fixture(scope="module")
@@ -24,20 +29,29 @@ def scenes_metadata() -> list[dict]:
return scenes


def test_to_gpkg(scenes_metadata: list[dict]):
"Test the to_gpkg functions"
def test_to_gdf(scenes_metadata: list[dict]) -> None:
"Test the to_gdf function"
gdf = to_gdf(scenes_metadata)
assert gdf.shape[0] == 10
assert gdf.shape[1] == 35


def test_save_in_gfile(scenes_metadata: list[dict]):
"Test the save_in_gfile functions"
gdf = to_gdf(scenes_metadata)

with TemporaryDirectory() as tmpdir:
gpkg_file = os.path.join(tmpdir, "tmp.gpkg")
shapefile = os.path.join(tmpdir, "tmp.shp")
geojson = os.path.join(tmpdir, "tmp.geojson")
invalid_file = os.path.join(tmpdir, "tmp.invalid")

to_gpkg(scenes_metadata, gpkg_file)
save_in_gfile(gdf, gpkg_file)
with pytest.warns(UserWarning):
to_gpkg(scenes_metadata, shapefile)
to_gpkg(scenes_metadata, geojson)
save_in_gfile(gdf, shapefile)
save_in_gfile(gdf, geojson)
with pytest.raises(ValueError):
to_gpkg(scenes_metadata, invalid_file)
save_in_gfile(gdf, invalid_file)

assert os.path.exists(gpkg_file)
assert os.path.exists(shapefile)
59 changes: 52 additions & 7 deletions usgsxplore/cli.py
@@ -7,13 +7,22 @@
Author: Luc Godin
"""
import json
import os

import click
import geopandas as gpd

from usgsxplore.api import API
from usgsxplore.errors import FilterFieldError, FilterValueError, USGSInvalidDataset
from usgsxplore.filter import SceneFilter
from usgsxplore.utils import read_textfile, sort_strings_by_similarity, to_gpkg
from usgsxplore.utils import (
download_browse_img,
read_textfile,
save_in_gfile,
sort_strings_by_similarity,
to_gdf,
update_gdf_browse,
)


# ----------------------------------------------------------------------------------------------------
@@ -68,6 +77,13 @@ def is_text_file(ctx: click.Context, param: click.Parameter, value: str) -> str:
return value


def is_vector_file(ctx: click.Context, param: click.Parameter, value: str) -> str:
"callback for verify the validity of the vector file"
if not value.endswith((".shp", ".gpkg", ".geojson")):
raise click.BadParameter(f"'{value}' must be a vector data file (.gpkg, .shp, .geojson)", ctx=ctx, param=param)
return value


# ----------------------------------------------------------------------------------------------------
# COMMAND LINE INTERFACE
# ----------------------------------------------------------------------------------------------------
@@ -176,7 +192,8 @@ def search(
scenes = []
for batch_scenes in api.batch_search(dataset, scene_filter, limit, "full", pbar):
scenes += batch_scenes
to_gpkg(scenes, output)
gdf = to_gdf(scenes)
save_in_gfile(gdf, output)

# if dataset is invalid print a list of similar dataset for the user
except USGSInvalidDataset:
@@ -229,13 +246,41 @@ def download(
api.logout()


cli.add_command(search)
cli.add_command(download)
@click.command("download-browse")
@click.argument("vector-file", type=click.Path(exists=True, file_okay=True), callback=is_vector_file)
@click.option(
"--output-dir",
"-o",
type=click.Path(dir_okay=True, resolve_path=True),
default="./browse_images/",
help="Output directory",
)
@click.option("--pbar", is_flag=True, default=True, help="Display a progress bar.")
def download_browse(vector_file: str, output_dir: str, pbar: bool) -> None:
"""
Download browse images of a vector data file locally.
"""
# create the directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# read the vector file
gdf = gpd.read_file(vector_file)
print(gdf.shape)

@click.command()
def cli_gpkg():
click.echo("hello")
# get the list of browse_url
url_list = gdf["browse_url"].tolist()

# download the list of urls with download_browse_img
dl_recap = download_browse_img(url_list, output_dir, pbar)

# update the vector file with browse_path added
gdf = update_gdf_browse(gdf, dl_recap, output_dir)
save_in_gfile(gdf, vector_file)


cli.add_command(search)
cli.add_command(download)
cli.add_command(download_browse)


if __name__ == "__main__":
107 changes: 91 additions & 16 deletions usgsxplore/utils.py
@@ -9,27 +9,31 @@
from difflib import SequenceMatcher

import geopandas as gpd
from shapely import MultiPolygon, Polygon
import pandas as pd
import requests
from shapely import MultiPolygon, Point, Polygon
from tqdm import tqdm


def to_gpkg(scenes_metadata: list[dict], geo_file: str = "scenes.gpkg") -> None:
def to_gdf(scenes_metadata: list[dict]) -> gpd.GeoDataFrame:
"""
Convert the scenes metadata into a GeoDataFrame, using the spatialCoverage for the geometry

:param scenes_metadata: result of the search
:return: GeoDataFrame to generate a geopackage
"""
geometries = []
attributes = {}

img_dir = os.path.join(os.path.dirname(geo_file), "browse-images")

# loop over every scene in the metadata list
for scene in scenes_metadata:
geom_type = scene["spatialCoverage"]["type"]
if geom_type == "Polygon":
geometries.append(Polygon(scene["spatialCoverage"]["coordinates"][0]))
elif geom_type == "MultiPolygon":
geometries.append(MultiPolygon(scene["spatialCoverage"]["coordinates"]))
elif geom_type == "Point":
geometries.append(Point(scene["spatialCoverage"]["coordinates"]))
else:
continue

@@ -38,29 +42,33 @@ def to_gpkg(scenes_metadata: list[dict], geo_file: str = "scenes.gpkg") -> None:
attributes.setdefault(field.get("fieldName"), []).append(field.get("value"))

if len(scene["browse"]) > 0:
attributes.setdefault("browse_path", []).append(
os.path.join(os.path.abspath(img_dir), os.path.basename(scene["browse"][0]["browsePath"]))
)
attributes.setdefault("browse_url", []).append(scene["browse"][0]["browsePath"])
else:
attributes.setdefault("browse_path", []).append(None)
attributes.setdefault("browse_url", []).append(None)

# create geodataframe with attributes and geometries
gdf = gpd.GeoDataFrame(data=attributes, geometry=geometries, crs="EPSG:4326")
return gpd.GeoDataFrame(data=attributes, geometry=geometries, crs="EPSG:4326")


def save_in_gfile(gdf: gpd.GeoDataFrame, vector_file: str = "scenes.gpkg") -> None:
"""
This function saves the geodataframe into the given vector_file

:param gdf: geodataframe that will be saved
:param vector_file: output vector file
"""
# save the geodataframe in a geospatial file
if geo_file.endswith(".shp"):
if vector_file.endswith(".shp"):
# here we ignore warnings that tell us field names are truncated
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message=r"Normalized/laundered field name: '.+' to '.+'")
gdf.to_file(geo_file)
elif geo_file.endswith(".gpkg"):
gdf.to_file(geo_file, driver="GPKG")
elif geo_file.endswith(".geojson"):
gdf.to_file(geo_file, driver="GeoJSON")
gdf.to_file(vector_file)
elif vector_file.endswith(".gpkg"):
gdf.to_file(vector_file, driver="GPKG")
elif vector_file.endswith(".geojson"):
gdf.to_file(vector_file, driver="GeoJSON")
else:
raise ValueError(f"The file '{geo_file}' need to end with : .shp|.gpkg|.geojson")
raise ValueError(f"The file '{vector_file}' need to end with : .shp|.gpkg|.geojson")


def read_textfile(textfile: str) -> list[str]:
@@ -97,4 +105,71 @@ def sort_strings_by_similarity(ref_str: str, list_str: list[str]) -> list[str]:
return sorted_list_str


def download_browse_img(url_list: list[str], output_dir: str, pbar: bool = True) -> pd.DataFrame:
"""
Download all browse images from url_list and put them into output_dir.
Return a recap of the download.

:param url_list: list of all browse images url
:param output_dir: output directory
:param pbar: if True display a progress bar of the downloading
:return: dataframe recapping the download
"""
df = pd.DataFrame({"url": url_list})
df.set_index("url", inplace=True)
df = df.assign(already_download=False, status=None)

# Create a set of already downloaded files for faster lookup
already_dl_files = {file.split(".", maxsplit=1)[0] for file in os.listdir(output_dir) if file.endswith(".jpg")}

# Mark already downloaded files in the DataFrame
for url in url_list:
filename = os.path.basename(url).split(".", maxsplit=1)[0]
if filename in already_dl_files:
df.loc[url, "already_download"] = True

# create a progress_bar if pbar
if pbar:
progress_bar = tqdm(desc="Downloading images", total=len(url_list), initial=df["already_download"].sum())

# loop over the URLs not already downloaded; download each one and
# record its status code in the dataframe
session = requests.Session()
# flake8: noqa E712
for url, row in df[df["already_download"] == False].iterrows():
response = session.get(url)
if response.status_code == 200:
# get the name of the image
filename = os.path.basename(url)

with open(os.path.join(output_dir, filename), "wb") as f:
f.write(response.content)
df.loc[url, "status"] = response.status_code

if pbar:
progress_bar.update()
# close the progress bar at the end of the downloading
if pbar:
progress_bar.close()

# return the recap
return df


def update_gdf_browse(gdf: gpd.GeoDataFrame, dl_recap: pd.DataFrame, output_dir: str) -> gpd.GeoDataFrame:
"""
Update the given gdf to add a new column "browse_path" pointing to the downloaded browse images.

:param gdf: the geodataframe that will be modified
:param dl_recap: recap of the downloading (output of download_browse_img)
:param output_dir: browse output_dir
:return: the updated gdf
"""
gdf = gdf.assign(browse_path=gdf["browse_url"])
gdf["browse_path"] = gdf["browse_path"].apply(os.path.basename)
gdf["browse_path"] = gdf["browse_path"].apply(lambda x: os.path.join(output_dir, x))

return gdf


# End-of-file (EOF)
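
Taken together, the new utility functions compose into the same pipeline that the `download-browse` command runs. A minimal sketch, assuming a GeoPackage produced by `usgsxplore search` (file names are illustrative):

```python
import os

import geopandas as gpd

from usgsxplore.utils import download_browse_img, save_in_gfile, update_gdf_browse

vector_file = "scenes.gpkg"  # illustrative file name
output_dir = "./browse_images/"

# download_browse_img scans output_dir for already-downloaded images,
# so the directory must exist before the call
os.makedirs(output_dir, exist_ok=True)

gdf = gpd.read_file(vector_file)

# download every browse image referenced in the file
dl_recap = download_browse_img(gdf["browse_url"].tolist(), output_dir, pbar=True)

# record the local image paths in a new "browse_path" column and save
gdf = update_gdf_browse(gdf, dl_recap, output_dir)
save_in_gfile(gdf, vector_file)
```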