Skip to content

Commit

Permalink
feat: add ignore list to downloads (#115)
Browse files Browse the repository at this point in the history
* add ignore list to downloads

* lint
  • Loading branch information
pnadolny13 authored Nov 14, 2023
1 parent 290297b commit 69e7e5f
Show file tree
Hide file tree
Showing 5 changed files with 299 additions and 0 deletions.
5 changes: 5 additions & 0 deletions hub_utils/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ def download_metadata(
local_path: str,
variant_path_list: str = None,
all_sdk: bool = True,
ignore_list_str: str = "",
):
"""
NOTE: USED FOR
Expand All @@ -340,16 +341,20 @@ def download_metadata(
"""
util = Utilities()
s3 = S3()
ignore_list = ignore_list_str.split(",")
if not variant_path_list:
variant_path_list = ",".join(SDK_SUFFIX_LIST)
if all_sdk:
variant_path_list = ",".join(
[
i["plugin-name"].split(".yml")[0]
for i in util.get_variant_names(None, "sdk")
if i["plugin-name"].split(".yml")[0] not in ignore_list
]
)
for yaml_file in variant_path_list.split(","):
if not yaml_file:
continue
suffix = util.get_suffix(yaml_file)
local_file_path = f"{local_path}/{suffix}.json"
s3.download_latest(os.environ.get("AWS_S3_BUCKET"), suffix, local_file_path)
Expand Down
127 changes: 127 additions & 0 deletions tests/_data/meltano/extractors/tap-github/meltanolabs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
capabilities:
- about
- batch
- catalog
- discover
- schema-flattening
- state
- stream-maps
description: Code hosting platform
domain_url: https://docs.github.com/en/rest
keywords:
- api
- free service
- meltano_sdk
label: GitHub
logo_url: /assets/logos/extractors/github.png
maintenance_status: active
name: tap-github
namespace: tap_github
pip_url: git+https://github.com/MeltanoLabs/tap-github.git
quality: gold
repo: https://github.com/MeltanoLabs/tap-github
select:
- '*.*'
- '!traffic_*.*'
settings:
- description: List of GitHub tokens to authenticate with. Streams will loop through
them when hitting rate limits.
kind: array
label: Additional Auth Tokens
name: additional_auth_tokens
- description: GitHub token to authenticate with.
kind: password
label: Auth Token
name: auth_token
- description: Compression format to use for batch files.
kind: options
label: Batch Config Encoding Compression
name: batch_config.encoding.compression
options:
- label: Gzip
value: gzip
- label: None
value: none
- description: Format to use for batch files.
kind: options
label: Batch Config Encoding Format
name: batch_config.encoding.format
options:
- label: Jsonl
value: jsonl
- description: Prefix to use when writing batch files.
kind: string
label: Batch Config Storage Prefix
name: batch_config.storage.prefix
- description: Root path to use when writing batch files.
kind: string
label: Batch Config Storage Root
name: batch_config.storage.root
- description: "'True' to enable schema flattening and automatically expand nested
properties."
kind: boolean
label: Flattening Enabled
name: flattening_enabled
- description: The max depth to flatten schemas.
kind: integer
label: Flattening Max Depth
name: flattening_max_depth
- description: The log level of the API response metrics.
kind: string
label: Metrics Log Level
name: metrics_log_level
- description: An array of strings containing the github organizations to be included
kind: array
label: Organizations
name: organizations
- description: Add a buffer to avoid consuming all query points for the token at hand.
Defaults to 1000.
kind: integer
label: Rate Limit Buffer
name: rate_limit_buffer
- description: An array of strings containing the github repos to be included
kind: array
label: Repositories
name: repositories
- description: An array of search descriptor objects with the following properties.
"name" - a human readable name for the search query. "query" - a github search
string (generally the same as would come after ?q= in the URL)
kind: array
label: Searches
name: searches
- description: Set to true to skip API calls for the parent streams (such as repositories)
if it is not selected but children are
kind: boolean
label: Skip Parent Streams
name: skip_parent_streams
- description: ''
kind: date_iso8601
label: Start Date
name: start_date
- description: ''
kind: object
label: Stream Map Config
name: stream_map_config
- description: ''
kind: object
label: Stream Maps
name: stream_maps
- description: ''
kind: string
label: User Agent
name: user_agent
- description: A list of GitHub user ids.
kind: array
label: User IDs
name: user_ids
- description: A list of GitHub usernames.
kind: array
label: User Usernames
name: user_usernames
settings_group_validation:
- - repositories
- - organizations
- - searches
- - user_usernames
- - user_ids
variant: meltanolabs
57 changes: 57 additions & 0 deletions tests/_data/meltano/extractors/tap-hubspot/hotgluexyz.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
capabilities:
- about
- discover
- catalog
- state
- stream-maps
- schema-flattening
description: Inbound Marketing software
domain_url: https://developers.hubspot.com/docs/api/overview
executable: tap-hubspot-beta
keywords:
- api
- meltano_sdk
label: Hubspot
logo_url: /assets/logos/extractors/hubspot.png
maintenance_status: active
name: tap-hubspot
namespace: tap_hubspot
pip_url: git+https://gitlab.com/hotglue/tap-hubspot-beta.git
quality: gold
repo: https://gitlab.com/hotglue/tap-hubspot-beta
settings:
- description: HubSpot Access token. See the <a href="https://developers.hubspot.com/docs/api/private-apps">Hubspot
docs</a> if you need help finding this token.
kind: password
label: Access Token
name: access_token
- description: The client ID used for authentication.
documentation: https://developers.hubspot.com/docs/api/working-with-oauth
label: Client ID
name: client_id
- description: The client secret used for authentication.
kind: password
label: Client Secret
name: client_secret
- description: This is the URL that the user will be redirected to after they authorize
your app for the requested scopes
documentation: https://developers.hubspot.com/docs/api/working-with-oauth
label: Redirect URI
name: redirect_uri
- description: This is the refresh token provided by HubSpot.
kind: password
label: Refresh Token
name: refresh_token
- description: The seconds until the token expires.
kind: integer
label: Expires In
name: expires_in
- description: The time to start syncing data from if no existing state is found.
label: Start Date
name: start_date
settings_group_validation:
- - client_id
- client_secret
- redirect_uri
- refresh_token
variant: hotgluexyz
82 changes: 82 additions & 0 deletions tests/_data/meltano/extractors/tap-hubspot/meltanolabs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
capabilities:
- about
- batch
- catalog
- discover
- schema-flattening
- state
- stream-maps
description: Inbound Marketing software
domain_url: https://developers.hubspot.com/docs/api/overview
executable: tap-hubspot
keywords:
- meltano_sdk
label: Hubspot
logo_url: /assets/logos/extractors/hubspot.png
maintenance_status: active
name: tap-hubspot
namespace: tap_hubspot
next_steps: ''
pip_url: git+https://github.com/MeltanoLabs/tap-hubspot.git
quality: gold
repo: https://github.com/MeltanoLabs/tap-hubspot
settings:
- description: Token to authenticate against the API service
kind: password
label: Access Token
name: access_token
- description: Compression format to use for batch files.
kind: options
label: Batch Config Encoding Compression
name: batch_config.encoding.compression
options:
- label: Gzip
value: gzip
- label: None
value: none
- description: Format to use for batch files.
kind: options
label: Batch Config Encoding Format
name: batch_config.encoding.format
options:
- label: Jsonl
value: jsonl
- description: Prefix to use when writing batch files.
kind: string
label: Batch Config Storage Prefix
name: batch_config.storage.prefix
- description: Root path to use when writing batch files.
kind: string
label: Batch Config Storage Root
name: batch_config.storage.root
- description: Latest record date to sync
kind: date_iso8601
label: End Date
name: end_date
- description: "'True' to enable schema flattening and automatically expand nested
properties."
kind: boolean
label: Flattening Enabled
name: flattening_enabled
- description: The max depth to flatten schemas.
kind: integer
label: Flattening Max Depth
name: flattening_max_depth
- description: Earliest record date to sync
kind: date_iso8601
label: Start Date
name: start_date
- description: User-defined config values to be used within map expressions.
kind: object
label: Stream Map Config
name: stream_map_config
- description: Config object for stream maps capability. For more information check
out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html).
kind: object
label: Stream Maps
name: stream_maps
settings_group_validation:
- - access_token
settings_preamble: ''
usage: ''
variant: meltanolabs
28 changes: 28 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,34 @@ def test_download_metadata(patch):
f"{local_path}/extractors/tap-csv/meltanolabs.json"
)

@patch.object(S3, "download_latest")
def test_download_metadata_ignore(patch):
    """Check that ``ignore_list_str`` excludes a variant from an all-SDK download.

    ``download_metadata`` is called with ``all_sdk=True`` and
    ``HUB_ROOT_PATH`` set to ``./tests/``, while ignoring
    ``extractors/tap-hubspot/hotgluexyz``. ``S3.download_latest`` is mocked
    (received here as ``patch``); exactly two downloads are expected, one for
    each ``meltanolabs`` variant of tap-hubspot and tap-github.
    """
    expected_bucket = "TEST_BUCKET"
    os.environ["AWS_S3_BUCKET"] = expected_bucket
    os.environ["HUB_ROOT_PATH"] = "./tests/"
    local_path = f"{PATH}/data/output_path"

    download_metadata(
        local_path,
        all_sdk=True,
        ignore_list_str="extractors/tap-hubspot/hotgluexyz",
    )

    # call_count pins the total number of downloads, so the ignored variant
    # cannot slip through. any_order=True avoids coupling the test to the
    # order in which variants happen to be discovered, which is not what
    # this test is about.
    assert patch.call_count == 2
    patch.assert_has_calls(
        [
            call(
                expected_bucket,
                "extractors/tap-hubspot/meltanolabs",
                f"{local_path}/extractors/tap-hubspot/meltanolabs.json",
            ),
            call(
                expected_bucket,
                "extractors/tap-github/meltanolabs",
                f"{local_path}/extractors/tap-github/meltanolabs.json",
            ),
        ],
        any_order=True,
    )

@patch.object(S3, "download_latest")
def test_download_metadata_list(patch):

Expand Down

0 comments on commit 69e7e5f

Please sign in to comment.