diff --git a/archive_query_log/cli/parsers.py b/archive_query_log/cli/parsers.py index 9b6d8fb..19923a0 100644 --- a/archive_query_log/cli/parsers.py +++ b/archive_query_log/cli/parsers.py @@ -439,11 +439,6 @@ def warc_direct_answers_add( dir_okay=False, readable=True, resolve_path=True, allow_dash=False), default=Path("data") / "selected-services.yaml") -@pass_config -def warc_direct_answers_import(config: Config, services_path: Path) -> None: - from archive_query_log.imports.yaml import import_warc_direct_answers_parsers - WarcDirectAnswersParser.init(using=config.es.client) - import_warc_direct_answers_parsers(config, services_path) @parsers.group() diff --git a/archive_query_log/imports/yaml.py b/archive_query_log/imports/yaml.py index da4d9c6..a3ffd0b 100644 --- a/archive_query_log/imports/yaml.py +++ b/archive_query_log/imports/yaml.py @@ -19,7 +19,6 @@ from archive_query_log.parsers.url_query import add_url_query_parser from archive_query_log.parsers.warc_query import add_warc_query_parser from archive_query_log.parsers.warc_snippets import add_warc_snippets_parser -from archive_query_log.parsers.warc_direct_answers import add_warc_direct_answers_parser from archive_query_log.parsers.xml import xpaths_from_css_selector, \ text_xpath, merge_xpaths from archive_query_log.providers import add_provider @@ -480,76 +479,3 @@ def import_warc_snippets_parsers(config: Config, services_path: Path) -> None: title_xpath=title_xpath, text_xpath=snippet_xpath, ) - - -def import_warc_direct_answers_parsers(config: Config, services_path: Path) -> None: - echo("Load providers from services file.") - with services_path.open("r") as file: - services_list: Sequence[dict] = safe_load(file) - echo(f"Found {len(services_list)} service definitions.") - - services: Iterable[dict] = services_list - # noinspection PyTypeChecker - services = tqdm( - services, - desc="Import parsers for providers", - unit="provider", - ) - for service in services: - if ("domains" not in service or "results_parsers" not in service): - continue - - results_parsers = service["results_parsers"] - num_results_parsers = len(results_parsers) - - providers = ( - Provider.search(using=config.es.client) - .query(Terms(domains=service["domains"])) - .scan() - ) - providers = safe_iter_scan(providers) - for provider in providers: - for k, results_parser in enumerate(results_parsers): - if results_parser["type"] != "html_selector": - continue - results_selector = results_parser["results_selector"] - url_selector = results_parser.get("url_selector") - direct_answer_selector = results_parser.get("direct_answer_selector") - - results_xpaths = xpaths_from_css_selector(results_selector) - results_xpaths = [ - "//" + result_xpath - for result_xpath in results_xpaths - ] - results_xpath = merge_xpaths(results_xpaths) - - if url_selector is not None: - url_xpaths = xpaths_from_css_selector(url_selector) - url_xpaths = [ - text_xpath(xpath, attribute="href") - for xpath in url_xpaths - ] - url_xpath = merge_xpaths(url_xpaths) - else: - url_xpath = None - - if direct_answer_selector is not None: - direct_answer_xpaths = xpaths_from_css_selector(direct_answer_selector) - direct_answer_xpaths = [ - text_xpath(xpath, text=True) - for xpath in direct_answer_xpaths - ] - direct_answer_xpath = merge_xpaths(direct_answer_xpaths) - else: - direct_answer_xpath = None - - add_warc_direct_answers_parser( - config=config, - provider_id=provider.meta.id, - url_pattern_regex=results_parser.get("url_pattern"), - priority=num_results_parsers - k, - parser_type="xpath", - xpath=results_xpath, - url_xpath=url_xpath, - text_xpath=direct_answer_xpath, - )