From 3656951259d829c5c27321423eb7d21a21dd03dc Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 22 Aug 2024 16:50:46 +0200 Subject: [PATCH 01/12] Initial support for scraper ML price extraction integration --- changedetectionio/flask_app.py | 9 +++ .../processors/restock_diff/processor.py | 57 +++++++++++++++++++ .../templates/watch-overview.html | 2 + docker-compose.yml | 11 ++++ requirements.txt | 2 +- 5 files changed, 80 insertions(+), 1 deletion(-) diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index fd12393ad08..e6e7c6944e1 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -1602,6 +1602,15 @@ def form_watch_list_checkbox_operations(): flash(f"{len(uuids)} watches were tagged") + elif op.startswith('mode:'): + mode = op.replace('mode:','') + for uuid in uuids: + uuid = uuid.strip() + if datastore.data['watching'].get(uuid): + datastore.data['watching'][uuid]['processor'] = mode + flash(f"{len(uuids)} watches changed modes") + + return redirect(url_for('index')) @app.route("/api/share-url", methods=['GET']) diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index b2184e35376..52f4d11b08f 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -3,10 +3,13 @@ from . import Restock from loguru import logger import hashlib +import os import re import urllib3 import time +from ...html_tools import html_to_text + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) name = 'Re-stock & Price detection for single product pages' description = 'Detects if the product goes back to in-stock' @@ -118,6 +121,45 @@ class perform_site_check(difference_detection_processor): screenshot = None xpath_data = None + def ML_scrape_for_price_data(self, ML_price_scraper_url): + import requests + from changedetectionio import html_tools + + price_info = None + + # Perform the POST request + response = requests.post(ML_price_scraper_url, json=self.fetcher.xpath_data) + logger.debug(f"ML Price scraper - {ML_price_scraper_url} Response OK? 
- '{response.ok}'") + # Check if the response contains a dict + if response.ok: # This checks if the request was successful (status code 200-299) + response_json = response.json() + logger.debug(f"ML Price scraper: response - {response_json}'") + if isinstance(response_json, dict) and 'idx' in response_json.keys(): + suggested_xpath_idx = response_json.get('idx') + + # Use the path provided to extra the price text + from price_parser import Price + scrape_element = self.fetcher.xpath_data.get('size_pos', {})[suggested_xpath_idx] + result_s = None + if scrape_element['xpath'][0] == '/' or scrape_element['xpath'].startswith('xpath'): + result_s = html_tools.xpath_filter(xpath_filter=scrape_element['xpath'], + html_content=self.fetcher.content) + else: + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + result_s = html_tools.include_filters(include_filters=scrape_element['xpath'], + html_content=self.fetcher.content) + + if result_s: + text = html_to_text(result_s) + if text: + price_info = Price.fromstring(text) + else: + logger.error(f"ML Price scraper: missing xpath index (IDX) in response?") + else: + print(f"ML Price scraper: Request failed with status code: {response.status_code}") + + return price_info + def run_changedetection(self, watch, skip_when_checksum_same=True): if not watch: raise Exception("Watch no longer exists.") @@ -174,6 +216,21 @@ def run_changedetection(self, watch, skip_when_checksum_same=True): else: update_obj['restock']['in_stock'] = False + # Attempt to pass the elements off to the machine-learning endpoint if its enabled + # This might return a confident guess as to which element contains the price data + if not itemprop_availability.get('price'): + ML_price_scraper_url = os.getenv("PRICE_SCRAPER_ML_ENDPOINT") + if self.fetcher.xpath_data and ML_price_scraper_url: + price_info = self.ML_scrape_for_price_data(ML_price_scraper_url) + if price_info and price_info.amount: + logger.success(f"ML Price scraper: Got price data {price_info}") + itemprop_availability['price'] = f"{price_info.amount}" + update_obj['restock']['price'] = f"{price_info.amount}" + if price_info and price_info.currency: + itemprop_availability['currency'] = price_info.currency + update_obj['restock']['currency'] = price_info.currency + + # Main detection method fetched_md5 = None diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 736e19da1ba..840d9508b24 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -37,6 +37,8 @@ + + diff --git a/docker-compose.yml b/docker-compose.yml index 2480a33994e..b3441ac389a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,6 +57,10 @@ services: # # Absolute minimum seconds to recheck, overrides any watch minimum, change to 0 to disable # - MINIMUM_SECONDS_RECHECK_TIME=3 + # + # Scrape prices from web pages automatically where the page has no embedded price information (see below also) + # - PRICE_SCRAPER_ML_ENDPOINT=http://cdio-price-scraper:5005 + # Comment out ports: when using behind a reverse proxy , enable networks: etc. 
ports: - 5000:5000 @@ -103,6 +107,13 @@ services: # # Workaround to avoid the browser crashing inside a docker container # # See https://github.com/SeleniumHQ/docker-selenium#quick-start # - /dev/shm:/dev/shm +# restart: unless-stopped + + # Machine Learning/AI - Use "Visual Selector" elements data to scrape price data + +# cdio-keras-price-scraper: +# hostname: cdio-price-scraper +# image: dgtlmoon/changedetection-AI-pricescraper # restart: unless-stopped volumes: diff --git a/requirements.txt b/requirements.txt index 537c3f802ba..6021253f6d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -82,7 +82,7 @@ pytest-flask ~=1.2 # Anything 4.0 and up but not 5.0 jsonschema ~= 4.0 - +price_parser loguru # For scraping all possible metadata relating to products so we can do better restock detection From c0e9846a8597e7bd60cbd4530c747bd6b04bc359 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 11:31:35 +0200 Subject: [PATCH 02/12] Adding more training data --- .../res/xpath_element_scraper.js | 59 ++++++++++++++++++- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index ccd89436973..f6bd12cdb58 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -77,7 +77,56 @@ const findUpTag = (el) => { } return null; } +// Text width scraper for ML training/detection +// Create a single canvas and get its 2D context +const canvas = document.createElement("canvas"); +const context = canvas.getContext("2d"); +// Function to get the width and height of the text inside an element and round them to the nearest integer +function getTextWidthAndHeightinPx(element) { + // Set the font to match the style of the text in the element + context.font = window.getComputedStyle(element).font; + + // Get the text inside the element + const text = element.textContent || element.innerText; + + // Measure the text width + const metrics = context.measureText(text); + const width = Math.round(metrics.width); + + // Get the font size from the computed style + const fontSize = parseFloat(window.getComputedStyle(element).fontSize); + const height = Math.round(fontSize); // Using font size as an approximation of height + + // Return both width and height as an object + return { textWidth: width, textHeight: height }; +} + + +// Function to determine which RGB value is the highest, or return 0 if they are all the same +function getDominantColorValue(element) { + // Get the computed style of the element to get the color property + const computedStyle = window.getComputedStyle(element); + const color = computedStyle.color; + + // Extract the RGB values from the color string (format: rgb(r, g, b)) + const rgbValues = color.match(/\d+/g).map(Number); + const [red, green, blue] = rgbValues; + + // Check if all values are the same + if (red === green && green === blue) { + return 0; // All RGB values are the same + } + + // Determine which value is the highest and return the corresponding number + if (red > green && red > blue) { + return 1; // Red is highest + } else if (green > red && green > blue) { + return 2; // Green is highest + } else { + return 3; // Blue is highest + } +} // @todo - if it's SVG or IMG, go into image diff mode // %ELEMENTS% replaced at injection time because different interfaces use it with different settings @@ -164,7 +213,7 @@ 
visibleElementsArray.forEach(function (element) { } } - let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now + let label = "none" // A placeholder, the actual labels for training are done by hand for now let text = element.textContent.trim().slice(0, 30).trim(); while (/\n{2,}|\t{2,}/.test(text)) { @@ -172,7 +221,10 @@ visibleElementsArray.forEach(function (element) { } // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. - const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ; + // @todo could be instead of USD/AUD etc [A-Z]{2,3} ? + const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ; + // Sizing of the actual text inside the element can be very different from the elements size + const { textWidth, textHeight } = getTextWidthAndHeightinPx(element); size_pos.push({ xpath: xpath_result, @@ -189,6 +241,9 @@ visibleElementsArray.forEach(function (element) { fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), hasDigitCurrency: hasDigitCurrency, + textColorClass: getDominantColorValue(element), + textWidth: textWidth, + textHeight: textHeight, label: label, }); From d7160d79bd7ef0e7dfe3f0350db7471745836364 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 12:25:26 +0200 Subject: [PATCH 03/12] Use integer value for ML of the r,g,b --- .../res/xpath_element_scraper.js | 40 +++++++------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index f6bd12cdb58..17ac125ba54 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -103,31 +103,6 @@ function getTextWidthAndHeightinPx(element) { } -// Function to determine which RGB value is the highest, or return 0 if they are all the same -function getDominantColorValue(element) { - // Get the computed style of the element to get the color property - const computedStyle = window.getComputedStyle(element); - const color = computedStyle.color; - - // Extract the RGB values from the color string (format: rgb(r, g, b)) - const rgbValues = color.match(/\d+/g).map(Number); - const [red, green, blue] = rgbValues; - - // Check if all values are the same - if (red === green && green === blue) { - return 0; // All RGB values are the same - } - - // Determine which value is the highest and return the corresponding number - if (red > green && red > blue) { - return 1; // Red is highest - } else if (green > red && green > blue) { - return 2; // Green is highest - } else { - return 3; // Blue is highest - } -} - // @todo - if it's SVG or IMG, go into image diff mode // %ELEMENTS% replaced at injection time because different interfaces use it with different settings @@ -226,6 +201,17 @@ visibleElementsArray.forEach(function (element) { // Sizing of the actual text inside the element can be very different from the elements size const { textWidth, textHeight } = getTextWidthAndHeightinPx(element); + const computedStyle = window.getComputedStyle(element); + let red, green, blue; + + if (text.length) { + // Extract the RGB values from the color 
string (format: rgb(r, g, b)) + [red, green, blue] = computedStyle.color.match(/\d+/g).map(Number); + } else { + // Assign default values if text is empty + [red, green, blue] = [0, 0, 0]; + } + size_pos.push({ xpath: xpath_result, width: Math.round(bbox['width']), @@ -241,9 +227,11 @@ visibleElementsArray.forEach(function (element) { fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), hasDigitCurrency: hasDigitCurrency, - textColorClass: getDominantColorValue(element), textWidth: textWidth, textHeight: textHeight, + t_r: red, + t_g: green, + t_b: blue, label: label, }); From 106f258d13ca7263f4063e624f5739b1e6fc5b0f Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 14:27:12 +0200 Subject: [PATCH 04/12] add delay --- changedetectionio/tests/test_nonrenderable_pages.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/changedetectionio/tests/test_nonrenderable_pages.py b/changedetectionio/tests/test_nonrenderable_pages.py index 3757eb6edf4..ea408b8b9c7 100644 --- a/changedetectionio/tests/test_nonrenderable_pages.py +++ b/changedetectionio/tests/test_nonrenderable_pages.py @@ -2,6 +2,7 @@ from flask import url_for from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks +import time def set_nonrenderable_response(): test_return_data = """ @@ -90,6 +91,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure # A totally zero byte (#2528) response should also not trigger an error set_zero_byte_response() + time.sleep(2) client.get(url_for("form_watch_checknow"), follow_redirects=True) wait_for_all_checks(client) res = client.get(url_for("index")) From bf5b1143e33b091f1a9eb7069be2245462730141 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 18:31:54 +0200 Subject: [PATCH 05/12] Adding test --- .../test-stack-reusable-workflow.yml | 9 ++ .../test_scrape_price_element.py | 124 ++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index f2864680cef..4e35995f2e3 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -52,6 +52,10 @@ jobs: docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest + # CDIO AI Element scraper for prices + # Run CDIO with PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element + docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/dgtlmoon/changedetection.io-ai:latest + - name: Spin up ancillary SMTP+Echo message test server run: | # Debug SMTP server/echo message back server @@ -95,6 +99,11 @@ jobs: # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network 
changedet-network test-changedetectionio bash -c 'find .; cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py; pwd;find .' +# PLAYWRIGHT/NODE-> CDP + - name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai + run: | + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PLAYWRIGHT_DRIVER_URL=ws://PRICE_SCRAPER_ML_ENDPOINT:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ai-price-scraper/test_scrape_price_element.py' + - name: Playwright and SocketPuppetBrowser - Restock detection run: | # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it diff --git a/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py b/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py new file mode 100644 index 00000000000..1ad8581f2f8 --- /dev/null +++ b/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py @@ -0,0 +1,124 @@ + +#!/usr/bin/env python3 +import os +import time + +from flask import url_for +from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client + + +# No semantic data just some text, we should be able to find the product price. +def set_response(price="121.95"): + html_content = f""" + + + + + + Ajax Widget + + + +
+        <div>
+            <h1>Ajax Widget</h1>
+            <p>The Ajax Widget is the ultimate solution for all your widget needs. Crafted with precision and using the latest technology, this widget offers unmatched performance and durability. Whether you're using it for personal or professional purposes, the Ajax Widget will not disappoint. It's easy to use, reliable, and comes with a sleek design that complements any setup. Don't settle for less; get the best with the Ajax Widget today!</p>
+            <div>
+                <span>${price}</span>
+                <button>Buy Now</button>
+                <span>IN STOCK</span>
+            </div>
+        </div>
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(html_content) + time.sleep(1) + return None + + + + +def test_restock_itemprop_basic(client, live_server): + + # needs to be set and something like 'ws://127.0.0.1:3000' + assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" + assert os.getenv('PRICE_SCRAPER_ML_ENDPOINT'), "Needs PRICE_SCRAPER_ML_ENDPOINT set for this test" + + + live_server_setup(live_server) + + set_response(price="123.99") + + test_url = url_for('test_endpoint', _external=True) + + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + wait_for_all_checks(client) + res = client.get(url_for("index")) + + assert b'123.99' in res.data + assert b' in-stock' in res.data + assert b' not-in-stock' not in res.data + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data From 72579d8ea2de36e071c0effe7cf70a3dedf838fa Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 18:38:51 +0200 Subject: [PATCH 06/12] woops --- .github/workflows/test-stack-reusable-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index 4e35995f2e3..9e2b337c3f8 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -54,7 +54,7 @@ jobs: # CDIO AI Element scraper for prices # Run CDIO with PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element - docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/dgtlmoon/changedetection.io-ai:latest + docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/changedetection.io-ai:latest - name: Spin up ancillary SMTP+Echo message test server run: | From 5fa841637e034c558580305f6576997f2f88f549 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 19:53:14 +0200 Subject: [PATCH 07/12] fix imports, paths --- .github/workflows/test-stack-reusable-workflow.yml | 2 +- changedetectionio/tests/conftest.py | 2 +- changedetectionio/tests/fetchers/test_content.py | 1 - .../test_scrape_price_element.py | 8 ++++---- 4 files changed, 6 insertions(+), 7 deletions(-) rename changedetectionio/tests/{ai-price-scraper => ml_price_scraper}/test_scrape_price_element.py (95%) diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index 9e2b337c3f8..1556560a1cd 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -102,7 +102,7 @@ jobs: # PLAYWRIGHT/NODE-> CDP - name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai run: | - docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PLAYWRIGHT_DRIVER_URL=ws://PRICE_SCRAPER_ML_ENDPOINT:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ai-price-scraper/test_scrape_price_element.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e 
"PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PLAYWRIGHT_DRIVER_URL=ws://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py' - name: Playwright and SocketPuppetBrowser - Restock detection run: | diff --git a/changedetectionio/tests/conftest.py b/changedetectionio/tests/conftest.py index 50f7104b145..1a0cb804193 100644 --- a/changedetectionio/tests/conftest.py +++ b/changedetectionio/tests/conftest.py @@ -8,7 +8,7 @@ from changedetectionio import store import os import sys -from loguru import logger + # https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py # Much better boilerplate than the docs diff --git a/changedetectionio/tests/fetchers/test_content.py b/changedetectionio/tests/fetchers/test_content.py index 8d468cd4b52..569703caf15 100644 --- a/changedetectionio/tests/fetchers/test_content.py +++ b/changedetectionio/tests/fetchers/test_content.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import time from flask import url_for from ..util import live_server_setup, wait_for_all_checks import logging diff --git a/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py b/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py similarity index 95% rename from changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py rename to changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py index 1ad8581f2f8..b2a5d630a19 100644 --- a/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py +++ b/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py @@ -1,11 +1,11 @@ - -#!/usr/bin/env python3 import os -import time from flask import url_for -from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client +from changedetectionio.tests.util import set_original_response, set_modified_response, set_more_modified_response, live_server_setup, \ + wait_for_all_checks, \ + set_longer_modified_response +import time # No semantic data just some text, we should be able to find the product price. 
def set_response(price="121.95"): From 7c914cd2660d92b5ab588878f980c53ea73345e5 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 20:24:35 +0200 Subject: [PATCH 08/12] dangit --- .github/workflows/test-stack-reusable-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index 1556560a1cd..52706e011a7 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -102,7 +102,7 @@ jobs: # PLAYWRIGHT/NODE-> CDP - name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai run: | - docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PLAYWRIGHT_DRIVER_URL=ws://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PRICE_SCRAPER_ML_ENDPOINT=ws://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py' - name: Playwright and SocketPuppetBrowser - Restock detection run: | From 2e82b17cacb509f08c3721cd6f9941cf0b1dc45b Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 21:51:33 +0200 Subject: [PATCH 09/12] fix path --- .../tests/ml_price_scraper/test_scrape_price_element.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py b/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py index b2a5d630a19..e2bc868e6af 100644 --- a/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py +++ b/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py @@ -106,7 +106,10 @@ def test_restock_itemprop_basic(client, live_server): set_response(price="123.99") + # because it needs to access itself from within the sockpuppetbrowser test_url = url_for('test_endpoint', _external=True) + test_url = test_url.replace('localhost.localdomain', 'cdio') + test_url = test_url.replace('localhost', 'cdio') client.post( url_for("form_quick_watch_add"), From 446622159c8f56c9a2b20c98108b0d74e2da5445 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 30 Aug 2024 16:06:04 +0200 Subject: [PATCH 10/12] WIP - adding more scrape data and some dev tweaks --- .../res/xpath_element_scraper.js | 67 +++++++++++++------ changedetectionio/flask_app.py | 6 +- changedetectionio/model/Watch.py | 2 +- .../processors/restock_diff/processor.py | 9 +++ 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index 17ac125ba54..c7a92378b46 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -15,6 +15,7 @@ try { console.log(e); } +const percentageNumerical = str => Math.round((str.match(/\d/g) || []).length / str.length * 100); // Include the getXpath script directly, easier than fetching function getxpath(e) { @@ -146,8 +147,10 @@ const 
visibleElementsArray = []; // Call collectVisibleElements with the starting parent element collectVisibleElements(document.body, visibleElementsArray); +// Append any custom selectors to the visibleElementsArray -visibleElementsArray.forEach(function (element) { + +function get_element_metadata(element) { bbox = element.getBoundingClientRect(); @@ -190,14 +193,21 @@ visibleElementsArray.forEach(function (element) { let label = "none" // A placeholder, the actual labels for training are done by hand for now - let text = element.textContent.trim().slice(0, 30).trim(); - while (/\n{2,}|\t{2,}/.test(text)) { - text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t') - } + // Check if the element was found and get its text , not including any child element + let text = Array.from(element.childNodes) + .filter(node => node.nodeType === Node.TEXT_NODE) + .map(node => node.textContent) + .join(''); + + // Remove any gaps in sequences of newlines and tabs inside the string + text = text.trim().replace(/[\s\t\n\r]{2,}/g, ' ').trim(); // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. // @todo could be instead of USD/AUD etc [A-Z]{2,3} ? + //const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ; const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ; + const hasDigit = /[0-9]/.test(text) ; + // Sizing of the actual text inside the element can be very different from the elements size const { textWidth, textHeight } = getTextWidthAndHeightinPx(element); @@ -211,8 +221,7 @@ visibleElementsArray.forEach(function (element) { // Assign default values if text is empty [red, green, blue] = [0, 0, 0]; } - - size_pos.push({ + return { xpath: xpath_result, width: Math.round(bbox['width']), height: Math.round(bbox['height']), @@ -223,18 +232,27 @@ visibleElementsArray.forEach(function (element) { // tagtype used by Browser Steps tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? 
element.type.toLowerCase() : '', isClickable: window.getComputedStyle(element).cursor === "pointer", - // Used by the keras trainer + // Used by the keras/pytorch trainer fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), + pcNumerical: text.length && percentageNumerical(text), + hasDigit: hasDigit, hasDigitCurrency: hasDigitCurrency, textWidth: textWidth, textHeight: textHeight, + textLength: text.length, t_r: red, t_g: green, t_b: blue, label: label, - }); + }; +} +visibleElementsArray.forEach(function (element) { + let metadata = get_element_metadata(element); + if(metadata) { + size_pos.push(metadata); + } }); @@ -243,7 +261,19 @@ visibleElementsArray.forEach(function (element) { if (include_filters.length) { let results; // Foreach filter, go and find it on the page and add it to the results so we can visualise it again + outerLoop: for (const f of include_filters) { + // Quick check so we dont end up with duplicates in the training data + for (let index = 0; index < size_pos.length; index++) { + let item = size_pos[index]; + if (item.xpath === f) { + item.highlight_as_custom_filter = true; + item.found_as_duplicate = true; + item.label = "price"; + continue outerLoop; + } + } + bbox = false; q = false; @@ -264,7 +294,6 @@ if (include_filters.length) { } } else { console.log("[css] Scanning for included filter " + f) - console.log("[css] Scanning for included filter " + f); results = document.querySelectorAll(f); } } catch (e) { @@ -301,17 +330,15 @@ if (include_filters.length) { console.log("xpath_element_scraper: error looking up q.ownerElement") } } - - if (bbox && bbox['width'] > 0 && bbox['height'] > 0) { - size_pos.push({ - xpath: f, - width: parseInt(bbox['width']), - height: parseInt(bbox['height']), - left: parseInt(bbox['left']), - top: parseInt(bbox['top']) + scroll_y, - highlight_as_custom_filter: true - }); + element_info = get_element_metadata(node); + if(element_info) { + // Be sure we use exactly what was written + element_info['xpath'] = f; + element_info['highlight_as_custom_filter'] = true; + element_info['label'] = "price"; + size_pos.push(element_info); } + }); } } diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index e6e7c6944e1..97ceee53d05 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -792,9 +792,9 @@ def edit_page(uuid): # Re #286 - We wait for syncing new data to disk in another thread every 60 seconds # But in the case something is added we should save straight away datastore.needs_write_urgent = True - - # Queue the watch for immediate recheck, with a higher priority - update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False})) + if not datastore.data['watching'][uuid].get('paused'): + # Queue the watch for immediate recheck, with a higher priority + update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False})) # Diff page [edit] link should go back to diff page if request.args.get("next") and request.args.get("next") == 'diff': diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index d3167bf901f..bde9a158b85 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -518,7 +518,7 @@ def save_xpath_data(self, data, as_error=False): self.ensure_data_dir_exists() with open(target_path, 'w') as f: - f.write(json.dumps(data)) + 
f.write(json.dumps(data, indent=2)) f.close() # Save as PNG, PNG is larger but better for doing visual diff in the future diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 52f4d11b08f..6b7ef5dbf19 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -37,6 +37,7 @@ def get_itemprop_availability(html_content) -> Restock: Kind of funny/cool way to find price/availability in one many different possibilities. Use 'extruct' to find any possible RDFa/microdata/json-ld data, make a JSON string from the output then search it. """ + from jsonpath_ng import parse now = time.time() @@ -54,6 +55,7 @@ def get_itemprop_availability(html_content) -> Restock: # First phase, dead simple scanning of anything that looks useful value = Restock() + return value if data: logger.debug(f"Using jsonpath to find price/availability/etc") price_parse = parse('$..(price|Price)') @@ -136,10 +138,15 @@ def ML_scrape_for_price_data(self, ML_price_scraper_url): logger.debug(f"ML Price scraper: response - {response_json}'") if isinstance(response_json, dict) and 'idx' in response_json.keys(): suggested_xpath_idx = response_json.get('idx') + if response_json.get('score') <0.80 or response_json.get('score') > 1.0: + logger.warning(f"Predict score was outside normal range, aborting ML/AI price check, needs better training data in this case?") + return None # Use the path provided to extra the price text from price_parser import Price scrape_element = self.fetcher.xpath_data.get('size_pos', {})[suggested_xpath_idx] + logger.debug(f"Predicted selector with price information is {scrape_element['xpath']}") + result_s = None if scrape_element['xpath'][0] == '/' or scrape_element['xpath'].startswith('xpath'): result_s = html_tools.xpath_filter(xpath_filter=scrape_element['xpath'], @@ -151,6 +158,7 @@ def ML_scrape_for_price_data(self, ML_price_scraper_url): if result_s: text = html_to_text(result_s) + logger.debug(f"Guessed the text '{text}' as the price information") if text: price_info = Price.fromstring(text) else: @@ -158,6 +166,7 @@ def ML_scrape_for_price_data(self, ML_price_scraper_url): else: print(f"ML Price scraper: Request failed with status code: {response.status_code}") +#@TODO THROW HELPFUL MESSAGE WITH LINK TO TUTORIAL IF IT CANT CONNECT! 
return price_info def run_changedetection(self, watch, skip_when_checksum_same=True): From b7984f266a90367727aa25e6f4803152cff9e2bc Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 30 Aug 2024 17:53:16 +0200 Subject: [PATCH 11/12] Needed new tag --- changedetectionio/content_fetchers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py index 91f133880af..7ccd0d3dd58 100644 --- a/changedetectionio/content_fetchers/__init__.py +++ b/changedetectionio/content_fetchers/__init__.py @@ -4,7 +4,7 @@ from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException import os -visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary' +visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi' # available_fetchers() will scan this implementation looking for anything starting with html_ # this information is used in the form selections From 392cc4586f60f78017b4b2c9851d8df28d0d8453 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 1 Sep 2024 12:24:39 +0200 Subject: [PATCH 12/12] extra selectors --- changedetectionio/content_fetchers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py index 7ccd0d3dd58..fbf2161ea9e 100644 --- a/changedetectionio/content_fetchers/__init__.py +++ b/changedetectionio/content_fetchers/__init__.py @@ -4,7 +4,7 @@ from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException import os -visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi' +visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi,strong' # available_fetchers() will scan this implementation looking for anything starting with html_ # this information is used in the form selections
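
Note: PRICE_SCRAPER_ML_ENDPOINT points at a separate service (the dgtlmoon/changedetection.io-ai image referenced in docker-compose.yml and the CI workflow) which is not part of this patch series. For reference, a minimal stub that satisfies the request/response contract used by ML_scrape_for_price_data() in restock_diff/processor.py could look like the sketch below: the processor POSTs fetcher.xpath_data as JSON and expects a dict with an 'idx' into size_pos and a 'score' in the 0.80-1.0 range. Flask, the '/price-element' route name and the hasDigitCurrency heuristic are assumptions for illustration only, not the real trained model.

# Illustrative stub only -- NOT the dgtlmoon/changedetection.io-ai model service.
# It mimics the contract ML_scrape_for_price_data() relies on: POST the scraped
# xpath_data JSON in, get back {'idx': <index into size_pos>, 'score': <float>}.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/price-element', methods=['POST'])
def price_element():
    xpath_data = request.get_json(force=True) or {}
    # Toy heuristic standing in for the trained model: pick the first element
    # the browser scraper flagged as containing digits plus a currency marker.
    for idx, element in enumerate(xpath_data.get('size_pos', [])):
        if element.get('hasDigitCurrency'):
            # processor.py discards any 'score' outside 0.80-1.0
            return jsonify({'idx': idx, 'score': 0.95})
    # No candidate: reply without 'idx' so the processor treats it as no price found
    return jsonify({})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5005)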