From 3656951259d829c5c27321423eb7d21a21dd03dc Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 22 Aug 2024 16:50:46 +0200 Subject: [PATCH 01/12] Initial support for scraper ML price extraction integration --- changedetectionio/flask_app.py | 9 +++ .../processors/restock_diff/processor.py | 57 +++++++++++++++++++ .../templates/watch-overview.html | 2 + docker-compose.yml | 11 ++++ requirements.txt | 2 +- 5 files changed, 80 insertions(+), 1 deletion(-) diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index fd12393ad08..e6e7c6944e1 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -1602,6 +1602,15 @@ def form_watch_list_checkbox_operations(): flash(f"{len(uuids)} watches were tagged") + elif op.startswith('mode:'): + mode = op.replace('mode:','') + for uuid in uuids: + uuid = uuid.strip() + if datastore.data['watching'].get(uuid): + datastore.data['watching'][uuid]['processor'] = mode + flash(f"{len(uuids)} watches changed modes") + + return redirect(url_for('index')) @app.route("/api/share-url", methods=['GET']) diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index b2184e35376..52f4d11b08f 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -3,10 +3,13 @@ from . import Restock from loguru import logger import hashlib +import os import re import urllib3 import time +from ...html_tools import html_to_text + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) name = 'Re-stock & Price detection for single product pages' description = 'Detects if the product goes back to in-stock' @@ -118,6 +121,45 @@ class perform_site_check(difference_detection_processor): screenshot = None xpath_data = None + def ML_scrape_for_price_data(self, ML_price_scraper_url): + import requests + from changedetectionio import html_tools + + price_info = None + + # Perform the POST request + response = requests.post(ML_price_scraper_url, json=self.fetcher.xpath_data) + logger.debug(f"ML Price scraper - {ML_price_scraper_url} Response OK? 
- '{response.ok}'") + # Check if the response contains a dict + if response.ok: # This checks if the request was successful (status code 200-299) + response_json = response.json() + logger.debug(f"ML Price scraper: response - {response_json}'") + if isinstance(response_json, dict) and 'idx' in response_json.keys(): + suggested_xpath_idx = response_json.get('idx') + + # Use the path provided to extra the price text + from price_parser import Price + scrape_element = self.fetcher.xpath_data.get('size_pos', {})[suggested_xpath_idx] + result_s = None + if scrape_element['xpath'][0] == '/' or scrape_element['xpath'].startswith('xpath'): + result_s = html_tools.xpath_filter(xpath_filter=scrape_element['xpath'], + html_content=self.fetcher.content) + else: + # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text + result_s = html_tools.include_filters(include_filters=scrape_element['xpath'], + html_content=self.fetcher.content) + + if result_s: + text = html_to_text(result_s) + if text: + price_info = Price.fromstring(text) + else: + logger.error(f"ML Price scraper: missing xpath index (IDX) in response?") + else: + print(f"ML Price scraper: Request failed with status code: {response.status_code}") + + return price_info + def run_changedetection(self, watch, skip_when_checksum_same=True): if not watch: raise Exception("Watch no longer exists.") @@ -174,6 +216,21 @@ def run_changedetection(self, watch, skip_when_checksum_same=True): else: update_obj['restock']['in_stock'] = False + # Attempt to pass the elements off to the machine-learning endpoint if its enabled + # This might return a confident guess as to which element contains the price data + if not itemprop_availability.get('price'): + ML_price_scraper_url = os.getenv("PRICE_SCRAPER_ML_ENDPOINT") + if self.fetcher.xpath_data and ML_price_scraper_url: + price_info = self.ML_scrape_for_price_data(ML_price_scraper_url) + if price_info and price_info.amount: + logger.success(f"ML Price scraper: Got price data {price_info}") + itemprop_availability['price'] = f"{price_info.amount}" + update_obj['restock']['price'] = f"{price_info.amount}" + if price_info and price_info.currency: + itemprop_availability['currency'] = price_info.currency + update_obj['restock']['currency'] = price_info.currency + + # Main detection method fetched_md5 = None diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 736e19da1ba..840d9508b24 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -37,6 +37,8 @@ + + diff --git a/docker-compose.yml b/docker-compose.yml index 2480a33994e..b3441ac389a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -57,6 +57,10 @@ services: # # Absolute minimum seconds to recheck, overrides any watch minimum, change to 0 to disable # - MINIMUM_SECONDS_RECHECK_TIME=3 + # + # Scrape prices from web pages automatically where the page has no embedded price information (see below also) + # - PRICE_SCRAPER_ML_ENDPOINT=http://cdio-price-scraper:5005 + # Comment out ports: when using behind a reverse proxy , enable networks: etc. 
ports: - 5000:5000 @@ -103,6 +107,13 @@ services: # # Workaround to avoid the browser crashing inside a docker container # # See https://github.com/SeleniumHQ/docker-selenium#quick-start # - /dev/shm:/dev/shm +# restart: unless-stopped + + # Machine Learning/AI - Use "Visual Selector" elements data to scrape price data + +# cdio-keras-price-scraper: +# hostname: cdio-price-scraper +# image: dgtlmoon/changedetection-AI-pricescraper # restart: unless-stopped volumes: diff --git a/requirements.txt b/requirements.txt index 537c3f802ba..6021253f6d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -82,7 +82,7 @@ pytest-flask ~=1.2 # Anything 4.0 and up but not 5.0 jsonschema ~= 4.0 - +price_parser loguru # For scraping all possible metadata relating to products so we can do better restock detection From c0e9846a8597e7bd60cbd4530c747bd6b04bc359 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 11:31:35 +0200 Subject: [PATCH 02/12] Adding more training data --- .../res/xpath_element_scraper.js | 59 ++++++++++++++++++- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index ccd89436973..f6bd12cdb58 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -77,7 +77,56 @@ const findUpTag = (el) => { } return null; } +// Text width scraper for ML training/detection +// Create a single canvas and get its 2D context +const canvas = document.createElement("canvas"); +const context = canvas.getContext("2d"); +// Function to get the width and height of the text inside an element and round them to the nearest integer +function getTextWidthAndHeightinPx(element) { + // Set the font to match the style of the text in the element + context.font = window.getComputedStyle(element).font; + + // Get the text inside the element + const text = element.textContent || element.innerText; + + // Measure the text width + const metrics = context.measureText(text); + const width = Math.round(metrics.width); + + // Get the font size from the computed style + const fontSize = parseFloat(window.getComputedStyle(element).fontSize); + const height = Math.round(fontSize); // Using font size as an approximation of height + + // Return both width and height as an object + return { textWidth: width, textHeight: height }; +} + + +// Function to determine which RGB value is the highest, or return 0 if they are all the same +function getDominantColorValue(element) { + // Get the computed style of the element to get the color property + const computedStyle = window.getComputedStyle(element); + const color = computedStyle.color; + + // Extract the RGB values from the color string (format: rgb(r, g, b)) + const rgbValues = color.match(/\d+/g).map(Number); + const [red, green, blue] = rgbValues; + + // Check if all values are the same + if (red === green && green === blue) { + return 0; // All RGB values are the same + } + + // Determine which value is the highest and return the corresponding number + if (red > green && red > blue) { + return 1; // Red is highest + } else if (green > red && green > blue) { + return 2; // Green is highest + } else { + return 3; // Blue is highest + } +} // @todo - if it's SVG or IMG, go into image diff mode // %ELEMENTS% replaced at injection time because different interfaces use it with different settings @@ -164,7 +213,7 @@ 
visibleElementsArray.forEach(function (element) { } } - let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now + let label = "none" // A placeholder, the actual labels for training are done by hand for now let text = element.textContent.trim().slice(0, 30).trim(); while (/\n{2,}|\t{2,}/.test(text)) { @@ -172,7 +221,10 @@ visibleElementsArray.forEach(function (element) { } // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. - const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ; + // @todo could be instead of USD/AUD etc [A-Z]{2,3} ? + const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ; + // Sizing of the actual text inside the element can be very different from the elements size + const { textWidth, textHeight } = getTextWidthAndHeightinPx(element); size_pos.push({ xpath: xpath_result, @@ -189,6 +241,9 @@ visibleElementsArray.forEach(function (element) { fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), hasDigitCurrency: hasDigitCurrency, + textColorClass: getDominantColorValue(element), + textWidth: textWidth, + textHeight: textHeight, label: label, }); From d7160d79bd7ef0e7dfe3f0350db7471745836364 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 12:25:26 +0200 Subject: [PATCH 03/12] Use integer value for ML of the r,g,b --- .../res/xpath_element_scraper.js | 40 +++++++------------ 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index f6bd12cdb58..17ac125ba54 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -103,31 +103,6 @@ function getTextWidthAndHeightinPx(element) { } -// Function to determine which RGB value is the highest, or return 0 if they are all the same -function getDominantColorValue(element) { - // Get the computed style of the element to get the color property - const computedStyle = window.getComputedStyle(element); - const color = computedStyle.color; - - // Extract the RGB values from the color string (format: rgb(r, g, b)) - const rgbValues = color.match(/\d+/g).map(Number); - const [red, green, blue] = rgbValues; - - // Check if all values are the same - if (red === green && green === blue) { - return 0; // All RGB values are the same - } - - // Determine which value is the highest and return the corresponding number - if (red > green && red > blue) { - return 1; // Red is highest - } else if (green > red && green > blue) { - return 2; // Green is highest - } else { - return 3; // Blue is highest - } -} - // @todo - if it's SVG or IMG, go into image diff mode // %ELEMENTS% replaced at injection time because different interfaces use it with different settings @@ -226,6 +201,17 @@ visibleElementsArray.forEach(function (element) { // Sizing of the actual text inside the element can be very different from the elements size const { textWidth, textHeight } = getTextWidthAndHeightinPx(element); + const computedStyle = window.getComputedStyle(element); + let red, green, blue; + + if (text.length) { + // Extract the RGB values from the color 
string (format: rgb(r, g, b)) + [red, green, blue] = computedStyle.color.match(/\d+/g).map(Number); + } else { + // Assign default values if text is empty + [red, green, blue] = [0, 0, 0]; + } + size_pos.push({ xpath: xpath_result, width: Math.round(bbox['width']), @@ -241,9 +227,11 @@ visibleElementsArray.forEach(function (element) { fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), hasDigitCurrency: hasDigitCurrency, - textColorClass: getDominantColorValue(element), textWidth: textWidth, textHeight: textHeight, + t_r: red, + t_g: green, + t_b: blue, label: label, }); From 106f258d13ca7263f4063e624f5739b1e6fc5b0f Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 14:27:12 +0200 Subject: [PATCH 04/12] add delay --- changedetectionio/tests/test_nonrenderable_pages.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/changedetectionio/tests/test_nonrenderable_pages.py b/changedetectionio/tests/test_nonrenderable_pages.py index 3757eb6edf4..ea408b8b9c7 100644 --- a/changedetectionio/tests/test_nonrenderable_pages.py +++ b/changedetectionio/tests/test_nonrenderable_pages.py @@ -2,6 +2,7 @@ from flask import url_for from .util import set_original_response, set_modified_response, live_server_setup, wait_for_all_checks +import time def set_nonrenderable_response(): test_return_data = """ @@ -90,6 +91,7 @@ def test_check_basic_change_detection_functionality(client, live_server, measure # A totally zero byte (#2528) response should also not trigger an error set_zero_byte_response() + time.sleep(2) client.get(url_for("form_watch_checknow"), follow_redirects=True) wait_for_all_checks(client) res = client.get(url_for("index")) From bf5b1143e33b091f1a9eb7069be2245462730141 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 18:31:54 +0200 Subject: [PATCH 05/12] Adding test --- .../test-stack-reusable-workflow.yml | 9 ++ .../test_scrape_price_element.py | 124 ++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index f2864680cef..4e35995f2e3 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -52,6 +52,10 @@ jobs: docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest + # CDIO AI Element scraper for prices + # Run CDIO with PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element + docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/dgtlmoon/changedetection.io-ai:latest + - name: Spin up ancillary SMTP+Echo message test server run: | # Debug SMTP server/echo message back server @@ -95,6 +99,11 @@ jobs: # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network 
changedet-network test-changedetectionio bash -c 'find .; cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py; pwd;find .' +# PLAYWRIGHT/NODE-> CDP + - name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai + run: | + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PLAYWRIGHT_DRIVER_URL=ws://PRICE_SCRAPER_ML_ENDPOINT:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ai-price-scraper/test_scrape_price_element.py' + - name: Playwright and SocketPuppetBrowser - Restock detection run: | # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it diff --git a/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py b/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py new file mode 100644 index 00000000000..1ad8581f2f8 --- /dev/null +++ b/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py @@ -0,0 +1,124 @@ + +#!/usr/bin/env python3 +import os +import time + +from flask import url_for +from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client + + +# No semantic data just some text, we should be able to find the product price. +def set_response(price="121.95"): + html_content = f""" + + + + + + Ajax Widget + + + +
+        <div>
+            <h1>Ajax Widget</h1>
+            <p>The Ajax Widget is the ultimate solution for all your widget needs. Crafted with precision and using the latest technology, this widget offers unmatched performance and durability. Whether you're using it for personal or professional purposes, the Ajax Widget will not disappoint. It's easy to use, reliable, and comes with a sleek design that complements any setup. Don't settle for less; get the best with the Ajax Widget today!</p>
+            <div>
+                <span>${price}</span>
+                <button>Buy Now</button>
+                <span>IN STOCK</span>
+            </div>
+        </div>
+ + + """ + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(html_content) + time.sleep(1) + return None + + + + +def test_restock_itemprop_basic(client, live_server): + + # needs to be set and something like 'ws://127.0.0.1:3000' + assert os.getenv('PLAYWRIGHT_DRIVER_URL'), "Needs PLAYWRIGHT_DRIVER_URL set for this test" + assert os.getenv('PRICE_SCRAPER_ML_ENDPOINT'), "Needs PRICE_SCRAPER_ML_ENDPOINT set for this test" + + + live_server_setup(live_server) + + set_response(price="123.99") + + test_url = url_for('test_endpoint', _external=True) + + client.post( + url_for("form_quick_watch_add"), + data={"url": test_url, "tags": 'restock tests', 'processor': 'restock_diff'}, + follow_redirects=True + ) + wait_for_all_checks(client) + res = client.get(url_for("index")) + + assert b'123.99' in res.data + assert b' in-stock' in res.data + assert b' not-in-stock' not in res.data + + res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) + assert b'Deleted' in res.data From 72579d8ea2de36e071c0effe7cf70a3dedf838fa Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 18:38:51 +0200 Subject: [PATCH 06/12] woops --- .github/workflows/test-stack-reusable-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index 4e35995f2e3..9e2b337c3f8 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -54,7 +54,7 @@ jobs: # CDIO AI Element scraper for prices # Run CDIO with PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element - docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/dgtlmoon/changedetection.io-ai:latest + docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/changedetection.io-ai:latest - name: Spin up ancillary SMTP+Echo message test server run: | From 5fa841637e034c558580305f6576997f2f88f549 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 19:53:14 +0200 Subject: [PATCH 07/12] fix imports, paths --- .github/workflows/test-stack-reusable-workflow.yml | 2 +- changedetectionio/tests/conftest.py | 2 +- changedetectionio/tests/fetchers/test_content.py | 1 - .../test_scrape_price_element.py | 8 ++++---- 4 files changed, 6 insertions(+), 7 deletions(-) rename changedetectionio/tests/{ai-price-scraper => ml_price_scraper}/test_scrape_price_element.py (95%) diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index 9e2b337c3f8..1556560a1cd 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -102,7 +102,7 @@ jobs: # PLAYWRIGHT/NODE-> CDP - name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai run: | - docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PLAYWRIGHT_DRIVER_URL=ws://PRICE_SCRAPER_ML_ENDPOINT:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ai-price-scraper/test_scrape_price_element.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e 
"PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PLAYWRIGHT_DRIVER_URL=ws://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py' - name: Playwright and SocketPuppetBrowser - Restock detection run: | diff --git a/changedetectionio/tests/conftest.py b/changedetectionio/tests/conftest.py index 50f7104b145..1a0cb804193 100644 --- a/changedetectionio/tests/conftest.py +++ b/changedetectionio/tests/conftest.py @@ -8,7 +8,7 @@ from changedetectionio import store import os import sys -from loguru import logger + # https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py # Much better boilerplate than the docs diff --git a/changedetectionio/tests/fetchers/test_content.py b/changedetectionio/tests/fetchers/test_content.py index 8d468cd4b52..569703caf15 100644 --- a/changedetectionio/tests/fetchers/test_content.py +++ b/changedetectionio/tests/fetchers/test_content.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import time from flask import url_for from ..util import live_server_setup, wait_for_all_checks import logging diff --git a/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py b/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py similarity index 95% rename from changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py rename to changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py index 1ad8581f2f8..b2a5d630a19 100644 --- a/changedetectionio/tests/ai-price-scraper/test_scrape_price_element.py +++ b/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py @@ -1,11 +1,11 @@ - -#!/usr/bin/env python3 import os -import time from flask import url_for -from ..util import live_server_setup, wait_for_all_checks, extract_UUID_from_client +from changedetectionio.tests.util import set_original_response, set_modified_response, set_more_modified_response, live_server_setup, \ + wait_for_all_checks, \ + set_longer_modified_response +import time # No semantic data just some text, we should be able to find the product price. 
def set_response(price="121.95"): From 7c914cd2660d92b5ab588878f980c53ea73345e5 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 20:24:35 +0200 Subject: [PATCH 08/12] dangit --- .github/workflows/test-stack-reusable-workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml index 1556560a1cd..52706e011a7 100644 --- a/.github/workflows/test-stack-reusable-workflow.yml +++ b/.github/workflows/test-stack-reusable-workflow.yml @@ -102,7 +102,7 @@ jobs: # PLAYWRIGHT/NODE-> CDP - name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai run: | - docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PLAYWRIGHT_DRIVER_URL=ws://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py' + docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PRICE_SCRAPER_ML_ENDPOINT=ws://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py' - name: Playwright and SocketPuppetBrowser - Restock detection run: | From 2e82b17cacb509f08c3721cd6f9941cf0b1dc45b Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 23 Aug 2024 21:51:33 +0200 Subject: [PATCH 09/12] fix path --- .../tests/ml_price_scraper/test_scrape_price_element.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py b/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py index b2a5d630a19..e2bc868e6af 100644 --- a/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py +++ b/changedetectionio/tests/ml_price_scraper/test_scrape_price_element.py @@ -106,7 +106,10 @@ def test_restock_itemprop_basic(client, live_server): set_response(price="123.99") + # because it needs to access itself from within the sockpuppetbrowser test_url = url_for('test_endpoint', _external=True) + test_url = test_url.replace('localhost.localdomain', 'cdio') + test_url = test_url.replace('localhost', 'cdio') client.post( url_for("form_quick_watch_add"), From 446622159c8f56c9a2b20c98108b0d74e2da5445 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 30 Aug 2024 16:06:04 +0200 Subject: [PATCH 10/12] WIP - adding more scrape data and some dev tweaks --- .../res/xpath_element_scraper.js | 67 +++++++++++++------ changedetectionio/flask_app.py | 6 +- changedetectionio/model/Watch.py | 2 +- .../processors/restock_diff/processor.py | 9 +++ 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js index 17ac125ba54..c7a92378b46 100644 --- a/changedetectionio/content_fetchers/res/xpath_element_scraper.js +++ b/changedetectionio/content_fetchers/res/xpath_element_scraper.js @@ -15,6 +15,7 @@ try { console.log(e); } +const percentageNumerical = str => Math.round((str.match(/\d/g) || []).length / str.length * 100); // Include the getXpath script directly, easier than fetching function getxpath(e) { @@ -146,8 +147,10 @@ const 
visibleElementsArray = []; // Call collectVisibleElements with the starting parent element collectVisibleElements(document.body, visibleElementsArray); +// Append any custom selectors to the visibleElementsArray -visibleElementsArray.forEach(function (element) { + +function get_element_metadata(element) { bbox = element.getBoundingClientRect(); @@ -190,14 +193,21 @@ visibleElementsArray.forEach(function (element) { let label = "none" // A placeholder, the actual labels for training are done by hand for now - let text = element.textContent.trim().slice(0, 30).trim(); - while (/\n{2,}|\t{2,}/.test(text)) { - text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t') - } + // Check if the element was found and get its text , not including any child element + let text = Array.from(element.childNodes) + .filter(node => node.nodeType === Node.TEXT_NODE) + .map(node => node.textContent) + .join(''); + + // Remove any gaps in sequences of newlines and tabs inside the string + text = text.trim().replace(/[\s\t\n\r]{2,}/g, ' ').trim(); // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training. // @todo could be instead of USD/AUD etc [A-Z]{2,3} ? + //const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ; const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ; + const hasDigit = /[0-9]/.test(text) ; + // Sizing of the actual text inside the element can be very different from the elements size const { textWidth, textHeight } = getTextWidthAndHeightinPx(element); @@ -211,8 +221,7 @@ visibleElementsArray.forEach(function (element) { // Assign default values if text is empty [red, green, blue] = [0, 0, 0]; } - - size_pos.push({ + return { xpath: xpath_result, width: Math.round(bbox['width']), height: Math.round(bbox['height']), @@ -223,18 +232,27 @@ visibleElementsArray.forEach(function (element) { // tagtype used by Browser Steps tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? 
element.type.toLowerCase() : '', isClickable: window.getComputedStyle(element).cursor === "pointer", - // Used by the keras trainer + // Used by the keras/pytorch trainer fontSize: window.getComputedStyle(element).getPropertyValue('font-size'), fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'), + pcNumerical: text.length && percentageNumerical(text), + hasDigit: hasDigit, hasDigitCurrency: hasDigitCurrency, textWidth: textWidth, textHeight: textHeight, + textLength: text.length, t_r: red, t_g: green, t_b: blue, label: label, - }); + }; +} +visibleElementsArray.forEach(function (element) { + let metadata = get_element_metadata(element); + if(metadata) { + size_pos.push(metadata); + } }); @@ -243,7 +261,19 @@ visibleElementsArray.forEach(function (element) { if (include_filters.length) { let results; // Foreach filter, go and find it on the page and add it to the results so we can visualise it again + outerLoop: for (const f of include_filters) { + // Quick check so we dont end up with duplicates in the training data + for (let index = 0; index < size_pos.length; index++) { + let item = size_pos[index]; + if (item.xpath === f) { + item.highlight_as_custom_filter = true; + item.found_as_duplicate = true; + item.label = "price"; + continue outerLoop; + } + } + bbox = false; q = false; @@ -264,7 +294,6 @@ if (include_filters.length) { } } else { console.log("[css] Scanning for included filter " + f) - console.log("[css] Scanning for included filter " + f); results = document.querySelectorAll(f); } } catch (e) { @@ -301,17 +330,15 @@ if (include_filters.length) { console.log("xpath_element_scraper: error looking up q.ownerElement") } } - - if (bbox && bbox['width'] > 0 && bbox['height'] > 0) { - size_pos.push({ - xpath: f, - width: parseInt(bbox['width']), - height: parseInt(bbox['height']), - left: parseInt(bbox['left']), - top: parseInt(bbox['top']) + scroll_y, - highlight_as_custom_filter: true - }); + element_info = get_element_metadata(node); + if(element_info) { + // Be sure we use exactly what was written + element_info['xpath'] = f; + element_info['highlight_as_custom_filter'] = true; + element_info['label'] = "price"; + size_pos.push(element_info); } + }); } } diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py index e6e7c6944e1..97ceee53d05 100644 --- a/changedetectionio/flask_app.py +++ b/changedetectionio/flask_app.py @@ -792,9 +792,9 @@ def edit_page(uuid): # Re #286 - We wait for syncing new data to disk in another thread every 60 seconds # But in the case something is added we should save straight away datastore.needs_write_urgent = True - - # Queue the watch for immediate recheck, with a higher priority - update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False})) + if not datastore.data['watching'][uuid].get('paused'): + # Queue the watch for immediate recheck, with a higher priority + update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False})) # Diff page [edit] link should go back to diff page if request.args.get("next") and request.args.get("next") == 'diff': diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py index d3167bf901f..bde9a158b85 100644 --- a/changedetectionio/model/Watch.py +++ b/changedetectionio/model/Watch.py @@ -518,7 +518,7 @@ def save_xpath_data(self, data, as_error=False): self.ensure_data_dir_exists() with open(target_path, 'w') as f: - f.write(json.dumps(data)) + 
f.write(json.dumps(data, indent=2)) f.close() # Save as PNG, PNG is larger but better for doing visual diff in the future diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py index 52f4d11b08f..6b7ef5dbf19 100644 --- a/changedetectionio/processors/restock_diff/processor.py +++ b/changedetectionio/processors/restock_diff/processor.py @@ -37,6 +37,7 @@ def get_itemprop_availability(html_content) -> Restock: Kind of funny/cool way to find price/availability in one many different possibilities. Use 'extruct' to find any possible RDFa/microdata/json-ld data, make a JSON string from the output then search it. """ + from jsonpath_ng import parse now = time.time() @@ -54,6 +55,7 @@ def get_itemprop_availability(html_content) -> Restock: # First phase, dead simple scanning of anything that looks useful value = Restock() + return value if data: logger.debug(f"Using jsonpath to find price/availability/etc") price_parse = parse('$..(price|Price)') @@ -136,10 +138,15 @@ def ML_scrape_for_price_data(self, ML_price_scraper_url): logger.debug(f"ML Price scraper: response - {response_json}'") if isinstance(response_json, dict) and 'idx' in response_json.keys(): suggested_xpath_idx = response_json.get('idx') + if response_json.get('score') <0.80 or response_json.get('score') > 1.0: + logger.warning(f"Predict score was outside normal range, aborting ML/AI price check, needs better training data in this case?") + return None # Use the path provided to extra the price text from price_parser import Price scrape_element = self.fetcher.xpath_data.get('size_pos', {})[suggested_xpath_idx] + logger.debug(f"Predicted selector with price information is {scrape_element['xpath']}") + result_s = None if scrape_element['xpath'][0] == '/' or scrape_element['xpath'].startswith('xpath'): result_s = html_tools.xpath_filter(xpath_filter=scrape_element['xpath'], @@ -151,6 +158,7 @@ def ML_scrape_for_price_data(self, ML_price_scraper_url): if result_s: text = html_to_text(result_s) + logger.debug(f"Guessed the text '{text}' as the price information") if text: price_info = Price.fromstring(text) else: @@ -158,6 +166,7 @@ def ML_scrape_for_price_data(self, ML_price_scraper_url): else: print(f"ML Price scraper: Request failed with status code: {response.status_code}") +#@TODO THROW HELPFUL MESSAGE WITH LINK TO TUTORIAL IF IT CANT CONNECT! 
return price_info def run_changedetection(self, watch, skip_when_checksum_same=True): From b7984f266a90367727aa25e6f4803152cff9e2bc Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Fri, 30 Aug 2024 17:53:16 +0200 Subject: [PATCH 11/12] Needed new tag --- changedetectionio/content_fetchers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py index 91f133880af..7ccd0d3dd58 100644 --- a/changedetectionio/content_fetchers/__init__.py +++ b/changedetectionio/content_fetchers/__init__.py @@ -4,7 +4,7 @@ from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException import os -visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary' +visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi' # available_fetchers() will scan this implementation looking for anything starting with html_ # this information is used in the form selections From 392cc4586f60f78017b4b2c9851d8df28d0d8453 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Sun, 1 Sep 2024 12:24:39 +0200 Subject: [PATCH 12/12] extra selectors --- changedetectionio/content_fetchers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py index 7ccd0d3dd58..fbf2161ea9e 100644 --- a/changedetectionio/content_fetchers/__init__.py +++ b/changedetectionio/content_fetchers/__init__.py @@ -4,7 +4,7 @@ from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException import os -visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi' +visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi,strong' # available_fetchers() will scan this implementation looking for anything starting with html_ # this information is used in the form selections
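
Note: PRICE_SCRAPER_ML_ENDPOINT points at a separate service (the dgtlmoon/changedetection.io-ai image referenced in docker-compose.yml and the CI workflow) which is not part of this patch series. For reference, a minimal stub that satisfies the request/response contract used by ML_scrape_for_price_data() in restock_diff/processor.py could look like the sketch below: the processor POSTs fetcher.xpath_data as JSON and expects a dict with an 'idx' into size_pos and a 'score' in the 0.80-1.0 range. Flask, the '/price-element' route name and the hasDigitCurrency heuristic are assumptions for illustration only, not the real trained model.

# Illustrative stub only -- NOT the dgtlmoon/changedetection.io-ai model service.
# It mimics the contract ML_scrape_for_price_data() relies on: POST the scraped
# xpath_data JSON in, get back {'idx': <index into size_pos>, 'score': <float>}.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/price-element', methods=['POST'])
def price_element():
    xpath_data = request.get_json(force=True) or {}
    # Toy heuristic standing in for the trained model: pick the first element
    # the browser scraper flagged as containing digits plus a currency marker.
    for idx, element in enumerate(xpath_data.get('size_pos', [])):
        if element.get('hasDigitCurrency'):
            # processor.py discards any 'score' outside 0.80-1.0
            return jsonify({'idx': idx, 'score': 0.95})
    # No candidate: reply without 'idx' so the processor treats it as no price found
    return jsonify({})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5005)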