dgtlmoon · dgtlmoon · Aug 22, 2024 · Aug 23, 2024 · Aug 23, 2024 · Aug 23, 2024
diff --git a/.github/workflows/test-stack-reusable-workflow.yml b/.github/workflows/test-stack-reusable-workflow.yml
@@ -52,6 +52,10 @@ jobs:
           docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest                    
           docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url  -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest
 
+          # CDIO AI Element scraper for prices
+          # Run CDIO with PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element
+          docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/changedetection.io-ai:latest
+
       - name: Spin up ancillary SMTP+Echo message test server
         run: |
           # Debug SMTP server/echo message back server
@@ -95,6 +99,11 @@ jobs:
           # Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers
           docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio  bash -c 'find .; cd changedetectionio; pytest --live-server-host=0.0.0.0  --live-server-port=5004 tests/test_request.py; pwd;find .'
 
+# PLAYWRIGHT/NODE-> CDP
+      - name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai
+        run: |
+          # PRICE_SCRAPER_ML_ENDPOINT is POSTed to with python-requests, so it has to be a http:// URL (ws:// is only for the browser driver)
+          docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio  bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py'
+
       - name: Playwright and SocketPuppetBrowser - Restock detection
         run: |                            
           # restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it

diff --git a/changedetectionio/content_fetchers/__init__.py b/changedetectionio/content_fetchers/__init__.py
@@ -4,7 +4,7 @@
 from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
 import os
 
-visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary'
+visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi,strong'
 
 # available_fetchers() will scan this implementation looking for anything starting with html_
 # this information is used in the form selections

diff --git a/changedetectionio/content_fetchers/res/xpath_element_scraper.js b/changedetectionio/content_fetchers/res/xpath_element_scraper.js
@@ -15,6 +15,7 @@ try {
     console.log(e);
 }
 
+// Percentage (0-100, rounded) of the characters in `str` that are digits; an empty string gives 0 instead of NaN (0/0)
+const percentageNumerical = str => str.length ? Math.round((str.match(/\d/g) || []).length / str.length * 100) : 0;
 
 // Include the getXpath script directly, easier than fetching
 function getxpath(e) {
@@ -77,6 +78,30 @@ const findUpTag = (el) => {
     }
     return null;
 }
+// Text width scraper for ML training/detection
+// Create a single canvas and get its 2D context
+const canvas = document.createElement("canvas");
+const context = canvas.getContext("2d");
+
+/**
+ * Estimate the rendered size of the text inside an element, using the shared off-screen canvas `context`.
+ *
+ * @param {Element} element - element whose text should be measured
+ * @returns {{textWidth: number, textHeight: number}} width as measured by the canvas and height approximated
+ *          by the computed font size, both rounded to the nearest integer pixel
+ */
+function getTextWidthAndHeightinPx(element) {
+    // Resolve the computed style once and reuse it for both the font and the font size
+    const style = window.getComputedStyle(element);
+
+    // The `font` shorthand can come back empty; assigning "" to context.font is ignored, which would silently
+    // keep the font of the previously measured element, so rebuild the shorthand from its longhand parts
+    context.font = style.font || `${style.fontStyle} ${style.fontWeight} ${style.fontSize} ${style.fontFamily}`;
+
+    // Fall back to '' so a missing innerText is never measured as the literal string "undefined"
+    const text = element.textContent || element.innerText || '';
+
+    // Measure the text width
+    const width = Math.round(context.measureText(text).width);
+
+    // Using font size as an approximation of height, 0 when it cannot be parsed
+    const height = Math.round(parseFloat(style.fontSize)) || 0;
+
+    // Return both width and height as an object
+    return { textWidth: width, textHeight: height };
+}
 
 
 // @todo - if it's SVG or IMG, go into image diff mode
@@ -122,8 +147,10 @@ const visibleElementsArray = [];
 // Call collectVisibleElements with the starting parent element
 collectVisibleElements(document.body, visibleElementsArray);
 
+// Append any custom selectors to the visibleElementsArray
 
-visibleElementsArray.forEach(function (element) {
+
+function get_element_metadata(element) {
 
     bbox = element.getBoundingClientRect();
 
@@ -164,17 +191,37 @@ visibleElementsArray.forEach(function (element) {
         }
     }
 
-    let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
+    let label = "none" // A placeholder, the actual labels for training are done by hand for now
 
-    let text = element.textContent.trim().slice(0, 30).trim();
-    while (/\n{2,}|\t{2,}/.test(text)) {
-        text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
-    }
+    // Get the element's own text (its direct text nodes only), not including the text of any child elements
+    let text = Array.from(element.childNodes)
+        .filter(node => node.nodeType === Node.TEXT_NODE)
+        .map(node => node.textContent)
+        .join('');
 
-    // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
-    const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) &&  /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
+    // Remove any gaps in sequences of newlines and tabs inside the string
+    text = text.trim().replace(/[\s\t\n\r]{2,}/g, ' ').trim();
 
-    size_pos.push({
+    // Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
+    // @todo could be instead of USD/AUD etc [A-Z]{2,3} ?
+    //const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) &&  /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ;
+    const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) &&  /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ;
+    const hasDigit = /[0-9]/.test(text) ;
+
+    // Sizing of the actual text inside the element can be very different from the elements size
+    const { textWidth, textHeight } = getTextWidthAndHeightinPx(element);
+
+    const computedStyle = window.getComputedStyle(element);
+    let red, green, blue;
+
+    if (text.length) {
+        // Extract the RGB values from the color string (format: rgb(r, g, b))
+        [red, green, blue] = computedStyle.color.match(/\d+/g).map(Number);
+    } else {
+        // Assign default values if text is empty
+        [red, green, blue] = [0, 0, 0];
+    }
+    return {
         xpath: xpath_result,
         width: Math.round(bbox['width']),
         height: Math.round(bbox['height']),
@@ -185,13 +232,27 @@ visibleElementsArray.forEach(function (element) {
         // tagtype used by Browser Steps
         tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
         isClickable: window.getComputedStyle(element).cursor === "pointer",
-        // Used by the keras trainer
+        // Used by the keras/pytorch trainer
         fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
         fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
+        pcNumerical: text.length && percentageNumerical(text),
+        hasDigit: hasDigit,
         hasDigitCurrency: hasDigitCurrency,
+        textWidth: textWidth,
+        textHeight: textHeight,
+        textLength: text.length,
+        t_r: red,
+        t_g: green,
+        t_b: blue,
         label: label,
-    });
+    };
+}
 
+visibleElementsArray.forEach(function (element) {
+    let metadata = get_element_metadata(element);
+    if(metadata) {
+        size_pos.push(metadata);
+    }
 });
 
 
@@ -200,7 +261,19 @@ visibleElementsArray.forEach(function (element) {
 if (include_filters.length) {
     let results;
     // Foreach filter, go and find it on the page and add it to the results so we can visualise it again
+    outerLoop:
     for (const f of include_filters) {
+        // Quick check so we dont end up with duplicates in the training data
+        for (let index = 0; index < size_pos.length; index++) {
+            let item = size_pos[index];
+            if (item.xpath === f) {
+                item.highlight_as_custom_filter = true;
+                item.found_as_duplicate = true;
+                item.label = "price";
+                continue outerLoop;
+            }
+        }
+
         bbox = false;
         q = false;
 
@@ -221,7 +294,6 @@ if (include_filters.length) {
                 }
             } else {
                 console.log("[css] Scanning for included filter " + f)
-                console.log("[css] Scanning for included filter " + f);
                 results = document.querySelectorAll(f);
             }
         } catch (e) {
@@ -258,17 +330,15 @@ if (include_filters.length) {
                         console.log("xpath_element_scraper: error looking up q.ownerElement")
                     }
                 }
-
-                if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
-                    size_pos.push({
-                        xpath: f,
-                        width: parseInt(bbox['width']),
-                        height: parseInt(bbox['height']),
-                        left: parseInt(bbox['left']),
-                        top: parseInt(bbox['top']) + scroll_y,
-                        highlight_as_custom_filter: true
-                    });
+                element_info = get_element_metadata(node);
+                if(element_info) {
+                    // Be sure we use exactly what was written
+                    element_info['xpath'] = f;
+                    element_info['highlight_as_custom_filter'] = true;
+                    element_info['label'] = "price";
+                    size_pos.push(element_info);
                 }
+
             });
         }
     }

diff --git a/changedetectionio/flask_app.py b/changedetectionio/flask_app.py
@@ -792,9 +792,9 @@ def edit_page(uuid):
             # Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
             # But in the case something is added we should save straight away
             datastore.needs_write_urgent = True
-
-            # Queue the watch for immediate recheck, with a higher priority
-            update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
+            if not datastore.data['watching'][uuid].get('paused'):
+                # Queue the watch for immediate recheck, with a higher priority
+                update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
 
             # Diff page [edit] link should go back to diff page
             if request.args.get("next") and request.args.get("next") == 'diff':
@@ -1602,6 +1602,15 @@ def form_watch_list_checkbox_operations():
 
             flash(f"{len(uuids)} watches were tagged")
 
+        elif op.startswith('mode:'):
+            mode = op.replace('mode:','')
+            for uuid in uuids:
+                uuid = uuid.strip()
+                if datastore.data['watching'].get(uuid):
+                    datastore.data['watching'][uuid]['processor'] = mode
+            flash(f"{len(uuids)} watches changed modes")
+
+
         return redirect(url_for('index'))
 
     @app.route("/api/share-url", methods=['GET'])

diff --git a/changedetectionio/model/Watch.py b/changedetectionio/model/Watch.py
@@ -518,7 +518,7 @@ def save_xpath_data(self, data, as_error=False):
         self.ensure_data_dir_exists()
 
         with open(target_path, 'w') as f:
-            f.write(json.dumps(data))
+            f.write(json.dumps(data, indent=2))
             f.close()
 
     # Save as PNG, PNG is larger but better for doing visual diff in the future

diff --git a/changedetectionio/processors/restock_diff/processor.py b/changedetectionio/processors/restock_diff/processor.py
@@ -3,10 +3,13 @@
 from . import Restock
 from loguru import logger
 import hashlib
+import os
 import re
 import urllib3
 import time
 
+from ...html_tools import html_to_text
+
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 name = 'Re-stock & Price detection for single product pages'
 description = 'Detects if the product goes back to in-stock'
@@ -34,6 +37,7 @@ def get_itemprop_availability(html_content) -> Restock:
     Kind of funny/cool way to find price/availability in one many different possibilities.
     Use 'extruct' to find any possible RDFa/microdata/json-ld data, make a JSON string from the output then search it.
     """
+
     from jsonpath_ng import parse
 
     now = time.time()
@@ -54,6 +58,7 @@ def get_itemprop_availability(html_content) -> Restock:
 
     # First phase, dead simple scanning of anything that looks useful
     value = Restock()
+    # No early return here - it would make the `if data:` jsonpath price/availability scan below unreachable
     if data:
         logger.debug(f"Using jsonpath to find price/availability/etc")
         price_parse = parse('$..(price|Price)')
@@ -121,6 +126,52 @@ class perform_site_check(difference_detection_processor):
     screenshot = None
     xpath_data = None
 
+    def ML_scrape_for_price_data(self, ML_price_scraper_url, timeout=30):
+        """
+        Ask the ML/AI price-element endpoint which scraped element most likely contains the price.
+
+        POSTs `self.fetcher.xpath_data` as JSON to `ML_price_scraper_url`. The reply is expected to be a dict
+        with 'idx' (index into xpath_data['size_pos']) and 'score'. The selector stored for that element is
+        applied to `self.fetcher.content` and the extracted text is parsed with price_parser.
+
+        :param ML_price_scraper_url: http(s) URL of the price-element endpoint
+        :param timeout: seconds to wait for the endpoint before giving up
+        :return: a price_parser Price object, or None when there was no usable/confident prediction
+        :raises requests.exceptions.RequestException: endpoint could not be reached (logged, then re-raised)
+        """
+        import requests
+        from changedetectionio import html_tools
+
+        # Perform the POST request, the timeout stops an unresponsive endpoint from blocking this check forever
+        try:
+            response = requests.post(ML_price_scraper_url, json=self.fetcher.xpath_data, timeout=timeout)
+        except requests.exceptions.RequestException as e:
+            # @TODO add a link to the setup tutorial to this message
+            logger.error(f"ML Price scraper: could not reach '{ML_price_scraper_url}', check PRICE_SCRAPER_ML_ENDPOINT - {str(e)}")
+            raise
+
+        logger.debug(f"ML Price scraper - {ML_price_scraper_url} Response OK? - '{response.ok}'")
+        if not response.ok:  # Anything outside of status code 200-299
+            logger.error(f"ML Price scraper: Request failed with status code: {response.status_code}")
+            return None
+
+        try:
+            response_json = response.json()
+        except ValueError as e:
+            logger.error(f"ML Price scraper: response was not valid JSON - {str(e)}")
+            return None
+
+        logger.debug(f"ML Price scraper: response - {response_json}'")
+        if not isinstance(response_json, dict) or 'idx' not in response_json:
+            logger.error(f"ML Price scraper: missing xpath index (IDX) in response?")
+            return None
+
+        # A missing or non-numeric score is handled the same way as an out-of-range one
+        score = response_json.get('score')
+        if not isinstance(score, (int, float)) or score < 0.80 or score > 1.0:
+            logger.warning(f"Predict score was outside normal range, aborting ML/AI price check, needs better training data in this case?")
+            return None
+
+        # The index must point at one of the elements that was sent to the endpoint
+        size_pos = self.fetcher.xpath_data.get('size_pos') or []
+        suggested_xpath_idx = response_json.get('idx')
+        if not isinstance(suggested_xpath_idx, int) or not 0 <= suggested_xpath_idx < len(size_pos):
+            logger.error(f"ML Price scraper: returned index '{suggested_xpath_idx}' is not a valid element index")
+            return None
+
+        # Use the path provided to extract the price text
+        from price_parser import Price
+        scrape_element = size_pos[suggested_xpath_idx]
+        selector = scrape_element['xpath']
+        logger.debug(f"Predicted selector with price information is {selector}")
+
+        if selector.startswith('/') or selector.startswith('xpath'):
+            result_s = html_tools.xpath_filter(xpath_filter=selector,
+                                               html_content=self.fetcher.content)
+        else:
+            # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
+            result_s = html_tools.include_filters(include_filters=selector,
+                                                  html_content=self.fetcher.content)
+
+        if not result_s:
+            return None
+
+        text = html_to_text(result_s)
+        logger.debug(f"Guessed the text '{text}' as the price information")
+        return Price.fromstring(text) if text else None
+
     def run_changedetection(self, watch, skip_when_checksum_same=True):
         if not watch:
             raise Exception("Watch no longer exists.")
@@ -177,6 +228,21 @@ def run_changedetection(self, watch, skip_when_checksum_same=True):
                 else:
                     update_obj['restock']['in_stock'] = False
 
+        # Attempt to pass the elements off to the machine-learning endpoint if its enabled
+        # This might return a confident guess as to which element contains the price data
+        if not itemprop_availability.get('price'):
+            ML_price_scraper_url = os.getenv("PRICE_SCRAPER_ML_ENDPOINT")
+            if self.fetcher.xpath_data and ML_price_scraper_url:
+                price_info = self.ML_scrape_for_price_data(ML_price_scraper_url)
+                if price_info and price_info.amount:
+                    logger.success(f"ML Price scraper: Got price data {price_info}")
+                    itemprop_availability['price'] = f"{price_info.amount}"
+                    update_obj['restock']['price'] = f"{price_info.amount}"
+                if price_info and price_info.currency:
+                    itemprop_availability['currency'] = price_info.currency
+                    update_obj['restock']['currency'] = price_info.currency
+
+
         # Main detection method
         fetched_md5 = None
 

diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html
@@ -37,6 +37,8 @@
         <button class="pure-button button-secondary button-xsmall" name="op" value="assign-tag" id="checkbox-assign-tag">Tag</button>
         <button class="pure-button button-secondary button-xsmall" name="op" value="mark-viewed">Mark viewed</button>
         <button class="pure-button button-secondary button-xsmall" name="op" value="notification-default">Use default notification</button>
+        <button class="pure-button button-secondary button-xsmall" name="op" value="mode:text_json_diff">Mode: Page changes</button>
+        <button class="pure-button button-secondary button-xsmall" name="op" value="mode:restock_diff">Mode: Price/Restock</button>
         <button class="pure-button button-secondary button-xsmall" name="op" value="clear-errors">Clear errors</button>
         <button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="clear-history">Clear/reset history</button>
         <button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="delete">Delete</button>

diff --git a/changedetectionio/tests/conftest.py b/changedetectionio/tests/conftest.py
@@ -8,7 +8,7 @@
 from changedetectionio import store
 import os
 import sys
-from loguru import logger
+
 
 # https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py
 # Much better boilerplate than the docs

diff --git a/changedetectionio/tests/fetchers/test_content.py b/changedetectionio/tests/fetchers/test_content.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 
-import time
 from flask import url_for
 from ..util import live_server_setup, wait_for_all_checks
 import logging