Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial support for scraper ML price extraction integration #2584

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
9 changes: 9 additions & 0 deletions .github/workflows/test-stack-reusable-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ jobs:
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser --hostname sockpuppetbrowser --rm -p 3000:3000 dgtlmoon/sockpuppetbrowser:latest
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --cap-add=SYS_ADMIN --name sockpuppetbrowser-custom-url --hostname sockpuppetbrowser-custom-url -p 3001:3000 --rm dgtlmoon/sockpuppetbrowser:latest

# CDIO AI Element scraper for prices
# Run CDIO with PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element
docker run --network changedet-network -d -e "LOG_LEVEL=TRACE" --name cdio-ai-price-element --hostname cdio-ai-price-element -p 5005:5005 --rm dgtlmoon/changedetection.io-ai:latest

- name: Spin up ancillary SMTP+Echo message test server
run: |
# Debug SMTP server/echo message back server
Expand Down Expand Up @@ -95,6 +99,11 @@ jobs:
# Settings headers playwright tests - Call back in from Sockpuppetbrowser, check headers
docker run --name "changedet" --hostname changedet --rm -e "FLASK_SERVER_NAME=changedet" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000?dumpio=true" --network changedet-network test-changedetectionio bash -c 'find .; cd changedetectionio; pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/test_request.py; pwd;find .'

# PLAYWRIGHT/NODE-> CDP
- name: ML/AI Price element scraper via Playwright+dgtlmoon/changedetection.io-ai
  run: |
    # PRICE_SCRAPER_ML_ENDPOINT is called with a plain HTTP POST, so it must be an http:// URL (only the browser driver uses ws://)
    docker run --rm -e "FLASK_SERVER_NAME=cdio" -e "PLAYWRIGHT_DRIVER_URL=ws://sockpuppetbrowser:3000" -e "PRICE_SCRAPER_ML_ENDPOINT=http://cdio-ai-price-element:5005/price-element" --network changedet-network --hostname=cdio test-changedetectionio bash -c 'cd changedetectionio;pytest --live-server-host=0.0.0.0 --live-server-port=5004 tests/ml_price_scraper/test_scrape_price_element.py'

- name: Playwright and SocketPuppetBrowser - Restock detection
run: |
# restock detection via playwright - added name=changedet here so that playwright and sockpuppetbrowser can connect to it
Expand Down
2 changes: 1 addition & 1 deletion changedetectionio/content_fetchers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from changedetectionio.content_fetchers.exceptions import BrowserStepsStepException
import os

visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary'
visualselector_xpath_selectors = 'div,span,form,table,tbody,tr,td,a,p,ul,li,h1,h2,h3,h4,header,footer,section,article,aside,details,main,nav,section,summary,bdi,strong'

# available_fetchers() will scan this implementation looking for anything starting with html_
# this information is used in the form selections
Expand Down
114 changes: 92 additions & 22 deletions changedetectionio/content_fetchers/res/xpath_element_scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ try {
console.log(e);
}

// Percentage (0-100, rounded to the nearest integer) of the characters in `str` that are digits.
// An empty string yields 0 rather than NaN (0 / 0), so the value is always safe to serialise.
const percentageNumerical = str => {
    if (!str.length) {
        return 0;
    }
    const digits = str.match(/\d/g) || [];
    return Math.round(digits.length / str.length * 100);
};

// Include the getXpath script directly, easier than fetching
function getxpath(e) {
Expand Down Expand Up @@ -77,6 +78,30 @@ const findUpTag = (el) => {
}
return null;
}
// Text width scraper for ML training/detection
// Create a single canvas and get its 2D context
const canvas = document.createElement("canvas");
const context = canvas.getContext("2d");

// Measure the text inside an element (which can differ a lot from the element's own box).
// Returns { textWidth, textHeight }, both rounded to the nearest integer pixel:
//   - textWidth is measured on the shared canvas 2D context using the element's computed font
//   - textHeight is only an approximation: the element's computed font size
function getTextWidthAndHeightinPx(element) {
    const style = window.getComputedStyle(element);

    // The canvas context has to use the same font as the element for the measurement to be meaningful
    context.font = style.font;

    const content = element.textContent || element.innerText;
    const measuredWidth = context.measureText(content).width;

    return {
        textWidth: Math.round(measuredWidth),
        textHeight: Math.round(parseFloat(style.fontSize)),
    };
}


// @todo - if it's SVG or IMG, go into image diff mode
Expand Down Expand Up @@ -122,8 +147,10 @@ const visibleElementsArray = [];
// Call collectVisibleElements with the starting parent element
collectVisibleElements(document.body, visibleElementsArray);

// Append any custom selectors to the visibleElementsArray

visibleElementsArray.forEach(function (element) {

function get_element_metadata(element) {

bbox = element.getBoundingClientRect();

Expand Down Expand Up @@ -164,17 +191,37 @@ visibleElementsArray.forEach(function (element) {
}
}

let label = "not-interesting" // A placeholder, the actual labels for training are done by hand for now
let label = "none" // A placeholder, the actual labels for training are done by hand for now

let text = element.textContent.trim().slice(0, 30).trim();
while (/\n{2,}|\t{2,}/.test(text)) {
text = text.replace(/\n{2,}/g, '\n').replace(/\t{2,}/g, '\t')
}
// Check if the element was found and get its text , not including any child element
let text = Array.from(element.childNodes)
.filter(node => node.nodeType === Node.TEXT_NODE)
.map(node => node.textContent)
.join('');

// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|,–)/.test(text) ;
// Remove any gaps in sequences of newlines and tabs inside the string
text = text.trim().replace(/[\s\t\n\r]{2,}/g, ' ').trim();

size_pos.push({
// Try to identify any possible currency amounts "Sale: 4000" or "Sale now 3000 Kc", can help with the training.
// @todo could be instead of USD/AUD etc [A-Z]{2,3} ?
//const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ;
const hasDigitCurrency = (/\d/.test(text.slice(0, 6)) || /\d/.test(text.slice(-6)) ) && /([€£$¥₩₹]|USD|AUD|EUR|Kč|kr|SEK|RM|,–)/.test(text) ;
const hasDigit = /[0-9]/.test(text) ;

// Sizing of the actual text inside the element can be very different from the elements size
const { textWidth, textHeight } = getTextWidthAndHeightinPx(element);

const computedStyle = window.getComputedStyle(element);
let red, green, blue;

if (text.length) {
// Extract the RGB values from the color string (format: rgb(r, g, b))
[red, green, blue] = computedStyle.color.match(/\d+/g).map(Number);
} else {
// Assign default values if text is empty
[red, green, blue] = [0, 0, 0];
}
return {
xpath: xpath_result,
width: Math.round(bbox['width']),
height: Math.round(bbox['height']),
Expand All @@ -185,13 +232,27 @@ visibleElementsArray.forEach(function (element) {
// tagtype used by Browser Steps
tagtype: (element.tagName.toLowerCase() === 'input' && element.type) ? element.type.toLowerCase() : '',
isClickable: window.getComputedStyle(element).cursor === "pointer",
// Used by the keras trainer
// Used by the keras/pytorch trainer
fontSize: window.getComputedStyle(element).getPropertyValue('font-size'),
fontWeight: window.getComputedStyle(element).getPropertyValue('font-weight'),
pcNumerical: text.length && percentageNumerical(text),
hasDigit: hasDigit,
hasDigitCurrency: hasDigitCurrency,
textWidth: textWidth,
textHeight: textHeight,
textLength: text.length,
t_r: red,
t_g: green,
t_b: blue,
label: label,
});
};
}

// Gather metadata for every visible element; elements that yield no metadata are left out of size_pos
visibleElementsArray.forEach((visibleElement) => {
    const elementInfo = get_element_metadata(visibleElement);
    if (!elementInfo) {
        return;
    }
    size_pos.push(elementInfo);
});


Expand All @@ -200,7 +261,19 @@ visibleElementsArray.forEach(function (element) {
if (include_filters.length) {
let results;
// Foreach filter, go and find it on the page and add it to the results so we can visualise it again
outerLoop:
for (const f of include_filters) {
// Quick check so we dont end up with duplicates in the training data
for (let index = 0; index < size_pos.length; index++) {
let item = size_pos[index];
if (item.xpath === f) {
item.highlight_as_custom_filter = true;
item.found_as_duplicate = true;
item.label = "price";
continue outerLoop;
}
}

bbox = false;
q = false;

Expand All @@ -221,7 +294,6 @@ if (include_filters.length) {
}
} else {
console.log("[css] Scanning for included filter " + f)
console.log("[css] Scanning for included filter " + f);
results = document.querySelectorAll(f);
}
} catch (e) {
Expand Down Expand Up @@ -258,17 +330,15 @@ if (include_filters.length) {
console.log("xpath_element_scraper: error looking up q.ownerElement")
}
}

if (bbox && bbox['width'] > 0 && bbox['height'] > 0) {
size_pos.push({
xpath: f,
width: parseInt(bbox['width']),
height: parseInt(bbox['height']),
left: parseInt(bbox['left']),
top: parseInt(bbox['top']) + scroll_y,
highlight_as_custom_filter: true
});
element_info = get_element_metadata(node);
if(element_info) {
// Be sure we use exactly what was written
element_info['xpath'] = f;
element_info['highlight_as_custom_filter'] = true;
element_info['label'] = "price";
size_pos.push(element_info);
}

});
}
}
Expand Down
15 changes: 12 additions & 3 deletions changedetectionio/flask_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,9 +792,9 @@ def edit_page(uuid):
# Re #286 - We wait for syncing new data to disk in another thread every 60 seconds
# But in the case something is added we should save straight away
datastore.needs_write_urgent = True

# Queue the watch for immediate recheck, with a higher priority
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))
if not datastore.data['watching'][uuid].get('paused'):
# Queue the watch for immediate recheck, with a higher priority
update_q.put(queuedWatchMetaData.PrioritizedItem(priority=1, item={'uuid': uuid, 'skip_when_checksum_same': False}))

# Diff page [edit] link should go back to diff page
if request.args.get("next") and request.args.get("next") == 'diff':
Expand Down Expand Up @@ -1602,6 +1602,15 @@ def form_watch_list_checkbox_operations():

flash(f"{len(uuids)} watches were tagged")

elif op.startswith('mode:'):
mode = op.replace('mode:','')
for uuid in uuids:
uuid = uuid.strip()
if datastore.data['watching'].get(uuid):
datastore.data['watching'][uuid]['processor'] = mode
flash(f"{len(uuids)} watches changed modes")


return redirect(url_for('index'))

@app.route("/api/share-url", methods=['GET'])
Expand Down
2 changes: 1 addition & 1 deletion changedetectionio/model/Watch.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,7 @@ def save_xpath_data(self, data, as_error=False):
self.ensure_data_dir_exists()

with open(target_path, 'w') as f:
f.write(json.dumps(data))
f.write(json.dumps(data, indent=2))
f.close()

# Save as PNG, PNG is larger but better for doing visual diff in the future
Expand Down
66 changes: 66 additions & 0 deletions changedetectionio/processors/restock_diff/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
from . import Restock
from loguru import logger
import hashlib
import os
import re
import urllib3
import time

from ...html_tools import html_to_text

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
name = 'Re-stock & Price detection for single product pages'
description = 'Detects if the product goes back to in-stock'
Expand Down Expand Up @@ -34,6 +37,7 @@ def get_itemprop_availability(html_content) -> Restock:
Kind of funny/cool way to find price/availability among the many different possibilities.
Use 'extruct' to find any possible RDFa/microdata/json-ld data, make a JSON string from the output then search it.
"""

from jsonpath_ng import parse

now = time.time()
Expand All @@ -54,6 +58,7 @@ def get_itemprop_availability(html_content) -> Restock:

# First phase, dead simple scanning of anything that looks useful
value = Restock()
return value
if data:
logger.debug(f"Using jsonpath to find price/availability/etc")
price_parse = parse('$..(price|Price)')
Expand Down Expand Up @@ -121,6 +126,52 @@ class perform_site_check(difference_detection_processor):
screenshot = None
xpath_data = None

def ML_scrape_for_price_data(self, ML_price_scraper_url, timeout=60):
    """Ask the ML price-scraper service which scraped element holds the price, then parse it.

    POSTs ``self.fetcher.xpath_data`` as JSON to ``ML_price_scraper_url``. The service is
    expected to answer with a dict containing ``idx`` (an index into
    ``xpath_data['size_pos']``) and ``score`` (the prediction confidence). When the score is
    between 0.80 and 1.0 the selector of that element is applied to ``self.fetcher.content``
    and the resulting text is handed to ``price_parser``.

    :param ML_price_scraper_url: URL of the price-element prediction endpoint.
    :param timeout: Seconds to wait for the endpoint before ``requests`` raises a timeout,
        so an unresponsive service cannot block the check forever.
    :return: The ``Price`` parsed from the predicted element's text, or ``None`` when the
        request fails, the response is malformed, the score is out of range, or no text
        could be extracted.
    :raises requests.exceptions.RequestException: If the endpoint cannot be reached or times out.
    """
    import requests
    from changedetectionio import html_tools

    # @TODO THROW HELPFUL MESSAGE WITH LINK TO TUTORIAL IF IT CANT CONNECT!
    response = requests.post(ML_price_scraper_url, json=self.fetcher.xpath_data, timeout=timeout)
    logger.debug(f"ML Price scraper - {ML_price_scraper_url} Response OK? - '{response.ok}'")

    # response.ok is true for any status code below 400
    if not response.ok:
        logger.error(f"ML Price scraper: Request failed with status code: {response.status_code}")
        return None

    try:
        response_json = response.json()
    except ValueError:
        # A successful status with a non-JSON body is treated like any other malformed response
        logger.error("ML Price scraper: response was not valid JSON")
        return None

    logger.debug(f"ML Price scraper: response - {response_json}'")
    if not isinstance(response_json, dict) or 'idx' not in response_json:
        logger.error("ML Price scraper: missing xpath index (IDX) in response?")
        return None

    # A missing or non-numeric score is handled the same way as a low-confidence prediction
    score = response_json.get('score')
    if not isinstance(score, (int, float)) or score < 0.80 or score > 1.0:
        logger.warning("Predict score was outside normal range, aborting ML/AI price check, needs better training data in this case?")
        return None

    # Only trust an index that really points at one of the elements we sent
    suggested_xpath_idx = response_json.get('idx')
    size_pos = self.fetcher.xpath_data.get('size_pos') or []
    if not isinstance(suggested_xpath_idx, int) or not 0 <= suggested_xpath_idx < len(size_pos):
        logger.error(f"ML Price scraper: suggested index '{suggested_xpath_idx}' does not match any scraped element")
        return None

    # Use the path provided to extract the price text
    from price_parser import Price
    selector = size_pos[suggested_xpath_idx]['xpath']
    logger.debug(f"Predicted selector with price information is {selector}")

    if selector.startswith(('/', 'xpath')):
        result_s = html_tools.xpath_filter(xpath_filter=selector,
                                           html_content=self.fetcher.content)
    else:
        # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
        result_s = html_tools.include_filters(include_filters=selector,
                                              html_content=self.fetcher.content)

    if not result_s:
        return None

    text = html_to_text(result_s)
    logger.debug(f"Guessed the text '{text}' as the price information")
    if not text:
        return None

    return Price.fromstring(text)

def run_changedetection(self, watch, skip_when_checksum_same=True):
if not watch:
raise Exception("Watch no longer exists.")
Expand Down Expand Up @@ -177,6 +228,21 @@ def run_changedetection(self, watch, skip_when_checksum_same=True):
else:
update_obj['restock']['in_stock'] = False

# Attempt to pass the elements off to the machine-learning endpoint if its enabled
# This might return a confident guess as to which element contains the price data
if not itemprop_availability.get('price'):
ML_price_scraper_url = os.getenv("PRICE_SCRAPER_ML_ENDPOINT")
if self.fetcher.xpath_data and ML_price_scraper_url:
price_info = self.ML_scrape_for_price_data(ML_price_scraper_url)
if price_info and price_info.amount:
logger.success(f"ML Price scraper: Got price data {price_info}")
itemprop_availability['price'] = f"{price_info.amount}"
update_obj['restock']['price'] = f"{price_info.amount}"
if price_info and price_info.currency:
itemprop_availability['currency'] = price_info.currency
update_obj['restock']['currency'] = price_info.currency


# Main detection method
fetched_md5 = None

Expand Down
2 changes: 2 additions & 0 deletions changedetectionio/templates/watch-overview.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
<button class="pure-button button-secondary button-xsmall" name="op" value="assign-tag" id="checkbox-assign-tag">Tag</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="mark-viewed">Mark viewed</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="notification-default">Use default notification</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="mode:text_json_diff">Mode: Page changes</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="mode:restock_diff">Mode: Price/Restock</button>
<button class="pure-button button-secondary button-xsmall" name="op" value="clear-errors">Clear errors</button>
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="clear-history">Clear/reset history</button>
<button class="pure-button button-secondary button-xsmall" style="background: #dd4242;" name="op" value="delete">Delete</button>
Expand Down
2 changes: 1 addition & 1 deletion changedetectionio/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from changedetectionio import store
import os
import sys
from loguru import logger


# https://github.com/pallets/flask/blob/1.1.2/examples/tutorial/tests/test_auth.py
# Much better boilerplate than the docs
Expand Down
1 change: 0 additions & 1 deletion changedetectionio/tests/fetchers/test_content.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python3

import time
from flask import url_for
from ..util import live_server_setup, wait_for_all_checks
import logging
Expand Down
Loading
Loading