From f49eb4567f21f779d917e562d3cb47bdbee0eb41 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Mon, 20 May 2024 15:11:15 +0200 Subject: [PATCH 1/2] Ability to set default User-Agent for either fetching types directly in the UI (#2375) --- .../content_fetchers/puppeteer.py | 1 - .../content_fetchers/requests.py | 5 -- changedetectionio/forms.py | 6 +++ changedetectionio/model/App.py | 5 ++ changedetectionio/processors/__init__.py | 4 ++ changedetectionio/store.py | 1 - changedetectionio/templates/settings.html | 16 ++++-- changedetectionio/tests/test_request.py | 52 +++++++++++++++---- 8 files changed, 70 insertions(+), 20 deletions(-) diff --git a/changedetectionio/content_fetchers/puppeteer.py b/changedetectionio/content_fetchers/puppeteer.py index cad1b6b8531..a497cb165f0 100644 --- a/changedetectionio/content_fetchers/puppeteer.py +++ b/changedetectionio/content_fetchers/puppeteer.py @@ -9,7 +9,6 @@ from changedetectionio.content_fetchers.base import Fetcher, manage_user_agent from changedetectionio.content_fetchers.exceptions import PageUnloadable, Non200ErrorCodeReceived, EmptyReply, BrowserFetchTimedOut, BrowserConnectError - class fetcher(Fetcher): fetcher_description = "Puppeteer/direct {}/Javascript".format( os.getenv("PLAYWRIGHT_BROWSER_TYPE", 'chromium').capitalize() diff --git a/changedetectionio/content_fetchers/requests.py b/changedetectionio/content_fetchers/requests.py index b743dbcec42..2c28cda7c44 100644 --- a/changedetectionio/content_fetchers/requests.py +++ b/changedetectionio/content_fetchers/requests.py @@ -30,11 +30,6 @@ def run(self, if self.browser_steps_get_valid_steps(): raise BrowserStepsInUnsupportedFetcher(url=url) - # Make requests use a more modern looking user-agent - if not {k.lower(): v for k, v in request_headers.items()}.get('user-agent', None): - request_headers['User-Agent'] = os.getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 
Safari/537.36') - proxies = {} # Allows override the proxy on a per-request basis diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py index 2d64a2273db..673be9caa16 100644 --- a/changedetectionio/forms.py +++ b/changedetectionio/forms.py @@ -526,6 +526,10 @@ class SingleExtraBrowser(Form): browser_connection_url = StringField('Browser connection URL', [validators.Optional()], render_kw={"placeholder": "wss://brightdata... wss://oxylabs etc", "size":50}) # @todo do the validation here instead +class DefaultUAInputForm(Form): + html_requests = StringField('Plaintext requests', validators=[validators.Optional()], render_kw={"placeholder": ""}) + if os.getenv("PLAYWRIGHT_DRIVER_URL") or os.getenv("WEBDRIVER_URL"): + html_webdriver = StringField('Chrome requests', validators=[validators.Optional()], render_kw={"placeholder": ""}) # datastore.data['settings']['requests'].. class globalSettingsRequestForm(Form): @@ -537,6 +541,8 @@ class globalSettingsRequestForm(Form): extra_proxies = FieldList(FormField(SingleExtraProxy), min_entries=5) extra_browsers = FieldList(FormField(SingleExtraBrowser), min_entries=5) + default_ua = FormField(DefaultUAInputForm, label="Default User-Agent overrides") + def validate_extra_proxies(self, extra_validators=None): for e in self.data['extra_proxies']: if e.get('proxy_name') or e.get('proxy_url'): diff --git a/changedetectionio/model/App.py b/changedetectionio/model/App.py index 1202d5db198..75384f17056 100644 --- a/changedetectionio/model/App.py +++ b/changedetectionio/model/App.py @@ -6,6 +6,7 @@ ) _FILTER_FAILURE_THRESHOLD_ATTEMPTS_DEFAULT = 6 +DEFAULT_SETTINGS_HEADERS_USERAGENT='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36' class model(dict): base_config = { @@ -22,6 +23,10 @@ class model(dict): 'time_between_check': {'weeks': None, 'days': None, 'hours': 3, 'minutes': None, 'seconds': None}, 'timeout': int(getenv("DEFAULT_SETTINGS_REQUESTS_TIMEOUT", "45")), 
# Default 45 seconds 'workers': int(getenv("DEFAULT_SETTINGS_REQUESTS_WORKERS", "10")), # Number of threads, lower is better for slow connections + 'default_ua': { + 'html_requests': getenv("DEFAULT_SETTINGS_HEADERS_USERAGENT", DEFAULT_SETTINGS_HEADERS_USERAGENT), + 'html_webdriver': None, + } }, 'application': { # Custom notification content diff --git a/changedetectionio/processors/__init__.py b/changedetectionio/processors/__init__.py index e2b544811a9..8702ee5d195 100644 --- a/changedetectionio/processors/__init__.py +++ b/changedetectionio/processors/__init__.py @@ -97,6 +97,10 @@ def call_browser(self): request_headers.update(self.datastore.get_all_base_headers()) request_headers.update(self.datastore.get_all_headers_in_textfile_for_watch(uuid=self.watch.get('uuid'))) + ua = self.datastore.data['settings']['requests'].get('default_ua') + if ua and ua.get(prefer_fetch_backend): + request_headers.update({'User-Agent': ua.get(prefer_fetch_backend)}) + # https://github.com/psf/requests/issues/4525 # Requests doesnt yet support brotli encoding, so don't put 'br' here, be totally sure that the user cannot # do this by accident. diff --git a/changedetectionio/store.py b/changedetectionio/store.py index 884c617a16b..afa6b2ae6c4 100644 --- a/changedetectionio/store.py +++ b/changedetectionio/store.py @@ -554,7 +554,6 @@ def has_extra_headers_file(self): return os.path.isfile(filepath) def get_all_base_headers(self): - from .model.App import parse_headers_from_text_file headers = {} # Global app settings headers.update(self.data['settings'].get('headers', {})) diff --git a/changedetectionio/templates/settings.html b/changedetectionio/templates/settings.html index e72c7818a88..0e3cea34470 100644 --- a/changedetectionio/templates/settings.html +++ b/changedetectionio/templates/settings.html @@ -108,8 +108,6 @@

Use the Basic method (default) where your watched sites don't need Javascript to render.

The Chrome/Javascript method requires a network connection to a running WebDriver+Chrome server, set by the ENV var 'WEBDRIVER_URL'.

-
- Tip: Connect using Bright Data and Oxylabs Proxies, find out more here.
@@ -121,6 +119,18 @@ {{ render_field(form.application.form.webdriver_delay) }}
+
+ {{ render_field(form.requests.form.default_ua) }} + + Applied to all requests.

+ Note: Simply changing the User-Agent often does not defeat anti-robot technologies; it's important to consider all of the ways that the browser is detected. +
+
+
@@ -190,7 +200,7 @@

Chrome Extension

- + Chrome Chrome Webstore

diff --git a/changedetectionio/tests/test_request.py b/changedetectionio/tests/test_request.py index 869ea349656..cfbc7825aad 100644 --- a/changedetectionio/tests/test_request.py +++ b/changedetectionio/tests/test_request.py @@ -256,12 +256,40 @@ def test_method_in_request(client, live_server): def test_headers_textfile_in_request(client, live_server): #live_server_setup(live_server) # Add our URL to the import page + + webdriver_ua = "Hello fancy webdriver UA 1.0" + requests_ua = "Hello basic requests UA 1.1" + test_url = url_for('test_headers', _external=True) if os.getenv('PLAYWRIGHT_DRIVER_URL'): # Because its no longer calling back to localhost but from the browser container, set in test-only.yml test_url = test_url.replace('localhost', 'cdio') - print ("TEST URL IS ",test_url) + form_data = { + "application-fetch_backend": "html_requests", + "application-minutes_between_check": 180, + "requests-default_ua-html_requests": requests_ua + } + + if os.getenv('PLAYWRIGHT_DRIVER_URL'): + form_data["requests-default_ua-html_webdriver"] = webdriver_ua + + res = client.post( + url_for("settings_page"), + data=form_data, + follow_redirects=True + ) + assert b'Settings updated' in res.data + + res = client.get(url_for("settings_page")) + + # Only when some kind of real browser is setup + if os.getenv('PLAYWRIGHT_DRIVER_URL'): + assert b'requests-default_ua-html_webdriver' in res.data + + # Field should always be there + assert b"requests-default_ua-html_requests" in res.data + # Add the test URL twice, we will check res = client.post( url_for("import_page"), @@ -272,15 +300,14 @@ def test_headers_textfile_in_request(client, live_server): wait_for_all_checks(client) - # Add some headers to a request res = client.post( url_for("edit_page", uuid="first"), data={ - "url": test_url, - "tags": "testtag", - "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', - "headers": "xxx:ooo\ncool:yeah\r\n"}, + "url": test_url, + "tags": "testtag", 
+ "fetch_backend": 'html_webdriver' if os.getenv('PLAYWRIGHT_DRIVER_URL') else 'html_requests', + "headers": "xxx:ooo\ncool:yeah\r\n"}, follow_redirects=True ) assert b"Updated watch." in res.data @@ -292,7 +319,7 @@ def test_headers_textfile_in_request(client, live_server): with open('test-datastore/headers.txt', 'w') as f: f.write("global-header: nice\r\nnext-global-header: nice") - with open('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt', 'w') as f: + with open('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt', 'w') as f: f.write("watch-header: nice") client.get(url_for("form_watch_checknow"), follow_redirects=True) @@ -306,7 +333,7 @@ def test_headers_textfile_in_request(client, live_server): # Not needed anymore os.unlink('test-datastore/headers.txt') os.unlink('test-datastore/headers-testtag.txt') - os.unlink('test-datastore/'+extract_UUID_from_client(client)+'/headers.txt') + os.unlink('test-datastore/' + extract_UUID_from_client(client) + '/headers.txt') # The service should echo back the request verb res = client.get( url_for("preview_page", uuid="first"), @@ -319,7 +346,12 @@ def test_headers_textfile_in_request(client, live_server): assert b"Watch-Header:nice" in res.data assert b"Tag-Header:test" in res.data + # Check the custom UA from system settings page made it through + if os.getenv('PLAYWRIGHT_DRIVER_URL'): + assert "User-Agent:".encode('utf-8') + webdriver_ua.encode('utf-8') in res.data + else: + assert "User-Agent:".encode('utf-8') + requests_ua.encode('utf-8') in res.data - #unlink headers.txt on start/stop + # unlink headers.txt on start/stop res = client.get(url_for("form_delete", uuid="all"), follow_redirects=True) - assert b'Deleted' in res.data \ No newline at end of file + assert b'Deleted' in res.data From 7b04b52e45a944f6772f6483bc1424af216c4941 Mon Sep 17 00:00:00 2001 From: Alexander Sulfrian Date: Mon, 20 May 2024 15:49:12 +0200 Subject: [PATCH 2/2] RSS and tags/groups - Fixes use 
active_tag_uuid, fixes broken RSS link in page html (#2379) --- changedetectionio/templates/base.html | 6 +++--- .../templates/watch-overview.html | 20 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/changedetectionio/templates/base.html b/changedetectionio/templates/base.html index 60ceb3bb549..bbfe8634d14 100644 --- a/changedetectionio/templates/base.html +++ b/changedetectionio/templates/base.html @@ -6,7 +6,7 @@ Change Detection{{extra_title}} - + {% if extra_stylesheets %} @@ -83,8 +83,8 @@
  • - - + + diff --git a/changedetectionio/templates/watch-overview.html b/changedetectionio/templates/watch-overview.html index 6a25208ea49..15f538fb85e 100644 --- a/changedetectionio/templates/watch-overview.html +++ b/changedetectionio/templates/watch-overview.html @@ -13,7 +13,7 @@
    {{ render_nolabel_field(form.url, placeholder="https://...", required=true) }} - {{ render_nolabel_field(form.tags, value=active_tag.title if active_tag else '', placeholder="watch label / tag") }} + {{ render_nolabel_field(form.tags, value=active_tag.title if active_tag_uuid else '', placeholder="watch label / tag") }} {{ render_nolabel_field(form.watch_submit_button, title="Watch this URL!" ) }} {{ render_nolabel_field(form.edit_and_watch_submit_button, title="Edit first then Watch") }}
    @@ -46,7 +46,7 @@ {% endif %} {% if search_q %}
    Searching "{{search_q}}"
    {% endif %}
    - All + All {% for uuid, tag in tags %} @@ -67,11 +67,11 @@ {% set link_order = "desc" if sort_order == 'asc' else "asc" %} {% set arrow_span = "" %} - # + # - Website - Last Checked - Last Changed + Website + Last Checked + Last Changed @@ -95,11 +95,11 @@ {{ loop.index+pagination.skip }} {% if not watch.paused %} - Pause checks + Pause checks {% else %} - UnPause checks + UnPause checks {% endif %} - Mute notifications + Mute notifications {{watch.title if watch.title is not none and watch.title|length > 0 else watch.url}} @@ -204,7 +204,7 @@ all {% if active_tag_uuid %} in "{{active_tag.title}}"{%endif%}
  • - RSS Feed + RSS Feed
  • {{ pagination.links }}