From 71bb8fe15aabbfbeadc561b5be9c67e40f8a3e6f Mon Sep 17 00:00:00 2001 From: ddio Date: Thu, 26 Sep 2024 23:41:16 +0800 Subject: [PATCH 1/2] feat: allow flex-direction obfuscation --- scrapy-tw-rental-house/pyproject.toml | 2 +- .../scrapy_twrh/spiders/rental591/util.py | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/scrapy-tw-rental-house/pyproject.toml b/scrapy-tw-rental-house/pyproject.toml index 7c78bde6..4ac928fb 100644 --- a/scrapy-tw-rental-house/pyproject.toml +++ b/scrapy-tw-rental-house/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "scrapy-tw-rental-house" -version = "1.4.0" +version = "1.4.1" description = "Scrapy spider for TW Rental House" readme = "README.md" authors = ["ddio "] diff --git a/scrapy-tw-rental-house/scrapy_twrh/spiders/rental591/util.py b/scrapy-tw-rental-house/scrapy_twrh/spiders/rental591/util.py index da4ce8ec..cccc9db0 100644 --- a/scrapy-tw-rental-house/scrapy_twrh/spiders/rental591/util.py +++ b/scrapy-tw-rental-house/scrapy_twrh/spiders/rental591/util.py @@ -1,3 +1,4 @@ +import re from collections import namedtuple from scrapy.http import Response from scrapy_twrh.spiders.util import clean_number @@ -45,20 +46,31 @@ def reorder_inline_flex_dom(base: Response, selector): items = base.css(selector) ret = [] for item in items: - # child span may contain style="display:inline-flex;" + # child span may contain style="display:inline-flex;flex-direction:row-reverse;" i_list = item.css('span[style*=display\\:inline-flex] > i') plain_value = item.xpath('text()').get() if plain_value is not None: ret.append(plain_value) elif i_list: + # check if it's reversed, find all values of flex-direction + container_style = item.css('span[style*=display\\:inline-flex]::attr(style)').get() + + # we may have multiple flex-direction, get last one + flex_directions = re.findall(r'flex-direction: ?([\w-]+)', container_style) + order_base = 1 + if flex_directions: + last_flex_direction = flex_directions[-1] + if last_flex_direction == 'row-reverse': + order_base = -1 # store i_list order (in style:order) and its ::text content) shuffled_list = [] for i in i_list: order = i.css('::attr(style)').re_first(r'order:(\d+)') + order = int(order) * order_base text = i.css('::text').get() shuffled_list.append((order, text)) # sort by order - shuffled_list.sort(key=lambda x: int(x[0])) + shuffled_list.sort(key=lambda x: x[0]) ret.append(''.join(map(lambda x: x[1], shuffled_list))) return ret From 068c4b7708cec7f49d5bf09e046ed79b67f16501 Mon Sep 17 00:00:00 2001 From: ddio Date: Thu, 26 Sep 2024 23:43:03 +0800 Subject: [PATCH 2/2] feat: update to latest scrapy twrh lib --- scrapy-twrh-example/poetry.lock | 2 +- twrh-dataset/poetry.lock | 8 ++++---- twrh-dataset/pyproject.toml | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/scrapy-twrh-example/poetry.lock b/scrapy-twrh-example/poetry.lock index 43fd0be4..008062da 100644 --- a/scrapy-twrh-example/poetry.lock +++ b/scrapy-twrh-example/poetry.lock @@ -639,7 +639,7 @@ w3lib = ">=1.17.0" [[package]] name = "scrapy-tw-rental-house" -version = "1.4.0" +version = "1.4.1" description = "Scrapy spider for TW Rental House" optional = false python-versions = "^3.10" diff --git a/twrh-dataset/poetry.lock b/twrh-dataset/poetry.lock index d70e6c98..5326ce49 100644 --- a/twrh-dataset/poetry.lock +++ b/twrh-dataset/poetry.lock @@ -897,13 +897,13 @@ w3lib = ">=1.17.0" [[package]] name = "scrapy-tw-rental-house" -version = "1.4.0" +version = "1.4.1" description = "Scrapy spider for TW Rental House" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "scrapy_tw_rental_house-1.4.0-py3-none-any.whl", hash = "sha256:5c225bfb6a5f26880dac20a1425b2cbc2104eec7e8e43ead431baa1c290dba05"}, - {file = "scrapy_tw_rental_house-1.4.0.tar.gz", hash = "sha256:c8dba314990a483c59b3eb2cdb447c1eede4845f06face7f4b9f4b1913b96199"}, + {file = "scrapy_tw_rental_house-1.4.1-py3-none-any.whl", hash = "sha256:8509fd08eb86449c698b0cd2eca17fd5384497b1dde4faef1da9295ed4ff682c"}, + {file = "scrapy_tw_rental_house-1.4.1.tar.gz", hash = "sha256:3c5826889cb496c52d1b6dabd88e0535568c98bbc27fc30a0fd193646399705c"}, ] [package.dependencies] @@ -1234,4 +1234,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "e11677048b8eca5af06b484dec2a3a1f4738890fa2bc5ea65a15618c19efb96e" +content-hash = "ab1759449ce5c1ac336ea1ef0b04edcc19e05e93503676fe38ee5b1a9b5a3ff8" diff --git a/twrh-dataset/pyproject.toml b/twrh-dataset/pyproject.toml index c40a4b7a..39602de5 100644 --- a/twrh-dataset/pyproject.toml +++ b/twrh-dataset/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" python = "^3.10" # cffi = "==1.13.2" django = "^5" -scrapy-tw-rental-house = "==1.4.0" +scrapy-tw-rental-house = "==1.4.1" psycopg2-binary = "^2.9.9" pylint-django = "^2.5.5" sentry-sdk = "^1.39.1"