hacky special-case solution for links in feed excerpts

Guts · Apr 6, 2024 · 8c5db54 · 8c5db54
1 parent 77250f0
commit 8c5db54
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 10 deletions.
diff --git a/mkdocs_rss_plugin/plugin.py b/mkdocs_rss_plugin/plugin.py
@@ -283,6 +283,7 @@ def on_page_content(
                 created=page_dates[0],
                 description=self.util.get_description_or_abstract(
                     in_page=page,
+                    html=html,
                     chars_count=self.config.abstract_chars_count,
                     abstract_delimiter=self.config.abstract_delimiter,
                 ),

diff --git a/mkdocs_rss_plugin/util.py b/mkdocs_rss_plugin/util.py
@@ -6,6 +6,7 @@
 
 # standard library
 import logging
+import re
 import ssl
 import sys
 from datetime import date, datetime
@@ -15,7 +16,7 @@
 from typing import Any, Iterable, Optional, Tuple, Union
 from urllib import request
 from urllib.error import HTTPError, URLError
-from urllib.parse import urlencode, urlparse, urlunparse
+from urllib.parse import urlencode, urljoin, urlparse, urlunparse
 
 # 3rd party
 import markdown
@@ -48,6 +49,22 @@
 # ########## Classes #############
 # ################################
 
+# Capture the double-quoted value of ``href`` and ``src`` attributes.
+HREF_MATCH_PATTERN = re.compile('href="(.*?)"')
+SRC_MATCH_PATTERN = re.compile('src="(.*?)"')
+
+
+def relative_links_resolve_to_page(page_html: str, page_url: Optional[str]) -> str:
+    """Make the ``href`` and ``src`` attribute values of an HTML snippet absolute.
+
+    Each value is resolved against ``page_url`` with ``urljoin``. Only the
+    attribute values themselves are rewritten: identical text found elsewhere
+    in the markup is left alone, and an empty value resolves to ``page_url``.
+
+    :param str page_html: HTML in which links have to be resolved
+    :param Optional[str] page_url: URL used as base to resolve the links. \
+    If empty or None, the HTML is returned unchanged.
+
+    :return: HTML with resolved ``href`` and ``src`` values
+    :rtype: str
+    """
+    if not page_url:
+        # urljoin() hands the link back untouched when there is no base
+        return page_html
+
+    def absolutize(match):
+        # rebuild the match around its captured URL so that the attribute name
+        # and the quotes are kept verbatim
+        matched, offset = match.group(0), match.start(0)
+        return (
+            matched[: match.start(1) - offset]
+            + urljoin(page_url, match.group(1))
+            + matched[match.end(1) - offset :]
+        )
+
+    # substitute in place instead of str.replace() on the whole document, which
+    # would also rewrite the same text outside of the attributes
+    resolved_html = HREF_MATCH_PATTERN.sub(absolutize, page_html)
+    return SRC_MATCH_PATTERN.sub(absolutize, resolved_html)
+
 
 class Util:
     """Plugin logic."""
@@ -452,12 +469,17 @@ def get_date_from_meta(
         return out_date
 
     def get_description_or_abstract(
-        self, in_page: Page, chars_count: int = 160, abstract_delimiter: str = None
+        self,
+        in_page: Page,
+        html: str,
+        chars_count: int = 160,
+        abstract_delimiter: str = None,
     ) -> str:
         """Returns description from page meta. If it doesn't exist, use the \
-        {chars_count} first characters from page content (in markdown).
+        {chars_count} first characters from page content (in html).
 
         :param Page in_page: page to look at
+        :param str html: rendered page html
         :param int chars_count: if page.meta.description is not set, number of chars \
         of the content to use. Defaults to: 160 - optional
         :param str abstract_delimiter: description delimiter, defaults to None
@@ -486,14 +508,10 @@ def get_description_or_abstract(
             return ""
         elif (
             abstract_delimiter
-            and (
-                excerpt_separator_position := in_page.markdown.find(abstract_delimiter)
-            )
-            > -1
+            and (excerpt_separator_position := html.find(abstract_delimiter)) > -1
         ):
-            return markdown.markdown(
-                in_page.markdown[:excerpt_separator_position],
-                output_format="html5",
+            return relative_links_resolve_to_page(
+                html[:excerpt_separator_position], in_page.canonical_url
             )
         # If chars count is unlimited, use the html content
         elif in_page.content and chars_count == -1:

diff --git a/tests/fixtures/docs/blog/posts/assets/example_image.webp b/tests/fixtures/docs/blog/posts/assets/example_image.webp
diff --git a/tests/fixtures/docs/blog/posts/sample_blog_post_internal_links.md b/tests/fixtures/docs/blog/posts/sample_blog_post_internal_links.md
@@ -0,0 +1,24 @@
+---
+date: 2023-02-12
+authors: [guts]
+categories:
+  - Blog
+---
+
+# Blog sample with internal links
+
+I'm a really short intro.
+
+![here's an internal image](./assets/example_image.webp)
+
+[Here's an internal link](./sample_blog_post.md)
+and another
+[Another link](../../index.md)
+
+<!-- more -->
+
+## This part won't show up in RSS feed
+
+### What is Lorem Ipsum?
+
+Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
diff --git a/tests/test_build.py b/tests/test_build.py
@@ -715,6 +715,41 @@ def test_not_git_repo(self):
         # restore name
         git_dir_tmp.replace(git_dir)
 
+    def test_abstract_with_internal_links(self):
+        """Check that internal links of a feed excerpt are absolute URLs.
+
+        Builds the fixture site, then looks up the "Blog sample with internal
+        links" entry in the created and updated feeds and checks that its
+        summary holds the absolute URL of every internal link and image.
+        """
+        expected_attributes = (
+            'href="https://guts.github.io/mkdocs-rss-plugin/blog/posts/sample_blog_post/"',
+            'href="https://guts.github.io/mkdocs-rss-plugin/"',
+            'src="https://guts.github.io/mkdocs-rss-plugin/blog/posts/assets/example_image.webp"',
+        )
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            cli_result = self.build_docs_setup(
+                testproject_path="docs",
+                mkdocs_yml_filepath=Path("tests/fixtures/mkdocs_minimal.yml"),
+                output_path=tmpdirname,
+                strict=True,
+            )
+            self.assertEqual(cli_result.exit_code, 0)
+            self.assertIsNone(cli_result.exception)
+
+            feed_rss_created = feedparser.parse(
+                Path(tmpdirname) / OUTPUT_RSS_FEED_CREATED
+            )
+
+            feed_rss_updated = feedparser.parse(
+                Path(tmpdirname) / OUTPUT_RSS_FEED_UPDATED
+            )
+
+            matching_entries = [
+                entry
+                for entry in feed_rss_created.entries + feed_rss_updated.entries
+                if entry.title == "Blog sample with internal links"
+            ]
+
+            # without this guard, the checks below would be silently skipped
+            # when the fixture page is in neither feed
+            self.assertTrue(
+                matching_entries,
+                "'Blog sample with internal links' not found in any feed",
+            )
+
+            for entry in matching_entries:
+                for expected_attribute in expected_attributes:
+                    self.assertIn(expected_attribute, entry.summary)
+
 
 # ##############################################################################
 # ##### Stand alone program ########