fix: title with special characters can't be saved

GrassBlock1 · Jul 30, 2024 · c11bff7 · c11bff7
1 parent 24737fa
commit c11bff7
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 3 deletions.
diff --git a/b23_cv/cleanup_filename.py b/b23_cv/cleanup_filename.py
@@ -0,0 +1,44 @@
+"""Filename sanitization helpers, modified from yt-dlp's utils.py:
+https://github.com/yt-dlp/yt-dlp/blob/94a1c5e642e468cebeb51f74c6c220434cb47d96/yt_dlp/utils/_utils.py#L612
+The original code is released under the Unlicense (https://unlicense.org); the modified code is licensed under the
+GNU General Public License v3.0 (https://www.gnu.org/licenses/gpl-3.0.html).
+"""
+
+import itertools
+import re
+
+# Maps accented/special Latin letters to ASCII replacements; sanitize_filename() applies it to every character
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
+                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
+                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
+
+
+def sanitize_filename(s):
+    """Sanitize a string so it can be used as part of a filename.
+
+    Accented letters are transliterated via ACCENT_CHARS, newlines are dropped, the reserved characters
+    " * : < > ? | / and backslash are swapped for look-alike Unicode characters, and the remaining control
+    characters (and DEL) become spaces. Colons inside timestamps such as 12:34:56 are turned into '_' first.
+
+    @param s:           string to sanitize
+    @return:            the sanitized string ('' for empty input)
+    """
+    if s == '':
+        return ''
+
+    def replace_insane(char):
+        if char in ACCENT_CHARS:
+            return ACCENT_CHARS[char]
+        elif char == '\n':
+            return ''
+        elif char in '"*:<>?|/\\':
+            # '/' and '\' get the big (reverse) solidus look-alikes; the rest map to their full-width forms
+            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
+        elif ord(char) < 32 or ord(char) == 127:
+            return ' '  # remaining control characters and DEL
+        return char
+
+    # Handle timestamps before the per-character pass, so their colons become '_' rather than a full-width colon
+    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
+
+    return ''.join(map(replace_insane, s))
diff --git a/b23_cv/get_batch.py b/b23_cv/get_batch.py
@@ -7,6 +7,7 @@
 from selenium.webdriver.support import expected_conditions as ec
 from selenium.webdriver.support.wait import WebDriverWait
 
+from b23_cv import cleanup_filename
 from b23_cv.download_image import download_image
 from b23_cv.init_driver import init_driver
 
@@ -70,8 +71,11 @@ def __main__(stdin_url, stdin_folder):
         # 将HTML内容转换为Markdown
         markdown_content = markdownify.markdownify(str(soup), heading_style="ATX")
 
+        # 尝试解决标题中包含 / 等特殊字符时无法保存的问题
+        file_name = cleanup_filename.sanitize_filename(title)
+
         # 保存Markdown内容到文件
-        markdown_file_path = os.path.join(output_folder, f'{title}.md')
+        markdown_file_path = os.path.join(output_folder, f'{file_name}.md')
         with open(markdown_file_path, 'w', encoding='utf-8') as f:
             f.write(markdown_content)
         print(f'Saved {markdown_file_path}')

diff --git a/b23_cv/get_single.py b/b23_cv/get_single.py
@@ -6,6 +6,7 @@
 from selenium.webdriver.support import expected_conditions as ec
 from selenium.webdriver.support.wait import WebDriverWait
 
+from b23_cv import cleanup_filename
 from b23_cv.download_image import download_image
 from b23_cv.init_driver import init_driver
 
@@ -47,9 +48,12 @@ def __main__(stdin_url, stdin_folder):
     # 将HTML内容转换为Markdown
     markdown_content = markdownify.markdownify(str(soup), heading_style="ATX")
 
+    # 尝试解决标题中包含 / 等特殊字符时无法保存的问题
+    file_name = cleanup_filename.sanitize_filename(title)
+
     # 保存Markdown内容到文件
-    markdown_file_path = os.path.join(output_folder, f'{title}.md')
-    with open(markdown_file_path, 'w', encoding='utf-8') as f:
+    markdown_file_path = os.path.join(output_folder, f'{file_name}.md')
+    with open(markdown_file_path, 'w+', encoding='utf-8') as f:
         f.write(markdown_content)
     print(f'Saved {markdown_file_path}')