"""Helpers for turning arbitrary titles into strings usable as file names.

This module is modified from yt-dlp's utils.py
(https://github.com/yt-dlp/yt-dlp/blob/94a1c5e642e468cebeb51f74c6c220434cb47d96/yt_dlp/utils/_utils.py#L612).
The original code is licensed under the Unlicense License (https://unlicense.org), and the modified code is
licensed under GNU General Public License v3.0 (https://www.gnu.org/licenses/gpl-3.0.html).

``b23_cv.get_batch`` and ``b23_cv.get_single`` pass the article title through
``cleanup_filename.sanitize_filename`` and save the Markdown output as
``os.path.join(output_folder, f'{file_name}.md')``, so that titles containing
``/`` and similar special characters can still be saved.
"""

import itertools
import re

# Accented Latin letters and their ASCII replacements. In yt-dlp this table is
# only used in "restricted" mode; here it is applied to every file name.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

# Characters that are replaced by a look-alike instead of being dropped.
_UNSAFE_CHARS = '"*:<>?|/\\'
# Adding this offset to an ASCII code point yields its full-width counterpart.
_FULLWIDTH_OFFSET = 0xfee0
# The slashes get dedicated look-alikes rather than their full-width forms.
_SLASH_LOOKALIKES = {'/': '\u29F8', '\\': '\u29f9'}

# Digit groups joined by colons, e.g. "12:30" or "1:02:03".
_TIMESTAMP_RE = re.compile(r'[0-9]+(?::[0-9]+)+')


def _build_translation_table():
    """Build the ``str.translate`` table holding every per-character rule."""
    table = {}
    # Entries are written from lowest to highest priority; later writes win.
    for code in (*range(32), 127):  # ASCII control characters and DEL
        table[code] = ' '
    for char in _UNSAFE_CHARS:
        table[ord(char)] = _SLASH_LOOKALIKES.get(char, chr(ord(char) + _FULLWIDTH_OFFSET))
    table[ord('\n')] = ''  # newlines are removed, not turned into spaces
    for char, replacement in ACCENT_CHARS.items():
        table[ord(char)] = replacement
    return table


_TRANSLATION_TABLE = _build_translation_table()


def sanitize_filename(s: str) -> str:
    """Sanitize a string so it can be used as part of a file name.

    The following rules are applied:

    * colons inside timestamps (``12:30``) become underscores;
    * accented letters listed in ``ACCENT_CHARS`` become their ASCII replacement;
    * newlines are removed;
    * ``" * : < > ? |`` become their full-width Unicode counterparts, and the
      forward slash and backslash become U+29F8 and U+29F9;
    * remaining ASCII control characters (code points below 32, and 127)
      become a space.

    All other characters, including non-Latin scripts, are kept as they are.

    @param s: string to sanitize
    @return: the sanitized string; an empty input yields an empty string
    """
    # Timestamps are handled first so that their colons become "_" instead of
    # the full-width colon used for every other ":".
    without_timestamps = _TIMESTAMP_RE.sub(lambda m: m.group(0).replace(':', '_'), s)
    return without_timestamps.translate(_TRANSLATION_TABLE)