Skip to content

Commit

Permalink
fix: title with special characters can't be saved
Browse files Browse the repository at this point in the history
  • Loading branch information
GrassBlock1 committed Jul 30, 2024
1 parent 24737fa commit c11bff7
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 3 deletions.
44 changes: 44 additions & 0 deletions b23_cv/cleanup_filename.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""This module is adapted from yt-dlp's utils.py
(https://github.com/yt-dlp/yt-dlp/blob/94a1c5e642e468cebeb51f74c6c220434cb47d96/yt_dlp/utils/_utils.py#L612).
The original code is released under the Unlicense (https://unlicense.org); the modified code is licensed under the
GNU General Public License v3.0 (https://www.gnu.org/licenses/gpl-3.0.html).
"""

import itertools
import re

# Lookup table from accented Latin letters to plain-ASCII replacements.
# Most letters map to a single character; the ligatures and Þ/ß/þ expand to
# two characters (e.g. 'Æ' -> 'AE', 'ß' -> 'ss'). sanitize_filename() consults
# this table for every character it processes.
ACCENT_CHARS = {
    accented: plain
    for accented, plain in zip(
        'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
        [
            *'AAAAAA', 'AE', *'CEEEEIIIIDNOOOOOOO', 'OE', *'UUUUUY', 'TH', 'ss',
            *'aaaaaa', 'ae', *'ceeeeiiiionooooooo', 'oe', *'uuuuuy', 'th', 'y',
        ],
    )
}


def sanitize_filename(s):
    """Sanitize a string so it can be used as part of a filename.

    Colons inside runs of digits that look like timestamps (e.g. ``12:34:56``)
    are turned into underscores first. Each character is then mapped as
    follows:

    * characters present in ``ACCENT_CHARS`` become their ASCII replacement;
    * newlines are dropped;
    * ``" * : < > ? |`` become their full-width Unicode counterparts, the
      forward slash becomes U+29F8 and the backslash becomes U+29F9;
    * remaining ASCII control characters (code points below 32, and 127)
      become a single space;
    * every other character is kept as is.

    @param s: string to sanitize
    @return: the sanitized string ('' when s is empty)
    """
    if s == '':
        return ''

    def replace_insane(char):
        if char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '\n':
            return ''
        if char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts. '/' and '\\'
            # get dedicated look-alikes; the rest sit exactly 0xFEE0 above their
            # ASCII code point in the full-width forms block.
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        # The reserved punctuation was fully handled above, so only control
        # characters are left to neutralise here.
        if ord(char) < 32 or ord(char) == 127:
            return ' '
        return char

    # Handle timestamps: must run before the per-character pass, otherwise the
    # colons would already have been replaced by full-width colons.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    return ''.join(map(replace_insane, s))
6 changes: 5 additions & 1 deletion b23_cv/get_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait

from b23_cv import cleanup_filename
from b23_cv.download_image import download_image
from b23_cv.init_driver import init_driver

Expand Down Expand Up @@ -70,8 +71,11 @@ def __main__(stdin_url, stdin_folder):
# 将HTML内容转换为Markdown
markdown_content = markdownify.markdownify(str(soup), heading_style="ATX")

# 尝试解决标题中包含 / 等特殊字符时无法保存的问题
file_name = cleanup_filename.sanitize_filename(title)

# 保存Markdown内容到文件
markdown_file_path = os.path.join(output_folder, f'{title}.md')
markdown_file_path = os.path.join(output_folder, f'{file_name}.md')
with open(markdown_file_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
print(f'Saved {markdown_file_path}')
Expand Down
8 changes: 6 additions & 2 deletions b23_cv/get_single.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait

from b23_cv import cleanup_filename
from b23_cv.download_image import download_image
from b23_cv.init_driver import init_driver

Expand Down Expand Up @@ -47,9 +48,12 @@ def __main__(stdin_url, stdin_folder):
# 将HTML内容转换为Markdown
markdown_content = markdownify.markdownify(str(soup), heading_style="ATX")

# 尝试解决标题中包含 / 等特殊字符时无法保存的问题
file_name = cleanup_filename.sanitize_filename(title)

# 保存Markdown内容到文件
markdown_file_path = os.path.join(output_folder, f'{title}.md')
with open(markdown_file_path, 'w', encoding='utf-8') as f:
markdown_file_path = os.path.join(output_folder, f'{file_name}.md')
with open(markdown_file_path, 'w+', encoding='utf-8') as f:
f.write(markdown_content)
print(f'Saved {markdown_file_path}')

Expand Down

0 comments on commit c11bff7

Please sign in to comment.