Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support .ass output #40

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 142 additions & 19 deletions to_srt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,33 @@
import re


# Suffixes of the input files that main() picks up. Each entry is compared
# against the last four characters of a file name, which is why "dfxp" is
# listed without its leading dot.
SUPPORTED_EXTENSIONS = [".xml", ".vtt", "dfxp"]


def leading_zeros(value, digits=2):
    """Return ``value`` as a string that is ``digits`` characters wide.

    Short values are left-padded with zeros. Values longer than ``digits``
    keep only their last ``digits`` characters, so callers can use this to
    take the low-order digits of a number.

    :param value: anything ``str()`` can render, normally a non-negative int.
    :param digits: positive width of the result.
    :returns: the zero-padded (or truncated) string.
    """
    # zfill pads to any width; the previous fixed "000000" prefix silently
    # produced too-short results for wide fields.
    padded = str(value).zfill(digits)
    return padded[-digits:]


def convert_time(raw_time, extension='srt'):
    """Convert a tick count into a subtitle timestamp.

    ``raw_time`` is a string of decimal digits counting ten-million-per-second
    ticks: dropping the last seven digits yields whole seconds, dropping the
    last four yields milliseconds.

    :param raw_time: tick count as a string (anything ``int()`` accepts).
    :param extension: ``'ass'`` for ``HH:MM:SS.cc`` (centiseconds, dot
        separator); any other value, including the default ``'srt'``, gives
        ``HH:MM:SS,mmm`` (milliseconds, comma separator).
    :returns: the formatted timestamp.
    :raises ValueError: if ``raw_time`` is not an integer literal.
    """
    # Plain integer arithmetic instead of string slicing: slicing broke on
    # short inputs (int('') on five-digit values) and gave SRT output a
    # two-digit fraction. Zero is no longer special-cased, so it is padded
    # like every other timestamp.
    ticks = int(raw_time)
    total_seconds, sub_second_ticks = divmod(ticks, 10000000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    if extension == 'ass':
        centiseconds = sub_second_ticks // 100000
        return "{:02d}:{:02d}:{:02d}.{:02d}".format(
            hours, minutes, seconds, centiseconds)

    milliseconds = sub_second_ticks // 10000
    return "{:02d}:{:02d}:{:02d},{:03d}".format(
        hours, minutes, seconds, milliseconds)


def xml_id_display_align_before(text):
Expand All @@ -51,27 +58,33 @@ def xml_get_cursive_style_ids(text):
if re.search(style_ids_re, line)]


def xml_cleanup_spans_start(span_id_re, cursive_ids, text, extension='srt'):
    """Replace opening ``<span style=...>`` tags with italic markers.

    Every match of ``span_id_re`` in ``text`` is replaced (first occurrence
    only, in match order) by an italic-on marker when its style id is in
    ``cursive_ids``, and by an empty string otherwise.

    :param span_id_re: pattern whose first group is the full opening tag and
        whose second group is the style id.
    :param cursive_ids: collection of style ids that mean italics.
    :param text: one line of subtitle markup.
    :param extension: ``'ass'`` emits ``{\\i1}``; anything else (default
        ``'srt'``) emits ``<i>``.
    :returns: ``(text, has_cursive)`` where ``has_cursive`` holds, per span,
        the marker that was inserted (empty string for non-italic spans).
    """
    italic_open = u"{\\i1}" if extension == 'ass' else u"<i>"
    has_cursive = []
    for span in re.findall(span_id_re, text):
        opening_tag, style_id = span[0], span[1]
        has_cursive.append(italic_open if style_id in cursive_ids else u"")
        # split/join with maxsplit=1 replaces only the first occurrence.
        text = has_cursive[-1].join(text.split(opening_tag, 1))
    return text, has_cursive


def xml_cleanup_spans_end(span_end_re, text, has_cursive, extension='srt'):
    """Replace closing ``</span>`` tags with italic-off markers.

    Matches of ``span_end_re`` are paired in order with the entries of
    ``has_cursive``. A truthy entry turns the closing tag into an italic-off
    marker, a falsy one removes the tag.

    :param span_end_re: pattern matching the closing span tag(s).
    :param text: subtitle content still containing closing span tags.
    :param has_cursive: per-span markers as returned by
        ``xml_cleanup_spans_start``.
    :param extension: ``'ass'`` emits ``{\\i0}``; anything else (default
        ``'srt'``) emits ``</i>``.
    :returns: the cleaned text.
    """
    italic_close = u"{\\i0}" if extension == 'ass' else u"</i>"
    closing_tags = re.findall(span_end_re, text)
    for closing_tag, was_cursive in zip(closing_tags, has_cursive):
        replacement = italic_close if was_cursive else u""
        # Replace only the first remaining occurrence of this tag.
        text = replacement.join(text.split(closing_tag, 1))
    return text


def to_srt(text, fileName):
    """Convert subtitle ``text`` to SRT, choosing the parser by file name.

    :param text: full contents of the input subtitle file.
    :param fileName: name of the input file; a bare extension such as
        ``".xml"`` also works. Matching is case-insensitive.
    :returns: the SRT text, or ``None`` when the suffix is not recognised.
    """
    name = fileName.lower()
    # Match on the suffix: comparing the whole name to ".vtt" never matched
    # real file names, and substring tests also hit names like "a.xml.bak".
    if name.endswith((".xml", ".dfxp")):
        return xml_to_srt(text)
    if name.endswith(".vtt"):
        return vtt_to_srt(text)
    return None


Expand Down Expand Up @@ -106,8 +119,8 @@ def vtt_to_srt(text):
def xml_to_srt(text):
def append_subs(start, end, prev_content, format_time):
subs.append({
"start_time": convert_time(start) if format_time else start,
"end_time": convert_time(end) if format_time else end,
"start_time": convert_time(start, 'srt') if format_time else start,
"end_time": convert_time(end, 'srt') if format_time else end,
"content": u"\n".join(prev_content),
})

Expand All @@ -131,7 +144,7 @@ def append_subs(start, end, prev_content, format_time):
fmt_t = True
for s in sub_lines:
s, has_cursive = xml_cleanup_spans_start(
span_id_re, cursive_ids, s)
span_id_re, cursive_ids, s, 'srt')

string_region_re = r'<p(.*region="' + display_align_before + r'".*")>(.*)</p>'
s = re.sub(string_region_re, r'<p\1>{\\an8}\2</p>', s)
Expand All @@ -142,7 +155,7 @@ def append_subs(start, end, prev_content, format_time):
content = u"\n".join(content.split(br_tags.group()))

content = xml_cleanup_spans_end(
span_end_re, content, has_cursive)
span_end_re, content, has_cursive, 'srt')

prev_start = prev_time["start"]
start = re.search(start_re, s).group(1)
Expand All @@ -166,6 +179,109 @@ def append_subs(start, end, prev_content, format_time):
for s in range(len(subs)))
return u"\n".join(lines)

def _ass_clock_time(value):
    """Trim a ``HH:MM:SS[.fraction]`` clock time to two fraction digits.

    The dot separator is kept because a comma would be read as a field
    separator inside a ``Dialogue:`` line. Values that are not in
    ``H:M:S`` form and carry no fraction are returned untouched.
    """
    whole, dot, fraction = value.partition(u".")
    if not dot and whole.count(u":") != 2:
        return value
    return u"{}.{}".format(whole, (fraction + u"00")[:2])


def xml_to_ass(text, fileTitle):
    """Convert XML subtitle markup into the text of an ``.ass`` file.

    Only lines containing ``<p begin=`` are treated as cues. Consecutive cues
    with identical begin/end times are merged into one event.

    :param text: full contents of the XML subtitle file.
    :param fileTitle: input file name, passed to ``addAssheader`` for the
        script title.
    :returns: the header produced by ``addAssheader`` followed by one
        ``Dialogue:`` line per event (header only when there are no cues).
    """
    def append_subs(start, end, prev_content, format_time):
        subs.append({
            "start_time": convert_time(start, 'ass') if format_time else start,
            "end_time": convert_time(end, 'ass') if format_time else end,
            # An event must stay on one physical line, so merged cues are
            # separated by the ASS line-break tag instead of a newline.
            "content": u"\\N".join(prev_content),
        })

    display_align_before = xml_id_display_align_before(text)
    begin_re = re.compile(r"\s*<p begin=")
    sub_lines = (l for l in text.split("\n") if re.search(begin_re, l))
    subs = []
    prev_time = {"start": 0, "end": 0}
    prev_content = []
    start = end = ''
    start_re = re.compile(r'begin\="([0-9:\.]*)')
    end_re = re.compile(r'end\="([0-9:\.]*)')
    content_re = re.compile(r'">(.*)</p>')

    # Some span tags are used for italics; they are replaced by {\i1} and
    # {\i0}. All other uses of span are dropped.
    cursive_ids = xml_get_cursive_style_ids(text)
    span_id_re = re.compile(r'(<span style="([a-zA-Z0-9_.]+)">)+')
    span_end_re = re.compile(r'(</span>)+')
    br_re = re.compile(r'(<br\s*\/?>)+')
    # Cues in the "display on top" region get an {\an8} marker prepended.
    string_region_re = r'<p(.*region="' + display_align_before + r'".*")>(.*)</p>'
    fmt_t = True
    for s in sub_lines:
        s, has_cursive = xml_cleanup_spans_start(
            span_id_re, cursive_ids, s, 'ass')

        s = re.sub(string_region_re, r'<p\1>{\\an8}\2</p>', s)
        content = re.search(content_re, s).group(1)

        # Replace every run of <br> tags, whatever its exact spelling; a
        # callable is used so the backslash in \N is not parsed as an escape.
        content = br_re.sub(lambda _match: u"\\N", content)

        content = xml_cleanup_spans_end(
            span_end_re, content, has_cursive, 'ass')

        prev_start = prev_time["start"]
        start = re.search(start_re, s).group(1)
        end = re.search(end_re, s).group(1)
        if len(start.split(":")) > 1:
            # Already a clock time: skip tick conversion from now on and
            # only normalise the fraction.
            fmt_t = False
            start = _ass_clock_time(start)
            end = _ass_clock_time(end)
        if (prev_start == start and prev_time["end"] == end) or not prev_start:
            # Fix for multiple lines starting at the same time
            prev_time = {"start": start, "end": end}
            prev_content.append(content)
            continue
        append_subs(prev_time["start"], prev_time["end"], prev_content, fmt_t)
        prev_time = {"start": start, "end": end}
        prev_content = [content]

    # Flush the last pending event; skipped when no cue was found at all.
    if prev_content:
        append_subs(start, end, prev_content, fmt_t)

    dialogues = [
        u"Dialogue: 0,{},{},{},,0,0,0,,{}\n".format(
            sub["start_time"],
            sub["end_time"],
            setFont(sub["content"]),
            sub["content"].replace('{\\an8}', '').replace('{\\i0}\\N{\\i1}', '\\N'))
        for sub in subs
    ]
    return addAssheader(fileTitle) + u"".join(dialogues)

def setFont(text):
    """Choose the ASS style name for one subtitle event.

    Checks run in this order:

    * text starting with a clock time such as ``12h30`` or ``12 h 30``
      gives ``'Sign'``;
    * text carrying the ``{\\an8}`` marker gives ``'Top'``;
    * text whose visible letters are all upper-case gives ``'Sign'``;
    * anything else gives ``'Default'``.

    :param text: event content, possibly containing override tags.
    :returns: ``'Sign'``, ``'Top'`` or ``'Default'``.
    """
    # Override tags such as {\i1} contain lower-case letters and sit in front
    # of the text, so they are removed before looking at what is displayed.
    visible = re.sub(r'\{[^}]*\}', '', text).replace('&amp;', '&')
    is_hour_format = re.match(
        r'([01]?[0-9]|2[0-3])(h|\sh\s)[0-5][0-9]', visible)

    if is_hour_format:
        return 'Sign'
    # Look for the tag itself so ordinary words containing "an8" do not count.
    if '{\\an8}' in text:
        return 'Top'
    if visible.isupper():
        return 'Sign'
    return 'Default'

def addAssheader(fileTitle):

title = '.'.join(fileTitle.split('.')[:-1])

ass = '[Script Info]\n'
ass += 'Title: ' + title + '\n'
ass += 'ScriptType: v4.00+\n'
ass += 'WrapStyle: 0\n'
ass += 'PlayResX: 1920\n'
ass += 'PlayResY: 1080\n'
ass += 'YCbCr Matrix: TV.709\n'
ass += 'ScaledBorderAndShadow: yes\n'
ass += '\n'
ass += '[V4+ Styles]\n'
ass += 'Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n'
ass += 'Style: Default,Arial,60,&H00FFFFFF,&H000000FF,&H00000000,&HAA000000,-1,0,0,0,100,100,0,0,1,3.5,1.5,2,200,200,75,1\n'
ass += 'Style: Top,Arial,60,&H00FFFFFF,&H000000FF,&H00000000,&HAA000000,-1,0,0,0,100,100,0,0,1,3.5,1.5,8,200,200,75,1\n'
ass += 'Style: Sign,Arial,60,&H00FFFFFF,&H000000FF,&H00000000,&HAA000000,-1,0,0,0,100,100,0,0,1,3.5,1.5,8,200,200,75,1\n'
ass += '\n'
ass += '[Events]\n'
ass += 'Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n'
return ass

def getFileNameWithoutExtension(fileName):
    """Return ``fileName`` without its trailing subtitle extension.

    Only a ``.xml``, ``.vtt`` or ``.dfxp`` suffix at the very end of the name
    is removed, matched case-insensitively. Any other name is returned
    unchanged.

    :param fileName: input file name.
    :returns: the name with the recognised suffix stripped.
    """
    lowered = fileName.lower()
    for extension in ('.xml', '.vtt', '.dfxp'):
        # Suffix check rather than str.replace, which also removed the text
        # from the middle of a name and missed upper-case extensions.
        if lowered.endswith(extension):
            return fileName[:-len(extension)]
    return fileName

def main():
directory = "."
Expand All @@ -175,15 +291,22 @@ def main():
help=help_text.format("input", directory))
parser.add_argument("-o", "--output", type=str, default=directory,
help=help_text.format("output", directory))
parser.add_argument('-ass', '--ass', default=directory,
help=help_text.format("ass", directory), action='store_true')
a = parser.parse_args()
filenames = [fn for fn in os.listdir(a.input)
if fn[-4:].lower() in SUPPORTED_EXTENSIONS]
for fn in filenames:
with codecs.open("{}/{}".format(a.input, fn), 'rb', "utf-8") as f:
text = f.read()
with codecs.open("{}/{}.srt".format(a.output, fn), 'wb', "utf-8") as f:
f.write(to_srt(text, fn[-4:]))

if a.ass == True :
with codecs.open("{}/{}.ass".format(a.output, getFileNameWithoutExtension(fn)), 'wb', "utf-8") as f:
f.write(xml_to_ass(text, fn))
print('\nFile created: ' + getFileNameWithoutExtension(fn) + '.ass')
else:
with codecs.open("{}/{}.srt".format(a.output, getFileNameWithoutExtension(fn)), 'wb', "utf-8") as f:
f.write(to_srt(text, fn))
print('\nFile created: ' + getFileNameWithoutExtension(fn) + '.srt')

# Call main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
main()