pbs · OlteanuRares · Nov 16, 2023 · Nov 28, 2023 · Dec 5, 2023 · Jan 11, 2024
@@ -53,9 +53,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '2.2.4'
+version = '2.2.5'
 # The full version, including alpha/beta/rc tags.
-release = '2.2.4'
+release = '2.2.5'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

@@ -81,7 +81,7 @@
 import math
 import re
 import textwrap
-from collections import deque
+from collections import deque, Counter
 from copy import deepcopy
 
 from pycaption.base import (
@@ -301,21 +301,35 @@ def _translate_line(self, line):
         self.time_translator.start_at(parts[0][0])
 
         # loop through each word
-        for word in parts[0][2].split(' '):
-            # ignore empty results or invalid commands
-            word = word.strip()
-            if len(word) == 4:
-                self._translate_word(word)
-
-    def _translate_word(self, word):
-        if self._handle_double_command(word):
+        words = [word for word in map(str.strip, parts[0][2].split(' ')) if len(word) == 4]
+
+        for idx, word in enumerate(words):
+            self._translate_word(word, words, idx)
+
+    @staticmethod
+    def get_command(commands, idx):
+        """Return ``commands[idx]``, or None when idx is out of range."""
+        if -len(commands) <= idx < len(commands):
+            return commands[idx]
+        return None
+
+    @staticmethod
+    def has_doubled_pac(words):
+        """Return True if any PAC command appears more than once in words."""
+        return any(
+            _is_pac_command(word) and count > 1
+            for word, count in Counter(words).items()
+        )
+
+    def _translate_word(self, word, words, idx):
+        if self._skip_double_command(word, words, idx):
             # count frames for timing
             self.time_translator.increment_frames()
             return
         # first check if word is a command
         # TODO - check that all the positioning commands are here, or use
         # some other strategy to determine if the word is a command.
-        if word in COMMANDS or _is_pac_command(word):
+        if word in COMMANDS or _is_pac_command(word) or word in PAC_TAB_OFFSET_COMMANDS:
             self._translate_command(word)
 
         # second, check if word is a special character
@@ -332,31 +346,47 @@ def _translate_word(self, word):
         # count frames for timing only after processing a command
         self.time_translator.increment_frames()
 
-    def _handle_double_command(self, word):
+    def _skip_double_command(self, word, words, idx):
         # If the caption is to be broadcast, each of the commands are doubled
         # up for redundancy in case the signal is garbled in transmission.
         # The decoder is programmed to ignore a second command when it is the
         # same as the first.
         # Also like codes, Special Characters are always doubled up,
         # with only one member of each pair being displayed.
+        next_command = self.get_command(words, idx + 1)  # None when past the end of words
+        second_next = self.get_command(words, idx + 2)  # None when past the end of words
+
         if word in COMMANDS or _is_pac_command(word) or word in SPECIAL_CHARS or word in EXTENDED_CHARS:
-            if word == self.last_command:
+            # skip duplicates, execute the last occurrence if not a positioning command
+            if word == self.last_command and not _is_pac_command(word) and word not in EXTENDED_CHARS:
                 self.last_command = ''
                 return True
-            # Fix for the <position> <tab offset> <position> <tab offset>
-            # repetition
-            elif _is_pac_command(word) and word in self.last_command:
+            # skip consecutive positioning commands, execute the last one
+            elif _is_pac_command(word) and _is_pac_command(next_command):
                 self.last_command = ''
                 return True
+            # Fix for the <position> <tab offset> <position> <tab offset> repetition
+            # execute the last positioning command
+            elif _is_pac_command(word) and next_command in PAC_TAB_OFFSET_COMMANDS and _is_pac_command(second_next):
+                self.last_command = ''
+                return True
+            # execute offset commands only if previous command is PAC and next is not PAC
             elif word in PAC_TAB_OFFSET_COMMANDS:
-                if _is_pac_command(self.last_command):
-                    self.last_command += f" {word}"
+                if _is_pac_command(self.last_command) and not _is_pac_command(next_command):
+                    self.last_command = word
                     return False
                 else:
                     return True
+            elif word in EXTENDED_CHARS:  # extended chars are excluded from the generic duplicate check above
+                if self.has_doubled_pac(words):  # only de-duplicate when some PAC repeats within words
+                    if word == self.last_command:
+                        self.last_command = ''
+                        return True
+                else:  # no repeated PAC in words: never skip, and last_command is left untouched
+                    return False
 
-        self.last_command = word
-        return False
+            self.last_command = word  # NOTE(review): now inside the `if`; plain-text words skip this and return None -- confirm
+            return False
 
     def _translate_special_char(self, word):
         self.buffer.add_chars(SPECIAL_CHARS[word])

@@ -987,9 +987,11 @@ def _restructure_bytes_to_position_map(byte_to_pos_map):
 
 HEADER = 'Scenarist_SCC V1.0'
 
+UNHANDLED_COMMANDS = ["9120", "91ae", "912f", "91a1"]
+
 # taken from
 # http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/CC_CHARS.HTML
-INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION = {
+SUBSTITUTES_EXTENDED_CHARS_ASSOCIATION = {
     '¡': ["!", "i"],   # inverted exclamation mark
      '¤': ["C"],  # currency
      '¥': ["Y"],  # yen
@@ -1045,5 +1047,13 @@ def _restructure_bytes_to_position_map(byte_to_pos_map):
      '┌': ["+"],
      '┐': ["+"],
      '└': ["+"],
-     '┘': ["+"]
+     '┘': ["+"],
+     '*': ['#', 'a'],
+     '{': ['['],
+     '}': [']'],
+     "\\": ['/'],
+     '^': ['/'],
+     '_': ['-'],
+     '~': ['~'],
+     '|': ['/'],
 }
@@ -8,7 +8,8 @@
 )
 from .constants import (
     PAC_BYTES_TO_POSITIONING_MAP, COMMANDS, PAC_TAB_OFFSET_COMMANDS,
-    MICROSECONDS_PER_CODEWORD, INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION
+    MICROSECONDS_PER_CODEWORD, SUBSTITUTES_EXTENDED_CHARS_ASSOCIATION,
+    UNHANDLED_COMMANDS
 )
 
 PopOnCue = collections.namedtuple("PopOnCue", "buffer, start, end")
@@ -342,8 +343,8 @@ def interpret_command(self, command):
 
         :type command: str
         """
-        self._update_positioning(command)
-
+        if command not in UNHANDLED_COMMANDS:
+            self._update_positioning(command)
         text = COMMANDS.get(command, '')
 
         if 'italic' in text:
@@ -429,13 +430,7 @@ def remove_ascii_duplicate(self, accented_character):
                 self._collection[-1].text
                 )
         if is_text_node:
-            try:
-                ascii_char = [
-                    unicodedata.normalize('NFD', accented_character)
-                    .encode('ascii', 'strict').decode("utf-8")
-                ]
-            except (UnicodeEncodeError, UnicodeDecodeError):
-                ascii_char = INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION.get(accented_character)
+            ascii_char = SUBSTITUTES_EXTENDED_CHARS_ASSOCIATION.get(accented_character)
 
             if ascii_char and self._collection[-1].text[-1] in ascii_char:
                 self._collection[-1].text = self._collection[-1].text[:-1]

@@ -24,7 +24,7 @@
 
 setup(
     name='pycaption',
-    version='2.2.4',
+    version='2.2.5.dev',
     description='Closed caption converter',
     long_description=open(README_PATH).read(),
     author='Joe Norton',

@@ -61,7 +61,10 @@
     sample_scc_duplicate_tab_offset, sample_scc_duplicate_special_characters,
     sample_scc_tab_offset, sample_scc_with_unknown_commands,
     sample_scc_special_and_extended_characters,
-    sample_scc_with_line_too_long
+    sample_scc_with_consecutive_pac_commands,
+    sample_scc_with_line_too_long,
+    sample_scc_with_double_pac,
+    sample_scc_without_double_pac
 )
 from tests.fixtures.srt import (  # noqa: F401
     sample_srt, sample_srt_ascii, sample_srt_numeric, sample_srt_empty,

@@ -920,10 +920,10 @@ def sample_dfxp_from_scc_output():
    <region tts:displayAlign="before" tts:origin="40% 53%" tts:textAlign="left" xml:id="r5"/>
    <region tts:displayAlign="before" tts:origin="70% 17%" tts:textAlign="left" xml:id="r6"/>
    <region tts:displayAlign="before" tts:origin="20% 35%" tts:textAlign="left" xml:id="r7"/>
-   <region tts:displayAlign="before" tts:origin="20% 83%" tts:textAlign="left" xml:id="r8"/>
+   <region tts:displayAlign="before" tts:origin="25% 83%" tts:textAlign="left" xml:id="r8"/>
    <region tts:displayAlign="before" tts:origin="70% 11%" tts:textAlign="left" xml:id="r9"/>
    <region tts:displayAlign="before" tts:origin="40% 41%" tts:textAlign="left" xml:id="r10"/>
-   <region tts:displayAlign="before" tts:origin="20% 71%" tts:textAlign="left" xml:id="r11"/>
+   <region tts:displayAlign="before" tts:origin="25% 71%" tts:textAlign="left" xml:id="r11"/>
   </layout>
  </head>
  <body>

@@ -24,6 +24,33 @@ def sample_scc_created_dfxp_with_wrongly_closing_spans():
 """
 
 
+@pytest.fixture(scope="session")
+def sample_scc_with_consecutive_pac_commands():
+    return """\
+Scenarist_SCC V1.0
+
+00:00:00;15	942c
+
+00:11:45;10	9420 94d0 94ce 5b20 cec1 5252 c154 4f52 205d 9470 946e cd4f cecb 45d9 d320 4c4f d645 2054 c849 cec7 d320 54c8 c154 2046 4cd9 ae80 942c 8080 8080 942f
+
+00:11:47;28	9420 9454 9723 d9c1 d9a1 94f4 9723 5b20 c84f 4f54 49ce c720 5d80 942c 8080 8080 942f
+
+00:11:50;08	9420 94d0 94ce 45d3 d045 4349 c14c 4cd9 2049 4620 54c8 45d9 a752 4520 54c8 4520 4fce 45d3 9470 946e 57c8 4f20 c745 5420 544f 2046 4cd9 2054 c845 cdae 942c 8080 8080 942f
+
+00:11:54;06	942c
+
+00:23:00;13	9420 1370 136e 5b20 43c8 494c c420 5d80 94d0 94ce c745 4f52 c745 20cd c1c4 4520 c120 cdc1 43c8 49ce 4580 9470 946e 464f 5220 c84f 5749 4520 544f 942c 8080 8080 942f
+
+00:23:02;04	9420 91d0 91ce 544f 2046 49ce c420 43d5 5249 4fd5 d320 c745 4f52 c745 9170 916e c1ce c420 c849 d320 4652 4945 cec4 d380 92d0 92ce 45d6 4552 d920 c4c1 d920 4fce 4c49 ce45 2c80 942c 8080 8080 942f
+
+00:23:05;00	9420 9152 91ae d357 49ce c720 c2d9 20d0 c2d3 cb49 c4d3 ae4f 52c7 9170 916e 544f 20d0 4cc1 d920 46d5 ce20 c7c1 cd45 d320 c1ce c420 57c1 5443 c880 92d0 9723 91ae d94f d552 2046 c1d6 4f52 4954 4520 d649 c445 4fd3 ae80 942c 8080 8080 942f
+
+
+00:23:05;00	9420 9152 91ae d357 49ce c720 c2d9 20d0 c2d3 cb49 c4d3 ae4f 52c7 9170 916e 544f 20d0 4cc1 d920 46d5 ce20 c7c1 cd45 d320 c1ce c420 57c1 5443 c880 92d0 9723 91ae d94f d552 2046 c1d6 4f52 4954 4520 d649 c445 4fd3 ae80 942c 8080 8080 942f
+
+"""
+
+
 @pytest.fixture(scope="session")
 def scc_that_generates_webvtt_with_proper_newlines():
     return """\
@@ -435,11 +462,41 @@ def sample_scc_with_line_too_long():
 
 00:00:00;03	942c
 
-00:00:01;45	9420 91f4 cb45 4c4c d920 4ac1 cd45 d3ba 20c8 eff7 9254 f468 e520 7368 eff7 2073 f461 f2f4 e564 942c 8080 8080 942f
+00:00:01;45	9420 91f4 cb45 4c4c d920 4ac1 cd45 d3ba 20c8 eff7 9254 f468 e520 7368 eff7d3ba 20c8 eff7 9254 f468 e520 7368 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 eff7 2073 f461 f2f4 e564 942c 8080 8080 942f
 
 00:00:02;55	9420 91e0 9723 f761 7320 4361 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 ec20 c4e5 6ee9 73ef 6e2c 2061 20e6 f2e9 e56e 6480 9240 9723 efe6 20ef 75f2 732c 20f7 6173 2064 efe9 6e67 206d 7920 43c4 73ae 942c 8080 8080 942f
 
 00:00:06;57	9420 94e0 c16e 6420 68e5 2073 61e9 642c 2049 20e3 616e 2064 ef20 6120 54d6 2073 68ef f7ae 942c 8080 8080 942f
 
 00:00:08;58	9420 9452 4920 ea75 73f4 20f7 616e f4e5 6420 ef6e e520 7368 eff7 2c80 94f2 ea75 73f4 20f4 ef20 6861 76e5 2061 7320 6120 ece9 f4f4 ece5 942c 8080 8080 942f
 """
+
+
+@pytest.fixture(scope="session")
+def sample_scc_with_double_pac():
+    return """\
+Scenarist_SCC V1.0
+
+00:21:18;06    9420 94ae 94d0 97a1 2080 97ad e96e 20f4 68e5 2070 f2ef e3e5 7373 2c20 4920 e56e 64e5 6420 942c 7570 94f2 97a1 2080 97ad e96e 20f4 68e9 7320 f768 e5e5 ece3 6861 e9f2 2c80 942f
+
+00:21:21;16    9420 94ae 94d0 9723 2080 97ad 73ef 20e9 f420 62e5 e361 6de5 2068 61f2 6420 ea75 73f4 9470 2080 97ad f4ef 2067 942c e5f4 2061 f2ef 756e 6420 f468 e520 f768 e5e5 ece3 6861 e9f2 942f
+
+00:21:24;01    9420 94ae 94d0 2080 97ad 62e5 e361 7573 e520 68e5 2068 6164 2061 ecec 20f4 68e9 7320 2080 9140 9140 92a8 92a8 92a8 92a8 94f2 2080 97ad 61ec ec20 ef76 e5f2 20f4 942c 68e5 2070 ec61 e3e5 2073 ef80 942f
+
+00:21:28;05    9420 94ae 9470 2080 97ad 94a1 f768 e56e 20e9 7320 7370 61e3 e520 942c 62e5 e361 6de5 2061 7661 e9ec 6162 ece5 942f
+"""
+
+
+@pytest.fixture(scope="session")
+def sample_scc_without_double_pac():
+    return """\
+Scenarist_SCC V1.0
+
+00:21:18;06    9420 94ae 94d0 97a1 2080 97ad e96e 20f4 68e5 2070 f2ef e3e5 7373 2c20 4920 e56e 64e5 6420 942c 7570 94f2 97a1 2080 97ad e96e 20f4 68e9 7320 f768 e5e5 ece3 6861 e9f2 2c80 942f
+
+00:21:21;16    9420 94ae 94d0 9723 2080 97ad 73ef 20e9 f420 62e5 e361 6de5 2068 61f2 6420 ea75 73f4 9470 2080 97ad f4ef 2067 942c e5f4 2061 f2ef 756e 6420 f468 e520 f768 e5e5 ece3 6861 e9f2 942f
+
+00:21:24;01    9420 94ae 94d0 2080 97ad 62e5 e361 7573 e520 68e5 2068 6164 2061 ecec 20f4 68e9 7320 2080 92a8 92a8 92a8 92a8 94f2 2080 97ad 61ec ec20 ef76 e5f2 20f4 942c 68e5 2070 ec61 e3e5 2073 ef80 942f
+
+00:21:28;05    9420 94ae 9470 2080 97ad 94a1 f768 e56e 20e9 7320 7370 61e3 e520 942c 62e5 e361 6de5 2061 7661 e9ec 6162 ece5 942f
+"""