merged main and change the skipping logic

pbs · Jan 11, 2024 · 5b990fc · ana-nichifor · Jan 11, 2024 · 5b990fc
2 parents 7d758a2 + f4cc953
commit 5b990fc
Show file tree

Hide file tree

Showing 14 changed files with 202 additions and 1,318 deletions.
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -20,7 +20,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["py37", "py38", "py39", "py310", "py311"]
+        python-version: ["py38", "py39", "py310", "py311", "py312"]
 
     steps:
       - uses: actions/checkout@v2

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,8 +1,8 @@
 version: '3.8'
 
 services:
-  test_py37:
-    image: python:3.7-slim-bullseye
+  test_py38:
+    image: python:3.8-slim-bullseye
     command: sh -c "
       cd pycaption;
       pip install --upgrade pip;
@@ -13,8 +13,8 @@ services:
     volumes:
       - .:/pycaption
 
-  test_py38:
-    image: python:3.8-slim-bullseye
+  test_py39:
+    image: python:3.9-slim-bullseye
     command: sh -c "
       cd pycaption;
       pip install --upgrade pip;
@@ -25,8 +25,8 @@ services:
     volumes:
       - .:/pycaption
 
-  test_py39:
-    image: python:3.9-slim-bullseye
+  test_py310:
+    image: python:3.10-slim-bullseye
     command: sh -c "
       cd pycaption;
       pip install --upgrade pip;
@@ -37,8 +37,8 @@ services:
     volumes:
       - .:/pycaption
 
-  test_py310:
-    image: python:3.10-slim-bullseye
+  test_py311:
+    image: python:3.11-slim-bullseye
     command: sh -c "
       cd pycaption;
       pip install --upgrade pip;
@@ -49,8 +49,8 @@ services:
     volumes:
       - .:/pycaption
 
-  test_py311:
-    image: python:3.11-slim-bullseye
+  test_py312:
+    image: python:3.12-slim-bullseye
     command: sh -c "
       cd pycaption;
       pip install --upgrade pip;

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -1,5 +1,14 @@
 Changelog
 ---------
+2.2.2
+^^^^^
+- Remove support for Python 3.6 & 3.7
+- Restrict SCC source files to 31 characters per line (lines of 32 or more characters raise CaptionLineLengthError)
+
+2.2.1
+^^^^^
+- Ignore the substitute character that comes before the extended character in SCC files.
+
 2.2.0
 ^^^^^
 - Added support for Python 3.11

diff --git a/examples/example.scc b/examples/example.scc
diff --git a/pycaption/base.py b/pycaption/base.py
@@ -212,7 +212,7 @@ def __repr__(self):
             f'{self.format_start()} --> {self.format_end()}\n{self.get_text()}'
         )
 
-    def get_text(self):
+    def get_text_nodes(self):
         """
         Get the text of the caption.
         """
@@ -224,7 +224,10 @@ def get_text_for_node(node):
                 return '\n'
             return ''
 
-        text_nodes = [get_text_for_node(node) for node in self.nodes]
+        return [get_text_for_node(node) for node in self.nodes]
+
+    def get_text(self):
+        text_nodes = self.get_text_nodes()
         return ''.join(text_nodes).strip()
 
     def _format_timestamp(self, microseconds, msec_separator=None):

diff --git a/pycaption/exceptions.py b/pycaption/exceptions.py
@@ -35,3 +35,9 @@ class RelativizationError(Exception):
 
 class InvalidInputError(RuntimeError):
     """Error raised when the input is invalid (i.e. a unicode string)"""
+
+
+class CaptionLineLengthError(CaptionReadError):
+    """
+    Error raised when a Caption has a line of 32 or more characters.
+    """
diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
@@ -88,7 +88,7 @@
     BaseReader, BaseWriter, CaptionSet, CaptionNode,
 )
 from pycaption.exceptions import CaptionReadNoCaptions, InvalidInputError, \
-    CaptionReadTimingError
+    CaptionReadTimingError, CaptionLineLengthError
 from .constants import (
     HEADER, COMMANDS, SPECIAL_CHARS, EXTENDED_CHARS, CHARACTERS,
     MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE,
@@ -232,6 +232,22 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
         captions = CaptionSet({lang: self.caption_stash.get_all()})
 
         # check captions for incorrect lengths
+        lines = []
+        for caption in self.caption_stash._collection:
+            caption_text = "".join(caption.to_real_caption().get_text_nodes())
+            lines.extend(caption_text.split("\n"))
+        lines_too_long = [line for line in lines if len(line) >= 32]
+
+        if bool(lines_too_long):
+            msg = ""
+            for line in lines_too_long:
+                msg += line + f" - Length { len(line)}" + "\n"
+            raise CaptionLineLengthError(
+                f"32 character limit for caption cue in scc file.\n"
+                f"Lines longer than 32:\n"
+                f"{msg}"
+            )
+
         for cap in captions.get_captions(lang):
             # if there's an end time on a caption and the difference is
             # less than .05s kill it (this is likely caused by a standalone
@@ -285,21 +301,27 @@ def _translate_line(self, line):
         self.time_translator.start_at(parts[0][0])
 
         # loop through each word
-        for word in parts[0][2].split(' '):
-            # ignore empty results or invalid commands
-            word = word.strip()
-            if len(word) == 4:
-                self._translate_word(word)
-
-    def _translate_word(self, word):
-        if self._skip_double_command(word):
+        words = [word.strip() for word in parts[0][2].split(' ') if len(word) == 4]
+
+        for idx, word in enumerate(words):
+            self._translate_word(word, words, idx)
+
+    @staticmethod
+    def get_command(commands, idx):
+        try:
+            return commands[idx]
+        except IndexError:
+            return None
+
+    def _translate_word(self, word, words, idx):
+        if self._skip_double_command(word, words, idx):
             # count frames for timing
             self.time_translator.increment_frames()
             return
         # first check if word is a command
         # TODO - check that all the positioning commands are here, or use
         # some other strategy to determine if the word is a command.
-        if word in COMMANDS or _is_pac_command(word):
+        if word in COMMANDS or _is_pac_command(word) or word in PAC_TAB_OFFSET_COMMANDS:
             self._translate_command(word)
 
         # second, check if word is a special character
@@ -316,32 +338,35 @@ def _translate_word(self, word):
         # count frames for timing only after processing a command
         self.time_translator.increment_frames()
 
-    def _skip_double_command(self, word):
+    def _skip_double_command(self, word, words, idx):
         # If the caption is to be broadcast, each of the commands are doubled
         # up for redundancy in case the signal is garbled in transmission.
         # The decoder is programmed to ignore a second command when it is the
         # same as the first.
         # Also like codes, Special Characters are always doubled up,
         # with only one member of each pair being displayed.
+        next_command = self.get_command(words, idx + 1)
+        second_next = self.get_command(words, idx + 2)
         if word in COMMANDS or _is_pac_command(word) or word in SPECIAL_CHARS:
-            if word == self.last_command:
+            # skip duplicates, execute the last occurrence
+            if word == next_command:
                 self.last_command = ''
                 return True
-            elif _is_pac_command(word) and _is_pac_command(self.last_command):
+            # Fix for the <position> <position> to execute only the last one
+            elif _is_pac_command(word) and _is_pac_command(next_command):
                 self.last_command = ''
                 return True
             # Fix for the <position> <tab offset> <position> <tab offset>
             # repetition
-            elif _is_pac_command(word) and word in self.last_command:
+            elif _is_pac_command(word) and next_command in PAC_TAB_OFFSET_COMMANDS and _is_pac_command(second_next):
                 self.last_command = ''
                 return True
+            # execute offset commands only if previous command is PAC and next is not PAC
             elif word in PAC_TAB_OFFSET_COMMANDS:
-                if _is_pac_command(self.last_command):
-                    self.last_command += f" {word}"
+                if _is_pac_command(self.last_command) and not _is_pac_command(next_command):
                     return False
                 else:
                     return True
-
         self.last_command = word
         return False
 
@@ -529,13 +554,7 @@ def write(self, caption_set):
     # Wrap lines at 32 chars
     @staticmethod
     def _layout_line(caption):
-        def caption_node_to_text(caption_node):
-            if caption_node.type_ == CaptionNode.TEXT:
-                return caption_node.content
-            elif caption_node.type_ == CaptionNode.BREAK:
-                return '\n'
-        caption_text = ''.join(
-            [caption_node_to_text(node) for node in caption.nodes])
+        caption_text = "".join(caption.get_text_nodes())
         inner_lines = caption_text.split('\n')
         inner_lines_laid_out = [textwrap.fill(x, 32) for x in inner_lines]
         return '\n'.join(inner_lines_laid_out)

diff --git a/pycaption/scc/constants.py b/pycaption/scc/constants.py
@@ -1,4 +1,5 @@
 from itertools import product
+from collections import defaultdict
 
 COMMANDS = {
     '9420': '',
@@ -987,3 +988,64 @@ def _restructure_bytes_to_position_map(byte_to_pos_map):
 HEADER = 'Scenarist_SCC V1.0'
 
 UNHANDLED_COMMANDS = ["9120", "91ae", "912f", "91a1"]
+
+# Fallback ASCII look-alikes for extended characters; table taken from
+# http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/CC_CHARS.HTML
+INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION = {
+    '¡': "!",  # inverted exclamation mark
+    '¤': "C",  # currency
+    '¥': "Y",  # yen
+    '¦': "-",  # broken bar
+    '©': "c",  # copyright sign
+    '«': '"',  # left pointing double angle quotation mark
+    '»': '"',  # right pointing double angle quotation mark
+    'À': "A",
+    'Á': "A",
+    'Â': "A",
+    'Ã': "A",
+    'Ä': "A",
+    'Å': "A",
+    'Ç': "C",
+    'È': "E",
+    'É': "E",
+    'Ê': "E",
+    'Ë': "E",
+    'Ì': "I",
+    'Í': "I",
+    'Î': "I",
+    'Ï': "I",
+    'Ò': "O",
+    'Ó': "O",
+    'Ô': "O",
+    'Õ': "O",
+    'Ö': "O",
+    'Ø': "O",
+    'Ù': "U",
+    'Ú': "U",
+    'Û': "U",
+    'Ü': "U",
+    'ß': "s",
+    'ã': "a",
+    'ä': "a",
+    'å': "a",
+    'ë': "e",
+    'ì': "i",
+    'ï': "i",
+    'ò': "o",
+    'õ': "o",
+    'ö': "o",
+    'ø': "o",
+    'ù': "u",
+    'ü': "u",
+    '—': "-",  # em dash
+    '‘': "'",
+    '’': "'",
+    '“': '"',
+    '”': '"',
+    '•': ".",
+    '℠': "s",
+    '┌': "+",
+    '┐': "+",
+    '└': "+",
+    '┘': "+",
+}
diff --git a/pycaption/scc/specialized_collections.py b/pycaption/scc/specialized_collections.py
@@ -8,7 +8,8 @@
 )
 from .constants import (
     PAC_BYTES_TO_POSITIONING_MAP, COMMANDS, PAC_TAB_OFFSET_COMMANDS,
-    MICROSECONDS_PER_CODEWORD, UNHANDLED_COMMANDS
+    MICROSECONDS_PER_CODEWORD, UNHANDLED_COMMANDS,
+    INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION
 )
 
 PopOnCue = collections.namedtuple("PopOnCue", "buffer, start, end")
@@ -364,13 +365,11 @@ def _update_positioning(self, command):
 
         :type command: str
         """
-
         if command in PAC_TAB_OFFSET_COMMANDS:
             tab_offset = PAC_TAB_OFFSET_COMMANDS[command]
             prev_positioning = self._position_tracer.default
             positioning = (prev_positioning[0],
                            prev_positioning[1] + tab_offset)
-
         else:
             first, second = command[:2], command[2:]
 
@@ -424,10 +423,20 @@ def remove_ascii_duplicate(self, accented_character):
 
         :type accented_character: str
         """
-        if self._collection and self._collection[-1].is_text_node() and \
-                self._collection[-1].text:
-            ascii_char = unicodedata.normalize('NFD', accented_character)\
-                .encode('ascii', 'ignore').decode("utf-8")
+        is_text_node = (
+                self._collection and
+                self._collection[-1].is_text_node() and
+                self._collection[-1].text
+                )
+        if is_text_node:
+            try:
+                ascii_char = unicodedata.normalize('NFD', accented_character) \
+                    .encode('ascii', 'strict').decode("utf-8")
+            except (UnicodeEncodeError, UnicodeDecodeError):
+                ascii_char = INCONVERTIBLE_TO_ASCII_EXTENDED_CHARS_ASSOCIATION[
+                    accented_character
+                ]
+
             if ascii_char and self._collection[-1].text[-1] == ascii_char:
                 self._collection[-1].text = self._collection[-1].text[:-1]
 

diff --git a/run_tests.sh b/run_tests.sh
@@ -2,11 +2,11 @@
 
 DOCKER_CMD="docker-compose -p pycaption"
 
-SERVICE="test_py311"
+SERVICE="test_py312"
 
 if [ "$@" ]; then
-  if [ "$1" == "test_py37" ] || [ "$1" == "test_py38" ]  || \
-  [ "$1" == "test_py39"  ] || [ "$1" == "test_py310" ] || [ "$1" == "test_py311" ]; then
+  if [ "$1" == "test_py38" ]  || [ "$1" == "test_py39"  ] ||
+  [ "$1" == "test_py310" ] || [ "$1" == "test_py311" ] || [ "$1" == "test_py312" ]; then
     SERVICE="$1"
   fi
 fi