[Feature] OCR: output support newline (#58)

* Update README.md typo * Update ocr.py * Update optional.txt * Update README.md
InternLM · Dec 18, 2023 · 96fbb97 · 96fbb97
1 parent a4a381e
commit 96fbb97
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 4 deletions.
diff --git a/agentlego/tools/ocr/README.md b/agentlego/tools/ocr/README.md
@@ -14,11 +14,12 @@ wget https://raw.githubusercontent.com/open-mmlab/mmocr/main/demo/demo_kie.jpeg
 from agentlego.apis import load_tool
 
 # load tool
-tool = load_tool('OCR', device='cuda', lang='en', x_ths=3.)
+tool = load_tool('OCR', device='cuda', lang='en', x_ths=3., line_group_tolerance=30)
 
 # apply tool
 res = tool('demo_kie.jpeg')
 ```
+For bilingual Chinese and English OCR, set `lang` to `['en', 'ch_sim']`. See the [EasyOCR website](https://www.jaided.ai/easyocr/) for the full list of supported language codes.
 
 **With Lagent**
 

diff --git a/agentlego/tools/ocr/ocr.py b/agentlego/tools/ocr/ocr.py
@@ -35,15 +35,21 @@ def __init__(self,
                  parser: Callable = DefaultParser,
                  lang: Union[str, Sequence[str]] = 'en',
                  device: Union[bool, str] = True,
+                 line_group_tolerance = -1,
                  **read_args):
         super().__init__(toolmeta=toolmeta, parser=parser)
         if isinstance(lang, str):
             lang = [lang]
         self.lang = list(lang)
-        read_args.setdefault('decoder', 'beamsearch')
-        read_args.setdefault('paragraph', True)
         self.read_args = read_args
         self.device = device
+        self.line_group_tolerance = line_group_tolerance
+        read_args.setdefault('decoder', 'beamsearch')
+
+        if line_group_tolerance >= 0:
+            read_args.setdefault('paragraph', False)
+        else:
+            read_args.setdefault('paragraph', True)
 
     def setup(self):
         import easyocr
@@ -53,6 +59,29 @@ def setup(self):
     def apply(self, image: ImageIO) -> str:
+        """Run OCR on ``image`` and return the texts joined by newlines."""
 
         image = image.to_array()
-        ocr_results = self._reader.readtext(image, detail=0, **self.read_args)
+        tolerance = self.line_group_tolerance
+        if tolerance >= 0:
+            # Keep the boxes (no ``detail=0``): they are needed to build rows.
+            results = self._reader.readtext(image, **self.read_args)
+            # Sort by the y of each box's first point so rows end up adjacent.
+            results.sort(key=lambda x: x[0][0][1])
+            # Rows start empty: no detections gives '' rather than IndexError.
+            # A box joins the current row if it is near that row's first box.
+            lines = []
+            for result in results:
+                top = result[0][0][1]
+                if lines and abs(top - lines[-1][0][0][0][1]) <= tolerance:
+                    lines[-1].append(result)
+                else:
+                    lines.append([result])
+            ocr_results = []
+            for line in lines:
+                # Within a row, order by the first point's x and join texts.
+                sorted_line = sorted(line, key=lambda x: x[0][0][0])
+                text_line = ' '.join(item[1] for item in sorted_line)
+                ocr_results.append(text_line)
+        else:
+            ocr_results = self._reader.readtext(image, detail=0, **self.read_args)
         outputs = '\n'.join(ocr_results)
         return outputs
diff --git a/requirements/optional.txt b/requirements/optional.txt
@@ -7,3 +7,4 @@ torchaudio
 torchvision
 typing-extensions
 uvicorn[standard]
+easyocr