Skip to content

Commit

Permalink
[Feature] OCR: output support newline (#58)
Browse files Browse the repository at this point in the history
* Update README.md

typo

* Update ocr.py

* Update optional.txt

* Update README.md
  • Loading branch information
tpoisonooo authored Dec 18, 2023
1 parent a4a381e commit 96fbb97
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 4 deletions.
3 changes: 2 additions & 1 deletion agentlego/tools/ocr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ wget https://raw.githubusercontent.com/open-mmlab/mmocr/main/demo/demo_kie.jpeg
from agentlego.apis import load_tool

# load tool
tool = load_tool('OCR', device='cuda', lang='en', x_ths=3.)
tool = load_tool('OCR', device='cuda', lang='en', x_ths=3., line_group_tolerance=30)

# apply tool
res = tool('demo_kie.jpeg')
```
For bilingual Chinese and English OCR, set `lang` to `['en', 'ch_sim']`. See [the EasyOCR documentation](https://www.jaided.ai/easyocr/) for the full list of supported language codes.

**With Lagent**

Expand Down
35 changes: 32 additions & 3 deletions agentlego/tools/ocr/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,21 @@ def __init__(self,
parser: Callable = DefaultParser,
lang: Union[str, Sequence[str]] = 'en',
device: Union[bool, str] = True,
line_group_tolerance = -1,
**read_args):
super().__init__(toolmeta=toolmeta, parser=parser)
if isinstance(lang, str):
lang = [lang]
self.lang = list(lang)
read_args.setdefault('decoder', 'beamsearch')
read_args.setdefault('paragraph', True)
self.read_args = read_args
self.device = device
self.line_group_tolerance = line_group_tolerance
read_args.setdefault('decoder', 'beamsearch')

if line_group_tolerance >= 0:
read_args.setdefault('paragraph', False)
else:
read_args.setdefault('paragraph', True)

def setup(self):
import easyocr
Expand All @@ -53,6 +59,29 @@ def setup(self):
def apply(self, image: ImageIO) -> str:
    """Run OCR on ``image`` and return the recognized text joined by newlines.

    When ``self.line_group_tolerance`` is negative, the reader is called
    with ``detail=0`` and its text entries are joined as-is.

    When ``self.line_group_tolerance`` is non-negative, detections are
    grouped into visual text lines: they are ordered by the y-coordinate of
    their first bounding-box point, and a detection joins the current line
    when that y-coordinate is within ``line_group_tolerance`` of the line's
    first detection. Each line is then ordered by the x-coordinate of the
    same point and its texts are joined with single spaces.

    Args:
        image: Input image; converted with ``image.to_array()`` before
            being handed to the reader.

    Returns:
        The recognized text, one entry (or grouped line) per output line.
        An empty string when nothing is detected.
    """
    image = image.to_array()

    if self.line_group_tolerance < 0:
        # Grouping disabled: a single text-only pass is all that is needed.
        ocr_results = self._reader.readtext(
            image, detail=0, **self.read_args)
        return '\n'.join(ocr_results)

    # Detailed output is required here: each item carries its bounding box
    # at index 0 and its text at index 1.
    # NOTE(review): assumes box[0] is the top-left corner, as in EasyOCR's
    # documented output -- confirm if another reader is plugged in.
    results = self._reader.readtext(image, **self.read_args)
    if not results:
        # No detections: avoid indexing into an empty list below.
        return ''

    # Top-to-bottom order so that vertically close detections are adjacent.
    results = sorted(results, key=lambda item: item[0][0][1])

    lines = []
    line = [results[0]]
    for result in results[1:]:
        # Compare against the first detection of the current line so the
        # tolerance window does not drift as the line grows.
        anchor_y = line[0][0][0][1]
        if abs(result[0][0][1] - anchor_y) <= self.line_group_tolerance:
            line.append(result)
        else:
            lines.append(line)
            line = [result]
    lines.append(line)

    # Within each line, read left to right and join the texts with spaces.
    ocr_results = [
        ' '.join(
            item[1]
            for item in sorted(line, key=lambda item: item[0][0][0]))
        for line in lines
    ]
    return '\n'.join(ocr_results)
1 change: 1 addition & 0 deletions requirements/optional.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ torchaudio
torchvision
typing-extensions
uvicorn[standard]
easyocr

0 comments on commit 96fbb97

Please sign in to comment.