Skip to content

Commit

Permalink
Store cmaps as JSON
Browse files: browse the repository at this point in the history
  • Loading branch information
pietermarsman committed Jan 27, 2024
1 parent 45e8443 commit e266243
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 15 deletions.
12 changes: 6 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@ MKDIR=mkdir
CONV_CMAP=$(PYTHON) tools/conv_cmap.py
CMAPSRC=cmaprsrc
CMAPDST=pdfminer/cmap
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz $(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz \
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz
cmap: $(CMAPDST)/to-unicode-Adobe-CNS1.json.gz $(CMAPDST)/to-unicode-Adobe-GB1.json.gz \
$(CMAPDST)/to-unicode-Adobe-Japan1.json.gz $(CMAPDST)/to-unicode-Adobe-Korea1.json.gz
cmap_clean:
-$(RM) -r $(CMAPDST)
$(CMAPDST):
$(MKDIR) $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-CNS1.pickle.gz: $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-CNS1.json.gz: $(CMAPDST)
$(CONV_CMAP) -c B5=cp950 -c UniCNS-UTF8=utf-8 \
$(CMAPDST) Adobe-CNS1 $(CMAPSRC)/cid2code_Adobe_CNS1.txt
$(CMAPDST)/to-unicode-Adobe-GB1.pickle.gz: $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-GB1.json.gz: $(CMAPDST)
$(CONV_CMAP) -c GBK-EUC=cp936 -c UniGB-UTF8=utf-8 \
$(CMAPDST) Adobe-GB1 $(CMAPSRC)/cid2code_Adobe_GB1.txt
$(CMAPDST)/to-unicode-Adobe-Japan1.pickle.gz: $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-Japan1.json.gz: $(CMAPDST)
$(CONV_CMAP) -c RKSJ=cp932 -c EUC=euc-jp -c UniJIS-UTF8=utf-8 \
$(CMAPDST) Adobe-Japan1 $(CMAPSRC)/cid2code_Adobe_Japan1.txt
$(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)
$(CMAPDST)/to-unicode-Adobe-Korea1.json.gz: $(CMAPDST)
$(CONV_CMAP) -c KSC-EUC=euc-kr -c KSC-Johab=johab -c KSCms-UHC=cp949 -c UniKS-UTF8=utf-8 \
$(CMAPDST) Adobe-Korea1 $(CMAPSRC)/cid2code_Adobe_Korea1.txt
16 changes: 7 additions & 9 deletions tools/conv_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import argparse
import codecs
import gzip
import pickle as pickle
import json
import sys
from pathlib import Path
from typing import List, Any
Expand Down Expand Up @@ -140,16 +140,14 @@ def dump_cmap(self, fp, enc):
IS_VERTICAL=self.is_vertical.get(enc, False),
CODE2CID=self.code2cid.get(enc),
)
fp.write(pickle.dumps(data, 2))
return
json.dump(data, fp)

def dump_unicodemap(self, fp):
data = dict(
CID2UNICHR_H=self.cid2unichr_h,
CID2UNICHR_V=self.cid2unichr_v,
)
fp.write(pickle.dumps(data, 2))
return
json.dump(data, fp)


def create_parser() -> argparse.ArgumentParser:
Expand Down Expand Up @@ -195,14 +193,14 @@ def main(argv: List[Any]):

outdir.mkdir(exist_ok=True)
for enc in converter.get_encs():
path = outdir / f"{enc}.pickle.gz"
path = outdir / f"{enc}.json.gz"
print(f"writing: {path}...")
with gzip.open(path, "wb") as fp:
with gzip.open(path, "wt") as fp:
converter.dump_cmap(fp, enc)

path = outdir / f"to-unicode-{regname}.pickle.gz"
path = outdir / f"to-unicode-{regname}.json.gz"
print(f"writing: {path}...")
with gzip.open(path, "wb") as fp:
with gzip.open(path, "wt") as fp:
converter.dump_unicodemap(fp)


Expand Down

0 comments on commit e266243

Please sign in to comment.