-
Notifications
You must be signed in to change notification settings - Fork 1
/
tesseract_Terminal.py
90 lines (72 loc) · 3.16 KB
/
tesseract_Terminal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
r'''
inputFile needs to be an image.
inputFile format: PDFname-count (if you want a different naming convention, change the regex in pdf_merge too).
outputFile needs to be a PDF.
Add the "your path\poppler-0.68.0\bin" location to the PATH environment variable.
'''
import os
import re
import csv
from PyPDF2 import PdfFileReader, PdfFileWriter, PdfFileMerger
import pandas as pd
# Folder containing the page images. perform_ocr walks this folder and
# pdf_merge writes the merged PDFs into its "OCR" sub-folder.
# Example: C:\Users\lakshay.saini\Desktop\Med Legal\PDF\Images
path = input("Enter folder path containing images: ")

# df  : one row per parent PDF to build (column "File Name").
# df1 : one row per single-page PDF (columns "File Name" and "Root").
# Raw strings keep the Windows backslashes literal; sequences such as "\l",
# "\D" or "\T" in a normal string are invalid escapes and trigger a
# SyntaxWarning on recent Python versions.
df = pd.read_csv(r"C:\Users\lakshay.saini\Documents\TEST\PDF_Name_Listing.csv")
df1 = pd.read_csv(r"C:\Users\lakshay.saini\Documents\TEST\PDF_Page_Listing.csv")
def extract_list(image_dir=r"C:\Users\lakshay.saini\Documents\image",
                 listing_path=r"C:\Users\lakshay.saini\Desktop\LIVE\Listing.csv"):
    ''' Write a CSV listing of every .jpeg file found under image_dir.

    The CSV has a single "File Name" header row followed by one row per
    image, holding the bare file name (no directory part). The extension
    check is case-insensitive.

    Args:
        image_dir: folder that is walked recursively for images.
        listing_path: CSV file to create; an existing file is overwritten.
    '''
    # The context manager guarantees the listing is flushed and closed,
    # even if walking the directory tree raises.
    with open(listing_path, "w", newline="") as listing:
        writer = csv.writer(listing, delimiter=",")
        writer.writerow(["File Name"])
        for _root, subdirs, files in os.walk(image_dir, topdown=True):
            # os.walk yields entries in arbitrary order. Sorting the
            # sub-directories in place (honoured because topdown=True) and
            # the file names makes the listing alphabetical and repeatable.
            subdirs.sort()
            for name in sorted(files):
                if name.lower().endswith(".jpeg"):
                    writer.writerow([name])
def perform_ocr(oem_mode=None, psm_mode=None):
    ''' OCR every .jpeg under the module-level path into a searchable PDF.

    For each image the tesseract command-line tool is run with the "pdf"
    output format; the PDF is written next to the image, using the image
    name without its extension as the output base.

    Args:
        oem_mode: value passed to tesseract's --oem option; the option is
            omitted when None.
        psm_mode: value passed to tesseract's --psm option; defaults to 6
            when None, the value that was previously hard-coded.

    Raises:
        FileNotFoundError: if the tesseract executable cannot be found.
    '''
    # Local import so this function stays self-contained.
    import subprocess

    options = ["--psm", str(6 if psm_mode is None else psm_mode)]
    if oem_mode is not None:
        options += ["--oem", str(oem_mode)]

    for root, _subdirs, files in os.walk(path, topdown=True):
        for image_name in files:
            stem, extension = os.path.splitext(image_name)
            # splitext keeps the check and the stripping consistent: a
            # case-sensitive replace(".jpeg", "") would leave "x.JPEG"
            # untouched and produce "x.JPEG.pdf".
            if extension.lower() != ".jpeg":
                continue
            input_file = os.path.join(root, image_name)
            output_base = os.path.join(root, stem)
            # An argument list (no shell) avoids quoting problems and shell
            # injection through file names containing quotes or
            # metacharacters.
            command = ["tesseract", input_file, output_base] + options + ["pdf"]
            result = subprocess.run(command, check=False)
            if result.returncode != 0:
                # Keep going with the remaining images, but do not fail
                # silently.
                print("tesseract failed (exit code " + str(result.returncode)
                      + ") for " + input_file)
def pdf_merge():
    ''' Merge the single-page PDFs listed in df1 into one PDF per row of df.

    A page named "<parent>-<count>.pdf" belongs to "<parent>"; a page name
    that does not follow that convention is its own parent. For every
    "File Name" in df, the pages whose parent equals that name are appended
    in df1 order and written to "<path>/OCR/<File Name>.pdf".

    Reads the module-level df, df1 and path. Parents with no matching page
    are reported and skipped, so no empty PDF is written.
    '''
    # Change this pattern if the page naming convention changes. The dot is
    # escaped and the whole name must match, so only a real ".pdf"
    # extension qualifies (case-insensitively).
    page_pattern = re.compile(r"(.*)-.*\.pdf", re.I)

    # Group the page paths by parent once, instead of re-scanning and
    # re-matching every df1 row for every parent in df.
    pages_by_parent = {}
    for page_name, page_root in zip(df1["File Name"], df1["Root"]):
        if not page_name.lower().endswith(".pdf"):
            continue
        matched = page_pattern.fullmatch(page_name)
        parent = matched.group(1) if matched else page_name
        # The root comes from the page's own row. Indexing "Root" with the
        # parent's row index picked an unrelated row, or raised KeyError.
        # os.path.join also works when the root has no trailing separator.
        pages_by_parent.setdefault(parent, []).append(
            os.path.join(page_root, page_name))

    output_dir = os.path.join(path, "OCR")
    # The output folder is not created anywhere else in the script.
    os.makedirs(output_dir, exist_ok=True)

    for fileName in df["File Name"]:
        page_paths = pages_by_parent.get(fileName)
        if not page_paths:
            print(str(fileName) + " has no matching pages; skipped")
            continue

        pdf_writer = PdfFileWriter()
        for page_path in page_paths:
            pdf_reader = PdfFileReader(page_path)
            for page in range(pdf_reader.getNumPages()):
                pdf_writer.addPage(pdf_reader.getPage(page))

        with open(os.path.join(output_dir, fileName + ".pdf"), 'wb') as fh:
            pdf_writer.write(fh)
        print(fileName + " PDF has been created")
if __name__ == "__main__":
    # The earlier pipeline stages are disabled; only the merge step runs.
    # Re-enable a stage by uncommenting its call.
    # perform_ocr(None, None)
    # extract_list()
    pdf_merge()