-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions_words.py
119 lines (92 loc) · 3.22 KB
/
functions_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import cv2
import numpy as np
def findSpaces(line, thres_space):
    """Return the centre x-coordinate of each blank gap in a text-line image.

    Every column of ``line`` is averaged with ``cv2.reduce``; a column whose
    average is <= 0 is treated as blank.  Consecutive blank columns form a
    gap, and the gap's centre is the floor of the mean of its column indices.

    Args:
        line: image of one text line, indexed as ``line.shape[1]`` columns.
            NOTE(review): presumably a single-channel binarised image with
            zero-valued background -- confirm against the caller.
        thres_space: a gap that is followed by ink is reported only when it
            is wider than this many columns.  Narrow gaps are expected inside
            words (thin strokes can break up after smoothing).

    Returns:
        A list of integer x-coordinates, in left-to-right order.  A gap that
        is still open at the right edge of the image is always reported,
        whatever its width.  The list is empty when nothing qualifies.
    """
    # Collapse the rows: one averaged value per column, indexed as [0][col].
    verProj = cv2.reduce(line, 0, cv2.REDUCE_AVG)
    th = 0  # columns with an average at or below this value are blank
    hist = verProj <= th

    xcoords = []
    x = 0        # running sum of the column indices in the current gap
    count = 0    # number of columns in the current gap
    isSpace = False
    for i in range(line.shape[1]):
        blank = hist[0][i]
        if not isSpace:
            if blank:
                # A new gap starts at this column.
                isSpace = True
                count = 1
                x = i
        elif blank:
            x += i
            count += 1
        else:
            # The gap just ended; keep it only if it is wide enough.
            isSpace = False
            if count > thres_space:
                xcoords.append(x // count)

    # Only a gap that runs to the right edge is still pending here.  Appending
    # unconditionally would divide by zero when no blank column was seen, and
    # would re-emit an already-closed (possibly rejected) gap when the line
    # ends in ink.
    if isSpace:
        xcoords.append(x // count)
    return xcoords
def SpacesMedian(line):
    """Return the width, in columns, of every blank gap in a text-line image.

    Every column of ``line`` is averaged with ``cv2.reduce``; a column whose
    average is <= 0 is treated as blank.  Each run of consecutive blank
    columns contributes one entry, its length, to the result.  No filtering
    is applied, so gaps between characters are included alongside gaps
    between words.  Despite the name, no median is computed here.

    Args:
        line: image of one text line, indexed as ``line.shape[1]`` columns.
            NOTE(review): presumably a single-channel binarised image with
            zero-valued background -- confirm against the caller.

    Returns:
        A list of positive integers, one per gap, in left-to-right order.
        The list is empty when the line contains no blank column.
    """
    # Collapse the rows: one averaged value per column, indexed as [0][col].
    verProj = cv2.reduce(line, 0, cv2.REDUCE_AVG)
    th = 0  # columns with an average at or below this value are blank
    hist = verProj <= th

    median_count = []
    count = 0        # number of columns in the current gap
    isSpace = False
    for i in range(line.shape[1]):
        blank = hist[0][i]
        if not isSpace:
            if blank:
                # A new gap starts at this column.
                isSpace = True
                count = 1
        elif blank:
            count += 1
        else:
            # The gap just ended; record its width.
            isSpace = False
            median_count.append(count)

    # Only a gap that runs to the right edge is still pending here.  Recording
    # unconditionally would repeat the last width when the line ends in ink.
    # The former trailing ``x // count`` on an unused list is gone as well: it
    # raised ZeroDivisionError whenever no blank column was found.
    if isSpace:
        median_count.append(count)
    return median_count
def get_spaces_threshold(ycoords, img_for_det):
    """Derive a gap-width threshold from the text line with the most gaps.

    Steps:
      1. Each pair of consecutive entries in ``ycoords`` selects the rows of
         one text line in ``img_for_det``; ``SpacesMedian`` yields that
         line's gap widths (character gaps included).
      2. The first line with the largest number of gaps is taken as the
         sample and its widths are sorted in descending order.
      3. The three largest widths are dropped (the original author attributes
         them to the page margins).
      4. Of the remainder, widths below half of the largest one are dropped.
      5. The result is half the mean of what is left.

    Args:
        ycoords: sequence of row indices delimiting the text lines.
        img_for_det: page image, indexable by a range of row indices.

    Returns:
        Half the mean of the retained gap widths.

    Raises:
        IndexError: if there are no lines, or the sampled line has too few
            gaps to leave anything after the three largest are removed.
    """
    # One list of gap widths per text line.
    gaps_per_line = [
        SpacesMedian(img_for_det[range(top, bottom)])
        for top, bottom in zip(ycoords, ycoords[1:])
    ]

    # Pick the first line that has strictly more gaps than any before it.
    sample = gaps_per_line[0]
    for candidate in gaps_per_line[1:]:
        if len(candidate) > len(sample):
            sample = candidate

    # Largest first, then skip the three biggest entries.
    widths = np.asarray(sorted(sample, reverse=True))[3:]

    # Keep the leading width unconditionally, and every other width that is
    # at least half of it.
    reference = widths[0]
    keep = widths >= reference / 2
    keep[0] = True
    retained = widths[keep]

    return np.mean(retained) / 2