# receipt_indexer/code/textdataretriever/textextractor/extractorfunctions.py
# Ethan Wellenreiter ae2e6366e1 Updating line isolator to work better with TrOCR model
# Signed-off-by: Ethan Wellenreiter <ewellenreiter@gmail.com>
# 2023-11-14 13:30:54 -05:00
#
# 567 lines
# 20 KiB
# Python

import cv2
import numpy as np
import sys
sys.path.insert(0, '../../autocropper')
import myfunctions as mf
## helper functions
def rectcenterpt(rect, xywhrect=True, retint=False):
    '''
    Return the centre point of a rectangle as an `(x, y)` tuple.

    Parameters:
      `rect`     - Four-element indexable describing the rectangle.
      `xywhrect` - When true, `rect` is read as (x, y, width, height);
                   otherwise it is read as two corners (x1, y1, x2, y2).
      `retint`   - When true, both coordinates are truncated with `int()`.
    '''
    first, second, third, fourth = rect[0], rect[1], rect[2], rect[3]
    if xywhrect:
        # Origin plus half of the extent.
        centre = (first + third / 2, second + fourth / 2)
    else:
        # Midpoint between the two corners.
        centre = ((first + third) / 2, (second + fourth) / 2)
    if retint:
        return (int(centre[0]), int(centre[1]))
    return centre
def containsamount(outerrect, innerrect, percentage=1):
    '''
    Tell whether at least `percentage` of `innerrect`'s area overlaps
    `outerrect`.

    Parameters:
      `outerrect`  - Rectangle as (x, y, width, height).
      `innerrect`  - Rectangle as (x, y, width, height).
      `percentage` - Required fraction of `innerrect`'s area (1 means the
                     whole of `innerrect` must be covered).

    Returns True when the overlapping fraction is >= `percentage`, otherwise
    False. A zero-area `innerrect` is never considered contained.
    '''
    innerrectarea = innerrect[2] * innerrect[3]
    # A degenerate inner rectangle would otherwise divide by zero below
    # (ZeroDivisionError for plain ints, NaN plus a warning for NumPy ints).
    if innerrectarea == 0:
        return False

    # NOTE(review): mf.overlapRect is treated here as returning an
    # (x, y, width, height) rect whose first element is -1 when the two
    # rectangles do not overlap - confirm against myfunctions.
    overlap = mf.overlapRect([outerrect, innerrect])
    if overlap[0] == -1:
        overlaparea = 0
    else:
        overlaparea = overlap[2] * overlap[3]

    return bool(overlaparea / innerrectarea >= percentage)
def aboveandbelow(outerrect, innerrect):
    '''
    Tell whether `outerrect` strictly spans `innerrect` vertically.

    Both rectangles are (x, y, width, height). The result is True only when
    the top edge of `outerrect` is strictly above the top edge of `innerrect`
    and its bottom edge is strictly below the bottom edge of `innerrect`.
    Horizontal positions are not looked at.
    '''
    outer_top = outerrect[1]
    outer_bottom = outerrect[1] + outerrect[3]
    inner_top = innerrect[1]
    inner_bottom = innerrect[1] + innerrect[3]

    if not (outer_top < inner_top):
        return False
    if not (outer_bottom > inner_bottom):
        return False
    return True
## Below code is an almost direct copy from https://github.com/scrunts23/CS-Data-Science-Build-Week-1/blob/master/model/dbscan.py
def dbscan(D, eps, MinPts):
    '''
    Cluster the dataset `D` with the DBSCAN algorithm.

    Parameters:
      `D`      - The dataset (a sequence of items understood by
                 `region_query`)
      `eps`    - Threshold distance handed to `region_query`
      `MinPts` - Minimum number of neighbours a point needs to seed a cluster

    Returns a list with one label per item of `D`. The label -1 means noise;
    clusters are numbered starting from 1.
    '''
    # Label bookkeeping: 0 = not visited yet, -1 = noise, >0 = cluster id.
    labels = [0] * len(D)
    cluster_id = 0

    # Walk the points by index looking for seeds of new clusters.
    for idx in range(len(D)):
        # Points that were already claimed (or marked noise) cannot seed.
        if labels[idx] != 0:
            continue

        neighbours = region_query(D, idx, eps)

        if len(neighbours) >= MinPts:
            # Dense enough: open a new cluster and let grow_cluster claim
            # everything reachable from this seed.
            cluster_id += 1
            grow_cluster(D, labels, idx, neighbours, cluster_id, eps, MinPts)
        else:
            # Too sparse to seed. A later cluster may still re-label this
            # point as one of its border points.
            labels[idx] = -1

    return labels
def grow_cluster(D, labels, P, NeighborPts, C, eps, MinPts):
    '''
    Grow cluster `C` outward from the seed point `P`.

    `labels` is updated in place; nothing is returned. When the function
    returns, cluster `C` is complete.

    Parameters:
      `D`           - The dataset
      `labels`      - List storing the cluster labels for all dataset points
      `P`           - Index of the seed point for this new cluster
      `NeighborPts` - Indices of all neighbours of `P`
      `C`           - The label for this new cluster
      `eps`         - Threshold distance
      `MinPts`      - Minimum required number of neighbours
    '''
    labels[P] = C

    # Work on a private FIFO queue of point indices so that the caller's
    # list is left untouched while the frontier grows.
    queue = list(NeighborPts)
    cursor = 0
    while cursor < len(queue):
        candidate = queue[cursor]
        cursor += 1
        state = labels[candidate]

        if state == -1:
            # Previously judged noise: it has too few neighbours to branch,
            # so it simply joins C as a leaf.
            labels[candidate] = C
            continue
        if state != 0:
            # Already owned by some cluster; leave it alone.
            continue

        # Unvisited point: claim it, then see whether it is a branch point.
        labels[candidate] = C
        reachable = region_query(D, candidate, eps)
        if len(reachable) >= MinPts:
            # Branch point: its neighbours get searched too.
            queue.extend(reachable)
def region_query(D, P, eps):
    '''
    Find all rectangles in `D` whose vertical centre lies within `eps` of the
    vertical centre of rectangle `P`.

    Only the y coordinate of each rectangle's centre is compared, so
    rectangles on the same text row count as neighbours regardless of their
    horizontal position.

    Parameters:
      `D`   - Sequence of rectangles, each (x, y, width, height)
      `P`   - Index into `D` of the query rectangle
      `eps` - Threshold distance (exclusive)

    Returns the list of indices into `D` (including `P` itself whenever
    `eps` > 0) that are within the threshold.
    '''
    if len(D) == 0:
        return []

    # Vertical centre of an (x, y, w, h) rect; same value as
    # rectcenterpt(rect)[1].
    origin_y = D[P][1] + D[P][3] / 2

    # The distance must be absolute: a signed difference would treat every
    # rectangle lying below P as a neighbour, however far away it is, and
    # make the clustering depend on the order of the input.
    return [
        idx
        for idx, rect in enumerate(D)
        if abs(origin_y - (rect[1] + rect[3] / 2)) < eps
    ]
def padWithColour(img, hpadding=0, vpadding=0, fill=(0,0,0)):
    '''
    Return a copy of `img` surrounded by a constant-colour border.

    Parameters:
      `img`      - Image to pad
      `hpadding` - Border width in pixels added on the left and on the right
      `vpadding` - Border height in pixels added on the top and on the bottom
      `fill`     - Border colour
    '''
    top = bottom = vpadding
    left = right = hpadding
    return cv2.copyMakeBorder(
        img, top, bottom, left, right, cv2.BORDER_CONSTANT, None, fill
    )
def mergecontours(contours):
    '''
    Stack the points of every contour in `contours` and return the convex
    hull enclosing all of them.
    '''
    allpoints = np.vstack(contours)
    return cv2.convexHull(allpoints)
def getSkewAngle(cvImage) -> float:
    '''
    Estimate the skew angle of the content in a BGR image.

    The image is padded with a 50 px white border, converted to grey, blurred
    and Otsu-thresholded (inverted). The result is dilated with a kernel that
    is wider than tall, the contours of the dilated blobs are merged into one
    convex hull, and the angle of the hull's minimum-area rectangle is
    returned after being folded into the range [-45, 45].

    Returns 0.0 when no contours are found (e.g. a blank image), so callers
    can rotate by the result unconditionally.
    '''
    padded = padWithColour(cvImage, hpadding=50, vpadding=50, fill=(255,255,255))
    gray = cv2.cvtColor(padded, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Larger kernel on the X axis than on the Y axis: characters merge
    # sideways into lines while separate rows of text stay apart.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=5)

    contours, _ = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        # Nothing to measure; stacking an empty contour list would raise.
        return 0.0

    # Every contour contributes to the hull, so their order is irrelevant.
    minAreaRect = cv2.minAreaRect(mergecontours(contours))

    # Fold the rectangle's angle into [-45, 45].
    angle = minAreaRect[-1]
    if angle > 45:
        angle = angle - 90
    if angle < -45:
        angle = 90 + angle
    return angle
def minboxdeskew(img, fill=(0,0,0)):
    '''
    Deskew a single-channel image.

    The skew angle is measured by `getSkewAngle` on a BGR conversion of
    `img`; `img` is then padded by 50 px on every side with `fill` and
    rotated by that angle through `mf.rotate`, which receives the same
    `fill`.

    Returns the padded, rotated image.
    '''
    skew = getSkewAngle(cv2.cvtColor(img, cv2.COLOR_GRAY2BGR))
    bordered = padWithColour(img, hpadding=50, vpadding=50, fill=fill)
    return mf.rotate(bordered, skew, fill=fill)
def l1linerectretriever(image, divider=2):
    '''
    Find first-level text lines in a single-channel image.

    Two sets of bounding boxes are extracted:
      * "letter" boxes - contours of the Canny edges of the image after one
        3x3 dilation;
      * "line" boxes - contours of the Canny edges after the inverted image
        has been dilated with a wide, 1 px tall kernel (1/40 of the image
        width), which smears dark content horizontally.
    The line boxes are clustered with `dbscan`, using the median letter-box
    height divided by `divider` as the threshold, and every letter box that
    is at least 90% inside a line box is assigned to that line box's cluster.

    Parameters:
      `image`   - Single-channel 8-bit image
      `divider` - Divisor applied to the median letter height to obtain the
                  clustering threshold

    Returns `(mergedboxes, letterboxesbyline)`:
      `mergedboxes`       - int array of shape (clusters, 4), one row per
                            cluster as returned by `mf.mergerects`
      `letterboxesbyline` - list with, per cluster, the list of letter boxes
                            ([x, y, w, h]) assigned to it
    When no contours are found the result is an empty (0, 4) array and an
    empty list.
    '''
    charkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    # Keep the kernel at least 1 px wide so images narrower than 40 px do not
    # ask for a zero-sized structuring element.
    linekernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, image.shape[1]//40), 1))

    # --- letter boxes ---------------------------------------------------
    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, charkernel, iterations=1)
    charcanny = cv2.Canny(reducedimage, 0, 500, None, 3)
    lettercontours, _ = cv2.findContours(charcanny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(lettercontours) == 0:
        # Blank image: there is nothing to take a median of or to cluster.
        return np.empty((0, 4), dtype=int), []
    letterboxes = np.array([cv2.boundingRect(c) for c in lettercontours], dtype=int)

    # Clustering threshold derived from the median letter height.
    epsilonvalue = np.median(letterboxes, axis=0)[3]/divider

    # --- line boxes -----------------------------------------------------
    linemade = 255-cv2.morphologyEx(255-image, cv2.MORPH_DILATE, linekernel)
    linecanny = cv2.Canny(linemade, 0, 500, None, 3)
    linecontours, _ = cv2.findContours(linecanny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(linecontours) == 0:
        return np.empty((0, 4), dtype=int), []
    lineboxes = np.array([cv2.boundingRect(c) for c in linecontours], dtype=int)

    # --- group line boxes into lines ------------------------------------
    linelabels = dbscan(lineboxes, epsilonvalue, 1)
    # Labels are 1-based; -1 (noise) must not be counted as a cluster.
    numclusters = max(max(linelabels), 0)

    letterboxesbyline = [[] for _ in range(numclusters)]
    for label, linebox in zip(linelabels, lineboxes):
        if label < 1:
            # Noise has no cluster slot; indexing with it would silently
            # land in another cluster's list.
            continue
        members = letterboxesbyline[label-1]
        for letterbox in letterboxes:
            if containsamount(linebox, letterbox, 0.9):
                members.append(letterbox.tolist())

    # One merged rectangle per cluster.
    # NOTE(review): lineimagemaker treats a merged box whose x is -1 as
    # invalid, so mf.mergerects presumably returns such a box for an empty
    # list - confirm against myfunctions.
    mergedboxes = np.empty((numclusters,4), dtype=int)
    for idx, members in enumerate(letterboxesbyline):
        mergedboxes[idx] = mf.mergerects(members)

    return mergedboxes, letterboxesbyline
def sublinerectretriever(image, divider=2):
    '''
    Split a single-channel image into sub-lines.

    Contours of the Canny edges of the image (after one 3x3 dilation) are
    boxed, each box is enlarged by a margin derived from the kernel size, and
    the boxes are clustered with `dbscan` using the median box height divided
    by `divider` as the threshold. Each cluster is merged into one rectangle
    with `mf.mergerects`; merged rectangles that are fully contained in
    another one, vertically spanned by another one, or shorter than the
    threshold are then discarded.

    Parameters:
      `image`   - Single-channel 8-bit image
      `divider` - Divisor applied to the median box height to obtain the
                  clustering threshold

    Returns `(mergedboxes, lineboxes)`:
      `mergedboxes` - int array of shape (kept clusters, 4)
      `lineboxes`   - list with, per kept cluster, its member boxes
                      ([x, y, w, h])
    When no contours are found the result is an empty (0, 4) array and an
    empty list.
    '''
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel, iterations=1)
    canny = cv2.Canny(reducedimage, 0, 500, None, 3)
    contours, _ = cv2.findContours(canny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        # Blank image: there is nothing to take a median of or to cluster.
        return np.empty((0, 4), dtype=int), []

    boundingboxes = np.empty((len(contours), 4), dtype=int)
    for i, contour in enumerate(contours):
        b = list(cv2.boundingRect(contour))
        # Move the origin up/left by (kernel size - 1) and grow the extent by
        # (2 * kernel size - 1); presumably compensates for the dilation
        # above - TODO confirm.
        b[0] -= (kernel.shape[0]-1)
        b[1] -= (kernel.shape[1]-1)
        b[2] += (2*kernel.shape[0]-1)
        b[3] += (2*kernel.shape[1]-1)
        boundingboxes[i] = b

    # Clustering threshold derived from the median box height.
    epsilonvalue = np.median(boundingboxes, axis=0)[3]/divider

    labels = dbscan(boundingboxes, epsilonvalue, 1)
    # Labels are 1-based; -1 (noise) must not be counted as a cluster.
    numclusters = max(max(labels), 0)

    lineboxes = [[] for _ in range(numclusters)]
    for label, box in zip(labels, boundingboxes):
        if label < 1:
            # Noise has no cluster slot; indexing with it would silently
            # land in another cluster's list.
            continue
        lineboxes[label-1].append(box.tolist())

    mergedboxes = np.empty((numclusters,4), dtype=int)
    for idx, members in enumerate(lineboxes):
        mergedboxes[idx] = mf.mergerects(members)

    # Prune: compare every ordered pair (j = outer, i = inner) and drop the
    # inner box when the outer one contains it, spans it vertically, or when
    # it is shorter than the clustering threshold. Both containers are edited
    # in step so their indices stay aligned.
    j = 0
    while (j < len(mergedboxes)):
        i = 0
        while (i < len(mergedboxes)):
            if (i == j):
                i += 1
                continue
            outerbox = mergedboxes[j]
            innerbox = mergedboxes[i]
            if containsamount(outerbox, innerbox, 1) or aboveandbelow(outerbox, innerbox) or innerbox[3] < epsilonvalue:
                mergedboxes = np.delete(mergedboxes, i, axis=0)
                lineboxes.pop(i)
                if (i < j):
                    # The outer box shifted down one slot with the removal.
                    j -= 1
                # Re-examine the element that moved into slot i.
                i -= 1
            i += 1
        j += 1
    return mergedboxes, lineboxes
def linerectretriever(image, divider=2, sublines=False):
    '''
    Dispatch to the line-rectangle retriever selected by `sublines`.

    A truthy `sublines` selects `sublinerectretriever`; otherwise
    `l1linerectretriever` is used. `image` and `divider` are forwarded
    unchanged and the chosen retriever's result is returned as-is.
    '''
    retriever = sublinerectretriever if sublines else l1linerectretriever
    return retriever(image, divider=divider)
def lineimagemaker(thresholded, divider=2, sublines=False):
    '''
    Cut a single-channel image into one image per detected text line.

    Line rectangles come from `linerectretriever`. For every line, only the
    pixels covered by that line's member boxes are kept; everything else is
    painted white (255). The result is cropped to the line's merged
    rectangle and closed with a 3x3 kernel.

    Parameters:
      `thresholded` - Single-channel 8-bit image
      `divider`     - Forwarded to `linerectretriever`
      `sublines`    - Forwarded to `linerectretriever`

    Returns a list of line images ordered top to bottom. Merged boxes whose
    x is -1 (the "invalid" marker) and crops that end up empty are skipped.
    '''
    lineimages = []
    mergedboxes, originalboxes = linerectretriever(thresholded, divider=divider, sublines=sublines)

    # Order by y (lines from top to bottom), then drop every invalid box.
    # Filtering all of them, rather than only a leading run, also covers the
    # cases where every box is invalid or an invalid box does not sort first.
    ordering = (mergedboxes[:,1]).argsort()
    ordering = ordering[mergedboxes[ordering, 0] != -1]
    mergedboxes = mergedboxes[ordering]
    originalboxes = [originalboxes[idx] for idx in ordering]

    # Loop invariants.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    whitebackground = np.full(thresholded.shape, fill_value=255, dtype=np.uint8)

    for box, memberboxes in zip(mergedboxes, originalboxes):
        # Mask of the pixels that belong to this line.
        mask = np.zeros(thresholded.shape, dtype=np.uint8)
        for lb in memberboxes:
            mask = cv2.rectangle(mask, (lb[0],lb[1]), (lb[0]+lb[2], lb[1]+lb[3]), (255,255,255), thickness=cv2.FILLED)
        invertedmask = cv2.bitwise_not(mask)

        # White everywhere outside the mask, original pixels inside it.
        whitedscreen = cv2.bitwise_and(whitebackground, whitebackground, mask=invertedmask)
        lineimage = cv2.bitwise_and(thresholded, thresholded, mask=mask)
        lineimage = cv2.bitwise_or(whitedscreen, lineimage)

        # Clamp the crop origin: a negative start would be interpreted by
        # NumPy as counting from the far edge and return the wrong region.
        top = max(int(box[1]), 0)
        left = max(int(box[0]), 0)
        bottom = max(int(box[1] + box[3]), 0)
        right = max(int(box[0] + box[2]), 0)
        lineimage = lineimage[top:bottom, left:right]
        if lineimage.size == 0:
            # Box lies entirely outside the image; nothing to emit.
            continue

        lineimage = cv2.morphologyEx(lineimage, cv2.MORPH_CLOSE, kernel, iterations=1)
        lineimages.append(lineimage)
    return lineimages
def ismultiline(img):
    '''
    Guess whether `img` holds more than one line of text.

    The tallest bounding box among the contours of the Canny edges (after one
    3x3 dilation) is taken as the character height. The image is considered
    multi-line when its height exceeds 1.5 times that height plus 100 px.

    Returns True or False; False when no contours are found.
    '''
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    reducedimage = cv2.morphologyEx(img, cv2.MORPH_DILATE, kernel)
    canny = cv2.Canny(reducedimage, 0, 500, None, 3)
    contours, _ = cv2.findContours(canny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        # No content at all, hence not multi-line (and max() of nothing
        # would raise).
        return False

    heightdetermination = max(cv2.boundingRect(contour)[3] for contour in contours)

    # NOTE(review): the 2*50 allowance matches the 50 px border minboxdeskew
    # adds on the top and bottom of the images lineisolator passes in -
    # confirm this is the intent.
    return bool(img.shape[0] > (heightdetermination*1.5) + (2*50))
### actual function
def lineisolator(image):
    '''
    Split a BGR image into deskewed, cleaned-up images of individual text
    lines.

    Steps:
      1. Convert to grey and extract first-level lines with
         `lineimagemaker(..., 1.5, False)`.
      2. Deskew each line and split it again into sub-lines, using a divider
         of 2.5 when `ismultiline` reports several lines and 1.5 otherwise.
      3. Deskew every resulting image once more and apply a
         dilate/dilate/erode sequence whose kernel size scales with the
         image height.

    Returns the list of processed line images.
    '''
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    finallineimages = []
    for lineimage in lineimagemaker(gray, 1.5, False):
        deskewedlineimage = minboxdeskew(lineimage, fill=255)
        # Divide the median height by a larger number when the crop looks
        # like it still holds several lines.
        divider = 2.5 if ismultiline(deskewedlineimage) else 1.5
        finallineimages += lineimagemaker(deskewedlineimage, divider, True)

    for i, lineimage in enumerate(finallineimages):
        deskewedli = minboxdeskew(lineimage, fill=255)
        # Kernel size scales with the image height. Clamp to 1 so short
        # images do not request a zero or negative-sized structuring element;
        # a 1x1 kernel leaves the image unchanged.
        dim = max(1, int((deskewedli.shape[0]-90)//20))
        ellipsekernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dim, dim))
        rectkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (dim, dim))
        deskewedli = cv2.morphologyEx(deskewedli, cv2.MORPH_DILATE, rectkernel, iterations=2)
        deskewedli = cv2.morphologyEx(deskewedli, cv2.MORPH_DILATE, ellipsekernel, iterations=1)
        deskewedli = cv2.morphologyEx(deskewedli, cv2.MORPH_ERODE, rectkernel, iterations=1)
        finallineimages[i] = deskewedli
    return finallineimages