# receipt_indexer/code/textdataretriever/textextractor/extractorfunctions.py

import cv2
import numpy as np

import sys
sys.path.insert(0, '../../autocropper')
import myfunctions as mf


## helper functions
def rectcenterpt(rect, xywhrect=True, retint=False):
    '''
    Return the centre point of a rectangle as an (x, y) tuple.

    Parameters:
      `rect`     - Four numbers. Read as (x, y, width, height) when
                   `xywhrect` is true, otherwise as two corners
                   (x1, y1, x2, y2).
      `xywhrect` - Selects how `rect` is interpreted (see above).
      `retint`   - When true, both coordinates are truncated with int().
    '''
    first, second, third, fourth = rect[0], rect[1], rect[2], rect[3]

    if xywhrect:
        # Origin plus half of the extent.
        center = (first + third/2, second + fourth/2)
    else:
        # Midpoint between the two corners.
        center = ((first + third)/2, (second + fourth)/2)

    if retint:
        return (int(center[0]), int(center[1]))
    return center

def containsamount(outerrect, innerrect, percentage=1):
    '''
    Tell whether a large enough share of `innerrect` lies inside `outerrect`.

    The overlap of the two rectangles is obtained from mf.overlapRect; an
    overlap whose first entry is -1 is counted as an area of zero. Entries 2
    and 3 of a rectangle are used as its width and height.

    Returns True when (overlap area / `innerrect` area) >= `percentage`,
    otherwise False.
    '''
    overlap = mf.overlapRect([outerrect, innerrect])
    overlapwidth, overlapheight = overlap[2], overlap[3]

    # A first entry of -1 marks "no overlap", whatever the other entries hold.
    sharedarea = 0 if overlap[0] == -1 else overlapwidth*overlapheight

    coverage = sharedarea/(innerrect[2]*innerrect[3])
    return bool(coverage >= percentage)

def aboveandbelow(outerrect, innerrect):
    '''
    Tell whether `outerrect` strictly spans `innerrect` vertically.

    Both rectangles are read as (x, y, width, height). The result is True only
    when the outer top edge lies above the inner top edge and the outer bottom
    edge lies below the inner bottom edge; horizontal position is ignored.
    '''
    # Guard clause: the outer top must be strictly above the inner top.
    if not outerrect[1] < innerrect[1]:
        return False
    return bool(outerrect[1]+outerrect[3] > innerrect[1]+innerrect[3])

## Below code is an almost direct copy from https://github.com/scrunts23/CS-Data-Science-Build-Week-1/blob/master/model/dbscan.py

def dbscan(D, eps, MinPts):
    '''
    Cluster the dataset `D` using the DBSCAN algorithm.

    Parameters:
      `D`      - The dataset (a sequence of entries understood by
                 `region_query`)
      `eps`    - Threshold distance handed to `region_query`
      `MinPts` - Number of neighbours a point needs to seed a cluster

    Returns a list with one label per entry of `D`. The label -1 means noise;
    clusters are numbered starting from 1.
    '''
    # Reserved label values; every real cluster id is >= 1.
    UNVISITED = 0
    NOISE = -1

    labels = [UNVISITED]*len(D)
    clusterid = 0

    # Walk over point indices looking for seeds of new clusters.
    for idx in range(len(D)):
        # Anything already labelled (noise or cluster member) cannot seed.
        if labels[idx] != UNVISITED:
            continue

        nearby = region_query(D, idx, eps)

        if len(nearby) >= MinPts:
            # Dense enough: open a new cluster and let grow_cluster claim
            # everything reachable from this seed.
            clusterid += 1
            grow_cluster(D, labels, idx, nearby, clusterid, eps, MinPts)
        else:
            # Too sparse to seed a cluster. The point may still be relabelled
            # later if a cluster reaches it as a boundary point.
            labels[idx] = NOISE

    return labels


def grow_cluster(D, labels, P, NeighborPts, C, eps, MinPts):
    '''
    Grow a new cluster with label `C` from the seed point `P`.

    `labels` is updated in place; the function returns nothing. The
    `NeighborPts` list passed in is not modified.

    Parameters:
      `D`      - The dataset
      `labels` - List storing the cluster labels for all dataset points
      `P`      - Index of the seed point for this new cluster
      `NeighborPts` - Indices of all of the neighbors of `P`
      `C`      - The label for this new cluster.
      `eps`    - Threshold distance
      `MinPts` - Minimum required number of neighbors
    '''
    labels[P] = C

    # Work on a private copy that doubles as a FIFO queue: it is read front to
    # back while newly discovered branch points append their neighbours.
    frontier = list(NeighborPts)
    cursor = 0
    while cursor < len(frontier):
        candidate = frontier[cursor]
        cursor += 1
        state = labels[candidate]

        if state == -1:
            # Previously marked as noise, so it is known not to be a branch
            # point: adopt it as a leaf of C without expanding from it.
            labels[candidate] = C
            continue

        if state != 0:
            # Already claimed by a cluster; leave it alone.
            continue

        # Unclaimed point: claim it, then see whether it is dense enough to
        # extend the search.
        labels[candidate] = C
        reachable = region_query(D, candidate, eps)
        if len(reachable) >= MinPts:
            frontier.extend(reachable)


def region_query(D, P, eps):
    '''
    Find all points in dataset `D` within vertical distance `eps` of point `P`.

    Every entry of `D` is read as an (x, y, width, height) rectangle. The
    distance between two entries is the absolute difference between their
    vertical centres (y + height/2), the same value rectcenterpt() yields for
    an xywh rectangle.

    Parameters:
      `D`   - The dataset (a sequence of xywh rectangles)
      `P`   - Index of the query rectangle in `D`
      `eps` - Threshold distance (exclusive)

    Returns the indices, in ascending order, of the entries whose distance to
    `P` is strictly below `eps`. `P` itself is included whenever `eps` > 0.
    '''
    if len(D) == 0:
        return []

    # The query centre does not change, so compute it once.
    anchor_y = D[P][1] + D[P][3]/2

    neighbors = []
    for Pn in range(len(D)):
        center_y = D[Pn][1] + D[Pn][3]/2
        # abs() makes the test symmetric; a signed difference would accept
        # every rectangle lying below P no matter how far away it is.
        if abs(anchor_y - center_y) < eps:
            neighbors.append(Pn)

    return neighbors


def padWithColour(img, hpadding=0, vpadding=0, fill=(0,0,0)):
    '''
    Return `img` surrounded by a constant-colour border.

    Parameters:
      `img`      - Image to pad
      `hpadding` - Border width in pixels added on the left and on the right
      `vpadding` - Border height in pixels added on the top and on the bottom
      `fill`     - Border colour passed to cv2.copyMakeBorder
    '''
    top = bottom = vpadding
    left = right = hpadding
    return cv2.copyMakeBorder(img, top, bottom, left, right,
                              cv2.BORDER_CONSTANT, value=fill)

def mergecontours(contours):
    '''
    Return the convex hull enclosing every point of every contour given.

    `contours` is a sequence of point arrays; they are stacked into a single
    array before cv2.convexHull is applied.
    '''
    allpoints = np.vstack(contours)
    return cv2.convexHull(allpoints)

def getSkewAngle(cvImage) -> float:
    '''
    Estimate the skew angle of the content of a BGR image.

    The image is padded with 50 px of white, grayscaled, blurred and
    inverse-Otsu thresholded, then dilated so that characters merge into
    blobs. The minimum-area rectangle around the convex hull of all blobs
    provides the angle.

    Returns the rectangle angle as reported by cv2.minAreaRect, shifted by 90
    degrees when it falls outside [-45, 45]. Returns 0.0 when the image holds
    no foreground at all.
    '''
    # Prep image: pad, convert to gray scale, blur, and threshold.
    padded = padWithColour(cvImage, hpadding=50, vpadding=50, fill=(255,255,255))
    gray = cv2.cvtColor(padded, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Apply dilate to merge text into meaningful lines/paragraphs.
    # Use larger kernel on X axis to merge characters into single line,
    # but smaller kernel on Y axis to separate between different blocks of text.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=5)

    contours, _ = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        # Nothing to measure (e.g. a blank image): report "no skew" instead of
        # failing while stacking an empty contour list.
        return 0.0

    # The hull of all contours together, not just the largest one, defines
    # the box whose rotation is measured.
    angle = cv2.minAreaRect(mergecontours(contours))[-1]

    # Fold the angle into the range nearest to zero.
    if angle > 45:
        angle = angle - 90
    elif angle < -45:
        angle = 90 + angle
    return angle

def minboxdeskew(img, fill=(0,0,0)):
    '''
    Deskew a single-channel image using the angle found by getSkewAngle.

    The skew is measured on a BGR conversion of `img`. A copy of `img` padded
    by 50 px on every side with `fill` is then handed to mf.rotate together
    with that angle, and the result of mf.rotate is returned.
    NOTE(review): the rotation direction and output size depend on mf.rotate,
    which is defined outside this file.
    '''
    skew = getSkewAngle(cv2.cvtColor(img, cv2.COLOR_GRAY2BGR))
    padded = padWithColour(img, hpadding=50, vpadding=50, fill=fill)
    return mf.rotate(padded, skew, fill=fill)


def _l1edgeboxes(img):
    '''
    Return the (x, y, w, h) bounding boxes of the external Canny-edge contours
    of `img` as an (n, 4) int array.
    '''
    edges = cv2.Canny(img, 0, 500, None, 3)
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = np.empty((len(contours), 4), dtype=int)
    for i, contour in enumerate(contours):
        boxes[i] = cv2.boundingRect(contour)
    return boxes


def l1linerectretriever(image, divider=2):
    '''
    Group the letter boxes of a single-channel image into first-level lines.

    Letter boxes are taken from the image after a 3x3 dilation. Line boxes are
    taken from a copy in which dark pixels are smeared horizontally over one
    fortieth of the image width. The line boxes are clustered with dbscan
    (eps = median letter-box height / `divider`, MinPts = 1), and each letter
    box is filed under the cluster of every line box that contains at least
    90% of it.
    NOTE(review): the morphology assumes dark text on a light background.

    Parameters:
      `image`   - Single-channel 8-bit image
      `divider` - Divisor applied to the median letter height to obtain eps

    Returns `(mergedboxes, letterboxesbyline)`:
      `mergedboxes`       - (numclusters, 4) int array, one row per cluster,
                            produced by mf.mergerects from that cluster's
                            letter boxes
      `letterboxesbyline` - list with one list of [x, y, w, h] letter boxes
                            per cluster (a box can appear more than once when
                            several line boxes of a cluster contain it)
    When no letters, no lines or no clusters are found, an empty (0, 4) array
    and an empty list are returned.
    '''
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    # Width is clamped to 1 because a zero-width kernel (images narrower than
    # 40 px) is rejected by OpenCV.
    linekernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, image.shape[1]//40), 1))

    # Letters: dilating the bright background thins the strokes slightly so
    # that touching characters separate.
    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel, iterations=1)
    letterboxes = _l1edgeboxes(reducedimage)

    # Lines: dilate the inverted image horizontally, then invert back, so that
    # neighbouring characters fuse into one blob.
    linemade = 255-cv2.morphologyEx(255-image, cv2.MORPH_DILATE, linekernel)
    lineboxes = _l1edgeboxes(linemade)

    if len(letterboxes) == 0 or len(lineboxes) == 0:
        # Blank input: there is no median height and nothing to cluster.
        return np.empty((0,4), dtype=int), []

    epsilonvalue = np.median(letterboxes, axis=0)[3]/divider

    linelabels = dbscan(lineboxes, epsilonvalue, 1)
    numclusters = max(linelabels)
    if numclusters < 1:
        # Every line box was labelled noise (-1); no cluster exists.
        return np.empty((0,4), dtype=int), []

    letterboxesbyline = [[] for _ in range(numclusters)]
    for label, linebox in zip(linelabels, lineboxes):
        if label < 1:
            # Noise label: label-1 would wrap around to an unrelated cluster.
            continue
        for letterbox in letterboxes:
            if containsamount(linebox, letterbox, 0.9):
                letterboxesbyline[label-1].append(letterbox.tolist())

    mergedboxes = np.empty((numclusters,4), dtype=int)
    for i, group in enumerate(letterboxesbyline):
        # NOTE(review): a cluster that collected no letter boxes is passed as
        # an empty list; lineimagemaker appears to expect -1 entries for
        # those rows. Confirm against mf.mergerects.
        mergedboxes[i] = mf.mergerects(group)

    return mergedboxes, letterboxesbyline

def sublinerectretriever(image, divider=2):
    '''
    Split a single-channel image into sub-line groups of letter boxes.

    Letter boxes are taken from the Canny edges of the 3x3-dilated image and
    enlarged to undo the shrinkage caused by that dilation. They are clustered
    with dbscan (eps = median box height / `divider`, MinPts = 1), each
    cluster is merged into one box with mf.mergerects, and redundant merged
    boxes are pruned.

    Parameters:
      `image`   - Single-channel 8-bit image
      `divider` - Divisor applied to the median box height to obtain eps

    Returns `(mergedboxes, lineboxes)`:
      `mergedboxes` - (k, 4) int array of the merged boxes that survived
                      pruning
      `lineboxes`   - list of k lists holding the [x, y, w, h] letter boxes
                      behind each surviving merged box, in the same order
    When no contours or no clusters are found, an empty (0, 4) array and an
    empty list are returned.
    '''
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel, iterations=1)

    canny = cv2.Canny(reducedimage, 0, 500, None, 3)
    contours, _ = cv2.findContours(canny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(contours) == 0:
        # Blank input: there is no median height and nothing to cluster.
        return np.empty((0,4), dtype=int), []

    boundingboxes = np.empty((len(contours), 4), dtype=int)
    for i, contour in enumerate(contours):
        x, y, w, h = cv2.boundingRect(contour)
        # Grow each box by the kernel footprint. The origin may become
        # negative for contours that touch the top or left border.
        boundingboxes[i] = (
            x - (kernel.shape[0]-1),
            y - (kernel.shape[1]-1),
            w + (2*kernel.shape[0]-1),
            h + (2*kernel.shape[1]-1),
        )

    epsilonvalue = np.median(boundingboxes, axis=0)[3]/divider

    labels = dbscan(boundingboxes, epsilonvalue, 1)
    numclusters = max(labels)
    if numclusters < 1:
        # Every box was labelled noise (-1); no cluster exists.
        return np.empty((0,4), dtype=int), []

    lineboxes = [[] for _ in range(numclusters)]
    for label, box in zip(labels, boundingboxes):
        if label < 1:
            # Noise label: label-1 would wrap around to an unrelated cluster.
            continue
        lineboxes[label-1].append(box.tolist())

    mergedboxes = np.empty((numclusters,4), dtype=int)
    for i, group in enumerate(lineboxes):
        mergedboxes[i] = mf.mergerects(group)

    # Prune: for every box j, drop each other box i that j fully contains,
    # that j strictly spans vertically, or whose height is below eps.
    # Rows are deleted while iterating, so both indices are corrected after
    # every removal to keep pointing at the intended boxes.
    j = 0
    while j < len(mergedboxes):
        i = 0
        while i < len(mergedboxes):
            if i == j:
                i += 1
                continue
            outerbox = mergedboxes[j]
            innerbox = mergedboxes[i]
            if containsamount(outerbox, innerbox, 1) or aboveandbelow(outerbox, innerbox) or innerbox[3] < epsilonvalue:
                mergedboxes = np.delete(mergedboxes, i, axis=0)
                lineboxes.pop(i)
                if i < j:
                    # The outer box moved up by one row.
                    j -= 1
                # Stay on the same position: it now holds the next box.
                i -= 1
            i += 1
        j += 1

    return mergedboxes, lineboxes

def linerectretriever(image, divider=2, sublines=False):
    '''
    Run the box retriever selected by `sublines` and return its result.

    `sublines` true selects sublinerectretriever, otherwise
    l1linerectretriever; `divider` is forwarded unchanged.
    '''
    retriever = sublinerectretriever if sublines else l1linerectretriever
    return retriever(image, divider=divider)

def lineimagemaker(thresholded, divider=2, sublines=False):
    '''
    Cut a single-channel image into one image per detected text line.

    The boxes come from linerectretriever. Lines are ordered by the y value of
    their merged box (top to bottom). For each line, everything outside that
    line's own letter boxes is painted white, the result is cropped to the
    merged box and closed with a 3x3 kernel.

    Parameters:
      `thresholded` - Single-channel 8-bit image
      `divider`     - Forwarded to linerectretriever
      `sublines`    - Forwarded to linerectretriever

    Returns a list of cropped line images. Lines whose crop is empty are left
    out.
    '''
    mergedboxes, originalboxes = linerectretriever(thresholded, divider=divider, sublines=sublines)

    # Sort by y value (lines from top to bottom).
    order = (mergedboxes[:,1]).argsort()

    # Drop the leading run of boxes whose x is -1.
    # NOTE(review): -1 is presumably what mf.mergerects reports for a group
    # without boxes; confirm.
    # The default covers the case where no box is valid at all, so that
    # nothing is kept instead of everything.
    firstvalid = len(order)
    for pos, idx in enumerate(order):
        if mergedboxes[idx][0] != -1:
            firstvalid = pos
            break
    order = order[firstvalid:]

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))

    lineimages = []
    for idx in order:
        box = mergedboxes[idx]

        # 255 inside every letter box of this line, 0 elsewhere.
        mask = np.zeros(thresholded.shape, dtype=np.uint8)
        for lb in originalboxes[idx]:
            mask = cv2.rectangle(mask, (lb[0],lb[1]), (lb[0]+lb[2], lb[1]+lb[3]), (255,255,255), thickness=cv2.FILLED)

        # OR-ing with the inverted mask keeps the pixels under the mask and
        # turns everything else white.
        whitened = cv2.bitwise_or(thresholded, cv2.bitwise_not(mask))

        # Clamp to the image: a negative start would be read by the slice as
        # an offset from the far edge.
        top, bottom = max(box[1], 0), max(box[1]+box[3], 0)
        left, right = max(box[0], 0), max(box[0]+box[2], 0)
        lineimage = whitened[top:bottom, left:right]
        if lineimage.size == 0:
            # Nothing visible in this box; OpenCV cannot process empty images.
            continue

        lineimage = cv2.morphologyEx(lineimage, cv2.MORPH_CLOSE, kernel, iterations=1)
        lineimages.append(lineimage)
    return lineimages


def ismultiline(img):
    '''
    Guess whether an image holds more than one line of text.

    The tallest bounding box among the external Canny-edge contours of the
    3x3-dilated image serves as the character height. The image counts as
    multi-line when its own height exceeds 1.5 times that height plus 100 px.
    NOTE(review): the 100 px (2*50) presumably accounts for the 50 px
    top/bottom padding that minboxdeskew adds; confirm.

    Returns True or False; an image without any contour yields False.
    '''
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    reducedimage = cv2.morphologyEx(img, cv2.MORPH_DILATE, kernel)

    canny = cv2.Canny(reducedimage, 0, 500, None, 3)
    contours, _ = cv2.findContours(canny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(contours) == 0:
        # No characters at all, so there cannot be several lines. This also
        # avoids taking the maximum of an empty set.
        return False

    # Entry 3 of a bounding rect is its height.
    heightdetermination = max(cv2.boundingRect(contour)[3] for contour in contours)

    return bool(img.shape[0] > (heightdetermination*1.5) + (2*50))


### actual function
def lineisolator(image):
    '''
    Split a BGR image into a list of deskewed single-line images.

    Steps:
      1. Convert to grayscale and extract first-level lines
         (lineimagemaker with divider 1.5, no sub-lines).
      2. Deskew every line and split it again into sub-lines, with divider
         2.5 when ismultiline reports several lines and 1.5 otherwise.
      3. Deskew every resulting image once more, then dilate and open it with
         an elliptical kernel sized from the image height.

    Returns the list of processed line images.
    '''
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    finallineimages = []
    for lineimage in lineimagemaker(gray, 1.5, False):
        deskewedlineimage = minboxdeskew(lineimage, fill=255)
        # A finer eps is used when the crop still seems to hold several lines.
        divider = 2.5 if ismultiline(deskewedlineimage) else 1.5
        finallineimages += lineimagemaker(deskewedlineimage, divider, True)

    for i, lineimage in enumerate(finallineimages):
        deskewedli = minboxdeskew(lineimage, fill=255)
        # One twentieth of the height after subtracting 100 px.
        # NOTE(review): the 100 presumably matches the 2 x 50 px padding that
        # minboxdeskew adds; confirm.
        # Clamped to 1 because OpenCV rejects a kernel of size 0, which short
        # lines would otherwise request.
        dim = max(1, int((deskewedli.shape[0]-100)//20))
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dim, dim))
        deskewedli = cv2.morphologyEx(deskewedli, cv2.MORPH_DILATE, kernel, iterations=1)
        finallineimages[i] = cv2.morphologyEx(deskewedli, cv2.MORPH_OPEN, kernel)

    return finallineimages