2023-10-24 15:08:24 -04:00
14 changed files with 764 additions and 119 deletions
--- a/code/textdataretriever/adjusted_test_images/IMG_7594.jpg
+++ b/code/textdataretriever/adjusted_test_images/IMG_7594.jpg
--- a/code/textdataretriever/adjusted_test_images/IMG_7604.jpg
+++ b/code/textdataretriever/adjusted_test_images/IMG_7604.jpg
--- a/code/textdataretriever/adjusted_test_images/IMG_7605.jpg
+++ b/code/textdataretriever/adjusted_test_images/IMG_7605.jpg
--- a/code/textdataretriever/adjusted_test_images/IMG_7640.jpg
+++ b/code/textdataretriever/adjusted_test_images/IMG_7640.jpg
--- a/code/textdataretriever/adjusted_test_images/IvV2y.png
+++ b/code/textdataretriever/adjusted_test_images/IvV2y.png
--- a/code/textdataretriever/test_images/IMG_7594.jpg
+++ b/code/textdataretriever/test_images/IMG_7594.jpg
--- a/code/textdataretriever/test_images/IMG_7604.jpg
+++ b/code/textdataretriever/test_images/IMG_7604.jpg
--- a/code/textdataretriever/test_images/IMG_7605.jpg
+++ b/code/textdataretriever/test_images/IMG_7605.jpg
--- a/code/textdataretriever/test_images/IMG_7640.jpg
+++ b/code/textdataretriever/test_images/IMG_7640.jpg
--- a/code/textdataretriever/test_images/IvV2y.png
+++ b/code/textdataretriever/test_images/IvV2y.png
--- a/code/textdataretriever/textextractor/extractorfunctions.py
+++ b/code/textdataretriever/textextractor/extractorfunctions.py
@ -0,0 +1,292 @@
+import cv2
+import numpy as np
+
+import sys
+sys.path.insert(0, '../../autocropper')
+import myfunctions as mf
+
+
+
+## helper functions
+def rectcenterpt(rect, xywhrect=True, retint=False):
+    '''
+    Return the centre point (x, y) of a rectangle.
+
+    Parameters:
+      `rect`     - Four-element sequence describing the rectangle
+      `xywhrect` - When true, `rect` is read as (x, y, width, height);
+                   otherwise as two opposite corners (x1, y1, x2, y2)
+      `retint`   - When true, both coordinates are truncated with int()
+    '''
+    if xywhrect:
+        # Corner plus half of the extent along each axis.
+        center = (rect[0] + rect[2]/2, rect[1] + rect[3]/2)
+    else:
+        # Midpoint of the two corners.
+        center = ((rect[0]+rect[2])/2, (rect[1]+rect[3])/2)
+
+    if not retint:
+        return center
+    return (int(center[0]), int(center[1]))
+
+def containsamount(outerrect, innerrect, percentage=1):
+    '''
+    Report whether enough of `innerrect` is covered by `outerrect`.
+
+    The overlap of the two rectangles is obtained from `mf.overlapRect`, and
+    the result is True when (overlap area / `innerrect` area) is at least
+    `percentage`. Indices 2 and 3 of each rectangle are used as width and
+    height.
+
+    NOTE(review): assumes `mf.overlapRect` returns an (x, y, w, h) rectangle
+    with zero area when the inputs do not intersect -- confirm in myfunctions.
+    '''
+    overlap = mf.overlapRect([outerrect, innerrect])
+    overlaparea = overlap[2]*overlap[3]
+    innerarea = innerrect[2]*innerrect[3]
+    return bool(overlaparea/innerarea >= percentage)
+
+def aboveandbelow(outerrect, innerrect):
+    '''
+    Report whether `outerrect` strictly spans `innerrect` along the y axis.
+
+    Both rectangles are read as (x, y, width, height); only y and height are
+    used. The result is True when `outerrect` starts at a smaller y than
+    `innerrect` and also ends (y + height) at a larger y.
+    '''
+    # Guard clause: the outer rect must start strictly before the inner one.
+    if not (outerrect[1] < innerrect[1]):
+        return False
+    outerend = outerrect[1]+outerrect[3]
+    innerend = innerrect[1]+innerrect[3]
+    return bool(outerend > innerend)
+
+## The DBSCAN code below is adapted almost directly from https://github.com/scrunts23/CS-Data-Science-Build-Week-1/blob/master/model/dbscan.py
+
+def dbscan(D, eps, MinPts):
+    '''
+    Assign a cluster label to every entry of the dataset `D` using DBSCAN.
+
+    Parameters:
+      `D`      - The dataset (indexable, one entry per point)
+      `eps`    - Threshold distance handed to `region_query`
+      `MinPts` - Minimum neighbourhood size for a point to seed a cluster
+
+    Returns a list with one label per entry of `D`: -1 marks noise and real
+    clusters are numbered 1, 2, 3, ...
+    '''
+    # Reserved label values; every point starts out unvisited.
+    UNVISITED = 0
+    NOISE = -1
+    labels = [UNVISITED]*len(D)
+
+    # ID of the most recently created cluster.
+    clusterid = 0
+
+    # Walk over point indices looking for seeds. Growing a cluster from a
+    # seed is delegated entirely to `grow_cluster`.
+    for seed in range(len(D)):
+
+        # Only points nobody has looked at yet can seed a new cluster.
+        if labels[seed] == UNVISITED:
+            nearby = region_query(D, seed, eps)
+
+            if len(nearby) >= MinPts:
+                # Dense enough: open a new cluster and expand it.
+                clusterid += 1
+                grow_cluster(D, labels, seed, nearby, clusterid, eps, MinPts)
+            else:
+                # Too sparse to be a seed. `grow_cluster` may still relabel
+                # this point later as a border point of some cluster.
+                labels[seed] = NOISE
+
+    return labels
+
+
+def grow_cluster(D, labels, P, NeighborPts, C, eps, MinPts):
+    '''
+    Grow a new cluster with label `C` from the seed point `P`.
+
+    Starting from the neighbours of `P`, this function claims every point
+    that belongs to the cluster by writing `C` into `labels` in place. When
+    it returns, cluster `C` is complete. Nothing is returned, and the
+    `NeighborPts` list passed in is not modified.
+
+    Parameters:
+      `D`      - The dataset (a list of vectors)
+      `labels` - List storing the cluster labels for all dataset points
+      `P`      - Index of the seed point for this new cluster
+      `NeighborPts` - All of the neighbors of `P` (indices into `D`)
+      `C`      - The label for this new cluster.
+      `eps`    - Threshold distance
+      `MinPts` - Minimum required number of neighbors
+    '''
+
+    # Assign the cluster label to the seed point.
+    labels[P] = C
+
+    # FIFO queue of point indices still to examine. It grows while it is
+    # being walked, hence the index-based while-loop. `queued` remembers
+    # every index ever enqueued so each point enters the queue at most once;
+    # re-visiting a point would be a no-op anyway (it is already claimed by
+    # then), so skipping duplicates leaves the resulting labels unchanged
+    # while keeping the queue at O(len(D)) instead of letting it balloon.
+    queue = list(NeighborPts)
+    queued = set(queue)
+
+    i = 0
+    while i < len(queue):
+        Pn = queue[i]
+        i += 1
+
+        if labels[Pn] == -1:
+            # Pn was marked NOISE during the seed search, so it is known not
+            # to be a branch point: make it a leaf of cluster C and move on.
+            labels[Pn] = C
+
+        elif labels[Pn] == 0:
+            # Pn is unclaimed: claim it for C.
+            labels[Pn] = C
+
+            # If Pn has at least MinPts neighbours it is a branch point, so
+            # its neighbours become candidates too. Otherwise it is a leaf
+            # and its neighbours are not queued.
+            PnNeighborPts = region_query(D, Pn, eps)
+            if len(PnNeighborPts) >= MinPts:
+                for candidate in PnNeighborPts:
+                    if candidate not in queued:
+                        queued.add(candidate)
+                        queue.append(candidate)
+
+    # We've finished growing cluster C!
+
+
+def region_query(D, P, eps):
+    '''
+    Find all points in dataset `D` within distance `eps` of point `P`.
+
+    Every entry of `D` is treated as an (x, y, width, height) rectangle, and
+    the distance between two entries is the absolute difference of the y
+    coordinates of their centres (horizontal position is ignored).
+
+    Parameters:
+      `D`   - The dataset (indexable collection of rectangles)
+      `P`   - Index of the query rectangle in `D`
+      `eps` - Threshold distance (strict: distance must be < `eps`)
+
+    Returns the list of indices into `D`, in ascending order, whose distance
+    to `P` is below `eps`. `P` itself is included whenever `eps` > 0.
+    '''
+    # The query centre does not change inside the loop, so compute it once.
+    querycentery = rectcenterpt(D[P])[1]
+
+    neighbors = []
+    for Pn in range(0, len(D)):
+        # abs() is essential: with a signed difference every rectangle whose
+        # centre has a larger y than P's would count as a neighbour no matter
+        # how far away it is, making the clustering depend on input order.
+        if abs(querycentery - rectcenterpt(D[Pn])[1]) < eps:
+            neighbors.append(Pn)
+
+    return neighbors
+
+def linerectretriever(image):
+    '''
+    Group the contours of `image` into clusters by vertical position.
+
+    Steps: dilate then erode with a 3x3 kernel, run Canny, take the bounding
+    rectangle of every external contour, cluster those rectangles with
+    `dbscan` (eps = median rectangle height / 3, MinPts = 1), merge each
+    cluster into one rectangle with `mf.mergerects`, then prune merged
+    rectangles that are redundant or too short.
+
+    Parameters:
+      `image` - Image accepted by cv2.morphologyEx / cv2.Canny
+                (NOTE(review): callers in this file pass a thresholded
+                single-channel image -- confirm nothing else is expected)
+
+    Returns `(mergedboxes, lineboxes)`:
+      `mergedboxes` - int array of shape (k, 4), one merged rectangle per
+                      surviving cluster
+      `lineboxes`   - list of k lists holding the [x, y, w, h] contour
+                      rectangles of each cluster, index-aligned with
+                      `mergedboxes`
+    When no contours are found, an empty (0, 4) array and an empty list are
+    returned.
+    '''
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
+    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel)
+    reducedimage = cv2.morphologyEx(reducedimage, cv2.MORPH_ERODE, kernel)
+
+    canny = cv2.Canny(reducedimage, 0, 500, None, 3)
+
+    contours, _ = cv2.findContours(canny,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    # Nothing to cluster: bail out before median()/max() see empty input.
+    if len(contours) == 0:
+        return np.empty((0,4), dtype=int), []
+
+    boundingboxes = np.empty((len(contours), 4), dtype=int)
+    for i, contour in enumerate(contours):
+        boundingboxes[i] = cv2.boundingRect(contour)
+
+    # Index 3 of the column-wise median is the median rectangle height.
+    epsilonvalue = np.median(boundingboxes, axis=0)[3]/3
+
+    labels = dbscan(boundingboxes, epsilonvalue, 1)
+    # Cluster ids start at 1; clamp so an all-noise labelling yields 0 clusters.
+    numclusters = max(0, max(labels))
+    lineboxes = [[] for _ in range(numclusters)]
+
+    for i, item in enumerate(labels):
+        # Skip noise (-1) so it cannot be filed under a negative list index.
+        if item > 0:
+            lineboxes[item-1].append(boundingboxes[i].tolist())
+
+    mergedboxes = np.empty((numclusters,4), dtype=int)
+    for i in range(numclusters):
+        mergedboxes[i] = mf.mergerects(lineboxes[i])
+
+    # Pairwise pruning. For every ordered pair (j = outer, i = inner), drop
+    # box i when containsamount() reports it fully covered by box j, when box
+    # j spans it vertically, or when it is shorter than epsilonvalue.
+    # Deleting shifts later rows down by one, so both indices are adjusted to
+    # keep pointing at the same logical boxes.
+    j = 0
+    while (j < len(mergedboxes)):
+        i = 0
+        while (i < len(mergedboxes)):
+            if (i == j):
+                i += 1
+                continue
+            outerbox = mergedboxes[j]
+            innerbox = mergedboxes[i]
+            if containsamount(outerbox, innerbox, 1) or aboveandbelow(outerbox, innerbox) or innerbox[3] < epsilonvalue:
+                mergedboxes = np.delete(mergedboxes, i, axis=0)
+                lineboxes.pop(i)
+                if (i < j):
+                    # The outer box moved down one slot as well.
+                    j -= 1
+                # Stay on the same slot: it now holds the next box.
+                i -= 1
+            i += 1
+        j += 1
+
+    return mergedboxes, lineboxes
+        
+def lineimagemaker(thresholded):
+    '''
+    Cut `thresholded` into one image per rectangle cluster.
+
+    The clusters come from `linerectretriever`. For each cluster, pixels
+    inside any of its contour rectangles are copied from `thresholded`, all
+    other pixels are set to 255, and the result is cropped to the cluster's
+    merged rectangle.
+
+    Parameters:
+      `thresholded` - Source image
+                      (NOTE(review): assumed single-channel uint8 -- confirm)
+
+    Returns a list of cropped images ordered by increasing y of the merged
+    rectangle.
+    '''
+    mergedboxes, originalboxes = linerectretriever(thresholded)
+
+    # Visit clusters sorted by the y value of their merged box.
+    ordering = (mergedboxes[:,1]).argsort()
+
+    lineimages = []
+    for clusterindex in ordering:
+        box = mergedboxes[clusterindex]
+
+        # 255 inside every contour rectangle of this cluster, 0 elsewhere.
+        mask = np.zeros(thresholded.shape, dtype=np.uint8)
+        for lb in originalboxes[clusterindex]:
+            topleft = (lb[0],lb[1])
+            bottomright = (lb[0]+lb[2], lb[1]+lb[3])
+            mask = cv2.rectangle(mask, topleft, bottomright, (255,255,255), thickness=cv2.FILLED)
+
+        # White wherever the mask is off ...
+        whitebackground = np.full(thresholded.shape, fill_value=255, dtype=np.uint8)
+        whitedscreen = cv2.bitwise_and(whitebackground, whitebackground, mask=cv2.bitwise_not(mask))
+        # ... source pixels wherever it is on, then combine both halves.
+        kept = cv2.bitwise_and(thresholded, thresholded, mask=mask)
+        combined = cv2.bitwise_or(whitedscreen, kept)
+
+        top, left = box[1], box[0]
+        lineimages.append(combined[top:top+box[3], left:left+box[2]])
+
+    return lineimages
+        
+
+
+
+
+### actual function
+def lineisolator(image):
+    '''
+    Split a BGR image into a list of smaller images via `lineimagemaker`.
+
+    The image is converted to grayscale and binarised with Otsu's threshold.
+    `lineimagemaker` is then applied twice: once to the binarised image, and
+    once more to each image produced by that first pass.
+
+    Parameters:
+      `image` - BGR image (it is passed to cv2.cvtColor with COLOR_BGR2GRAY)
+
+    Returns a flat list of the images produced by the second pass, in
+    first-pass order and, within each, in `lineimagemaker` order.
+    '''
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+
+    finallineimages = []
+    for lineimage in lineimagemaker(thresholded):
+        # Second pass over each first-pass image; results are concatenated.
+        finallineimages.extend(lineimagemaker(lineimage))
+
+    return finallineimages
--- a/code/textdataretriever/textextractor/temp.ipynb
+++ b/code/textdataretriever/textextractor/temp.ipynb
@ -2,15 +2,13 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cv2\n",
    "import numpy as np\n",
    "\n",
-    "import myfunctions as mf\n",
-    "\n",
    "\n",
    "import scipy.stats as st\n",
    "import math\n",
@ -20,36 +18,495 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
-    "img = cv2.imread('./test_images/IMG_7594.jpg')"
+    "import sys\n",
+    "sys.path.insert(0, '../../autocropper')\n",
+    "import myfunctions as mf\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
-    "out = mf.houghlineprocessing(img)"
+    "def rectcenterpt(rect, xywhrect=True, retint=False):\n",
+    "    if (xywhrect):\n",
+    "        x = rect[0] + rect[2]/2\n",
+    "        y = rect[1] + rect[3]/2\n",
+    "    else:\n",
+    "        x = (rect[0]+rect[2])/2\n",
+    "        y = (rect[1]+rect[3])/2\n",
+    "    if (retint):\n",
+    "        x = int(x)\n",
+    "        y = int(y)\n",
+    "    return (x,y)\n",
+    "\n",
+    "def containsamount(outerrect, innerrect, percentage=1):\n",
+    "    tinyrect = mf.overlapRect([outerrect, innerrect])\n",
+    "    tinyarea = tinyrect[2]*tinyrect[3]\n",
+    "    innerrectarea = innerrect[2]*innerrect[3]\n",
+    "    if (tinyarea/innerrectarea >= percentage):\n",
+    "        return True\n",
+    "    return False\n",
+    "\n",
+    "def aboveandbelow(outerrect, innerrect):\n",
+    "    if (outerrect[1] < innerrect[1] and outerrect[1]+outerrect[3] > innerrect[1]+innerrect[3]):\n",
+    "        return True\n",
+    "    return False\n",
+    "\n",
+    "## Below code is an almost direct copy from https://github.com/scrunts23/CS-Data-Science-Build-Week-1/blob/master/model/dbscan.py\n",
+    "\n",
+    "def dbscan(D, eps, MinPts):\n",
+    "    '''\n",
+    "    Cluster the dataset `D` using the DBSCAN algorithm.\n",
+    "    \n",
+    "    dbscan takes a dataset `D` (a list of vectors), a threshold distance\n",
+    "    `eps`, and a required number of points `MinPts`.\n",
+    "    \n",
+    "    It will return a list of cluster labels. The label -1 means noise, and then\n",
+    "    the clusters are numbered starting from 1.\n",
+    "    '''\n",
+    " \n",
+    "    # This list will hold the final cluster assignment for each point in D.\n",
+    "    # There are two reserved values:\n",
+    "    #    -1 - Indicates a noise point\n",
+    "    #     0 - Means the point hasn't been considered yet.\n",
+    "    # Initially all labels are 0.    \n",
+    "    labels = [0]*len(D)\n",
+    "\n",
+    "    # C is the ID of the current cluster.    \n",
+    "    C = 0\n",
+    "    \n",
+    "    # This outer loop is just responsible for picking new seed points--a point\n",
+    "    # from which to grow a new cluster.\n",
+    "    # Once a valid seed point is found, a new cluster is created, and the \n",
+    "    # cluster growth is all handled by the 'expandCluster' routine.\n",
+    "    \n",
+    "    # For each point P in the Dataset D...\n",
+    "    # ('P' is the index of the datapoint, rather than the datapoint itself.)\n",
+    "    for P in range(0, len(D)):\n",
+    "    \n",
+    "        # Only points that have not already been claimed can be picked as new \n",
+    "        # seed points.    \n",
+    "        # If the point's label is not 0, continue to the next point.\n",
+    "        if not (labels[P] == 0):\n",
+    "           continue\n",
+    "        \n",
+    "        # Find all of P's neighboring points.\n",
+    "        NeighborPts = region_query(D, P, eps)\n",
+    "        \n",
+    "        # If the number is below MinPts, this point is noise. \n",
+    "        # This is the only condition under which a point is labeled \n",
+    "        # NOISE--when it's not a valid seed point. A NOISE point may later \n",
+    "        # be picked up by another cluster as a boundary point (this is the only\n",
+    "        # condition under which a cluster label can change--from NOISE to \n",
+    "        # something else).\n",
+    "        if len(NeighborPts) < MinPts:\n",
+    "            labels[P] = -1\n",
+    "        # Otherwise, if there are at least MinPts nearby, use this point as the \n",
+    "        # seed for a new cluster.    \n",
+    "        else: \n",
+    "           C += 1\n",
+    "           grow_cluster(D, labels, P, NeighborPts, C, eps, MinPts)\n",
+    "    \n",
+    "    # All data has been clustered!\n",
+    "    return labels\n",
+    "\n",
+    "\n",
+    "def grow_cluster(D, labels, P, NeighborPts, C, eps, MinPts):\n",
+    "    '''\n",
+    "    Grow a new cluster with label `C` from the seed point `P`.\n",
+    "    \n",
+    "    This function searches through the dataset to find all points that belong\n",
+    "    to this new cluster. When this function returns, cluster `C` is complete.\n",
+    "    \n",
+    "    Parameters:\n",
+    "      `D`      - The dataset (a list of vectors)\n",
+    "      `labels` - List storing the cluster labels for all dataset points\n",
+    "      `P`      - Index of the seed point for this new cluster\n",
+    "      `NeighborPts` - All of the neighbors of `P`\n",
+    "      `C`      - The label for this new cluster.  \n",
+    "      `eps`    - Threshold distance\n",
+    "      `MinPts` - Minimum required number of neighbors\n",
+    "    '''\n",
+    "\n",
+    "    # Assign the cluster label to the seed point.\n",
+    "    labels[P] = C\n",
+    "    \n",
+    "    # Look at each neighbor of P (neighbors are referred to as Pn). \n",
+    "    # NeighborPts will be used as a FIFO queue of points to search--that is, it\n",
+    "    # will grow as we discover new branch points for the cluster. The FIFO\n",
+    "    # behavior is accomplished by using a while-loop rather than a for-loop.\n",
+    "    # In NeighborPts, the points are represented by their index in the original\n",
+    "    # dataset.\n",
+    "    i = 0\n",
+    "    while i < len(NeighborPts):    \n",
+    "        \n",
+    "        # Get the next point from the queue.        \n",
+    "        Pn = NeighborPts[i]\n",
+    "       \n",
+    "        # If Pn was labelled NOISE during the seed search, then we\n",
+    "        # know it's not a branch point (it doesn't have enough neighbors), so\n",
+    "        # make it a leaf point of cluster C and move on.\n",
+    "        if labels[Pn] == -1:\n",
+    "           labels[Pn] = C\n",
+    "        \n",
+    "        # Otherwise, if Pn isn't already claimed, claim it as part of C.\n",
+    "        elif labels[Pn] == 0:\n",
+    "            # Add Pn to cluster C (Assign cluster label C).\n",
+    "            labels[Pn] = C\n",
+    "            \n",
+    "            # Find all the neighbors of Pn\n",
+    "            PnNeighborPts = region_query(D, Pn, eps)\n",
+    "            \n",
+    "            # If Pn has at least MinPts neighbors, it's a branch point!\n",
+    "            # Add all of its neighbors to the FIFO queue to be searched. \n",
+    "            if len(PnNeighborPts) >= MinPts:\n",
+    "                NeighborPts = NeighborPts + PnNeighborPts\n",
+    "            # If Pn *doesn't* have enough neighbors, then it's a leaf point.\n",
+    "            # Don't queue up it's neighbors as expansion points.\n",
+    "            #else:\n",
+    "                # Do nothing                \n",
+    "                #NeighborPts = NeighborPts               \n",
+    "        \n",
+    "        # Advance to the next point in the FIFO queue.\n",
+    "        i += 1        \n",
+    "    \n",
+    "    # We've finished growing cluster C!\n",
+    "\n",
+    "\n",
+    "def region_query(D, P, eps):\n",
+    "    '''\n",
+    "    Find all points in dataset `D` within distance `eps` of point `P`.\n",
+    "    \n",
+    "    This function calculates the distance between a point P and every other \n",
+    "    point in the dataset, and then returns only those points which are within a\n",
+    "    threshold distance `eps`.\n",
+    "    '''\n",
+    "    neighbors = []\n",
+    "    \n",
+    "    # For each point in the dataset...\n",
+    "    for Pn in range(0, len(D)):\n",
+    "        \n",
+    "        # If the distance is below the threshold, add it to the neighbors list.\n",
+    "        if (rectcenterpt(D[P])[1] - rectcenterpt(D[Pn])[1]) < eps:\n",
+    "           neighbors.append(Pn)\n",
+    "            \n",
+    "    return neighbors"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
-    "cv2.imshow(\"result2\", mf.ResizeWithAspectRatio(out, height=1000))\n",
+    "def tempfunc(image):\n",
+    "    shape = image.shape\n",
+    "    \n",
+    "\n",
+    "    # blackout = np.zeros(tempout.shape, dtype=np.uint8)\n",
+    "    \n",
+    "    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))\n",
+    "    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel)\n",
+    "    reducedimage = cv2.morphologyEx(reducedimage, cv2.MORPH_ERODE, kernel)\n",
+    "    \n",
+    "    tempout = cv2.cvtColor(reducedimage, cv2.COLOR_GRAY2BGR)\n",
+    "    \n",
+    "    \n",
+    "    canny = cv2.Canny(reducedimage, 0, 500, None, 3)\n",
+    "    \n",
+    "    \n",
+    "    contours, heirarchy = cv2.findContours(canny,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
+    "    boundingboxes = np.empty((len(contours), 4), dtype=int)\n",
+    "    \n",
+    "    for i, contour in enumerate(contours):\n",
+    "        boundingboxes[i] = cv2.boundingRect(contour)\n",
+    "        \n",
+    "    epsilonvalue = np.median(boundingboxes, axis=0)[3]/2\n",
+    "    \n",
+    "    labels = dbscan(boundingboxes, epsilonvalue, 1)\n",
+    "    print(labels)\n",
+    "    numclusters = max(labels)\n",
+    "    lineboxes = [[] for _ in range(numclusters)]\n",
+    "\n",
+    "    for i, item in enumerate(labels):\n",
+    "        lineboxes[item-1].append(boundingboxes[i].tolist())\n",
+    "        \n",
+    "            \n",
+    "    mergedboxes = np.empty((numclusters,4), dtype=int)\n",
+    "    \n",
+    "    \n",
+    "    for i in range(numclusters):\n",
+    "        b = mf.mergerects(lineboxes[i])\n",
+    "        mergedboxes[i] = b\n",
+    "        \n",
+    "    j = 0\n",
+    "    while (j < len(mergedboxes)):\n",
+    "        i = 0\n",
+    "        while (i < len(mergedboxes)):\n",
+    "            if (i == j):\n",
+    "                i += 1\n",
+    "                continue\n",
+    "            outerbox = mergedboxes[j]\n",
+    "            innerbox = mergedboxes[i]\n",
+    "            if containsamount(outerbox, innerbox, 1) or aboveandbelow(outerbox, innerbox) or innerbox[3] < epsilonvalue:\n",
+    "                mergedboxes = np.delete(mergedboxes, i, axis=0)\n",
+    "                lineboxes.pop(i)\n",
+    "                if (i < j):\n",
+    "                    j -= 1\n",
+    "                i -= 1\n",
+    "            i += 1\n",
+    "        j += 1\n",
+    "    \n",
+    "    # return mergedboxes, lineboxes\n",
+    "    for i, b in enumerate(mergedboxes):\n",
+    "        tempout = cv2.rectangle(tempout, (b[0],b[1]), (b[0]+b[2], b[1]+b[3]), (0,255,0), thickness=1)\n",
+    "        for t in lineboxes[i]:\n",
+    "            tempout = cv2.rectangle(tempout, (t[0],t[1]), (t[0]+t[2], t[1]+t[3]), (0,0,255), thickness=1)\n",
+    "    \n",
+    "    print(epsilonvalue)\n",
+    "    return tempout"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def linerectretriever(image):\n",
+    "    shape = image.shape\n",
+    "    \n",
+    "    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))\n",
+    "    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel)\n",
+    "    reducedimage = cv2.morphologyEx(reducedimage, cv2.MORPH_ERODE, kernel)\n",
+    "    \n",
+    "    canny = cv2.Canny(reducedimage, 0, 500, None, 3)\n",
+    "    \n",
+    "    \n",
+    "    contours, heirarchy = cv2.findContours(canny,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
+    "    boundingboxes = np.empty((len(contours), 4), dtype=int)\n",
+    "    \n",
+    "    for i, contour in enumerate(contours):\n",
+    "        boundingboxes[i] = cv2.boundingRect(contour)\n",
+    "                             \n",
+    "    epsilonvalue = np.median(boundingboxes, axis=0)[3]/3\n",
+    "    \n",
+    "    labels = dbscan(boundingboxes, epsilonvalue, 1)\n",
+    "    # print(labels)\n",
+    "    numclusters = max(labels)\n",
+    "    lineboxes = [[] for _ in range(numclusters)]\n",
+    "\n",
+    "    for i, item in enumerate(labels):\n",
+    "        lineboxes[item-1].append(boundingboxes[i].tolist())\n",
+    "        \n",
+    "            \n",
+    "    mergedboxes = np.empty((numclusters,4), dtype=int)\n",
+    "    \n",
+    "    \n",
+    "    for i in range(numclusters):\n",
+    "        b = mf.mergerects(lineboxes[i])\n",
+    "        mergedboxes[i] = b\n",
+    "        \n",
+    "    j = 0\n",
+    "    while (j < len(mergedboxes)):\n",
+    "        i = 0\n",
+    "        while (i < len(mergedboxes)):\n",
+    "            if (i == j):\n",
+    "                i += 1\n",
+    "                continue\n",
+    "            outerbox = mergedboxes[j]\n",
+    "            innerbox = mergedboxes[i]\n",
+    "            if containsamount(outerbox, innerbox, 1) or aboveandbelow(outerbox, innerbox) or innerbox[3] < epsilonvalue:\n",
+    "                mergedboxes = np.delete(mergedboxes, i, axis=0)\n",
+    "                lineboxes.pop(i)\n",
+    "                if (i < j):\n",
+    "                    j -= 1\n",
+    "                i -= 1\n",
+    "            i += 1\n",
+    "        j += 1\n",
+    "    \n",
+    "    return mergedboxes, lineboxes\n",
+    "        \n",
+    "def lineimagemaker(thresholded):\n",
+    "    lineimages = []\n",
+    "    mergedboxes, originalboxes = linerectretriever(thresholded)\n",
+    "    \n",
+    "    mergedboxesordering = (mergedboxes[:,1]).argsort() # sorted by y value (aka lines from top to bottom)\n",
+    "    mergedboxes = mergedboxes[mergedboxesordering]\n",
+    "    originalboxes = [originalboxes[i] for i in mergedboxesordering]\n",
+    "    for i, box in enumerate(mergedboxes):\n",
+    "        mask = np.zeros(thresholded.shape, dtype=np.uint8)\n",
+    "        whitebackground = np.full(thresholded.shape, fill_value=255, dtype=np.uint8)\n",
+    "        for lb in originalboxes[i]:\n",
+    "            mask = cv2.rectangle(mask, (lb[0],lb[1]), (lb[0]+lb[2], lb[1]+lb[3]), (255,255,255), thickness=cv2.FILLED)\n",
+    "\n",
+    "        invertedmask = cv2.bitwise_not(mask)\n",
+    "        whitedscreen = cv2.bitwise_and(whitebackground, whitebackground, mask=invertedmask)\n",
+    "        lineimage = cv2.bitwise_and(thresholded, thresholded, mask=mask)\n",
+    "        lineimage = cv2.bitwise_or(whitedscreen, lineimage)[box[1]:box[1]+box[3], box[0]:box[0]+box[2]]\n",
+    "        # lineimage = mf.externaldeskew(lineimage, fill=(255,255,255), alreadygray=True)\n",
+    "        # lineimage = thresholded[box[1]:box[1]+box[3], box[0]:box[0]+box[2]]\n",
+    "        lineimages.append(lineimage)\n",
+    "        # lineimages.append(mask)\n",
+    "    return lineimages\n",
+    "        \n",
+    "\n",
+    "def lineisolator(image):\n",
+    "    imgcopy = image.copy()\n",
+    "    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
+    "    thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]\n",
+    "    \n",
+    "    \n",
+    "    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))\n",
+    "    \n",
+    "    \n",
+    "    \n",
+    "    lineimages = lineimagemaker(thresholded)\n",
+    "    \n",
+    "    # for i, lineimage in enumerate(lineimages):\n",
+    "    #     lineimages[i] = cv2.morphologyEx(lineimage, cv2.MORPH_ERODE, kernel)\n",
+    "\n",
+    "    \n",
+    "    finallineimages = []\n",
+    "    for i, lineimage in enumerate(lineimages):\n",
+    "        templineimages = lineimagemaker(lineimage)\n",
+    "        finallineimages += templineimages\n",
+    "        \n",
+    "        \n",
+    "    # mergedboxes, originalboxes = linerectretriever(thresholded) \n",
+    "    # mask = np.zeros(thresholded.shape, dtype=np.uint8)\n",
+    "    # for i, box in enumerate(mergedboxes):\n",
+    "    #     for lb in originalboxes[i]:\n",
+    "    #         mask = cv2.rectangle(mask, (lb[0],lb[1]), (lb[0]+lb[2], lb[1]+lb[3]), (255,255,255), thickness=cv2.FILLED)\n",
+    "\n",
+    "    # return mask\n",
+    "        \n",
+    "        \n",
+    "    # out = tempfunc(thresholded)\n",
+    "    # return out\n",
+    "        \n",
+    "    return finallineimages\n",
+    "    \n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pathname = \"../adjusted_test_images/\"\n",
+    "filename = \"IMG_7594.jpg\"\n",
+    "\n",
+    "# print(pathname+filename)\n",
+    "img = cv2.imread(pathname+filename)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "outs = lineisolator(img)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
+    "# thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]\n",
+    "# monke = tempfunc(thresholded)\n",
+    "# cv2.imwrite(\"../temp/monke.jpg\", monke)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# cv2.imshow(\"test\", mf.ResizeWithAspectRatio(img, height=1000))\n",
+    "# # cv2.imshow(\"test1\", mf.ResizeWithAspectRatio(out, height=1000))\n",
+    "# cv2.waitKey(0)\n",
+    "# cv2.destroyAllWindows()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for out in outs:\n",
+    "#     if (out.shape[0] > out.shape[1]):\n",
+    "#         cv2.imshow(\"test1\", mf.ResizeWithAspectRatio(out, height=1000))\n",
+    "#     else:\n",
+    "#         cv2.imshow(\"test1\", mf.ResizeWithAspectRatio(out, width=1000))\n",
+    "#     key = cv2.waitKey(0)\n",
+    "#     cv2.destroyAllWindows()\n",
+    "#     if (key == 107):\n",
+    "#         break\n",
+    "if (isinstance(outs, np.ndarray)):\n",
+    "    if (outs.shape[0] > outs.shape[1]):\n",
+    "        cv2.imshow(\"test\", mf.ResizeWithAspectRatio(outs, height=1350))\n",
+    "    else:\n",
+    "        cv2.imshow(\"test\", mf.ResizeWithAspectRatio(outs, width=1000))\n",
+    "else:\n",
+    "    for i, out in enumerate(outs):\n",
+    "        if (out.shape[0] > out.shape[1]):\n",
+    "            cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, height=1350))\n",
+    "        else:\n",
+    "            cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, width=1000))\n",
    "cv2.waitKey(0)\n",
-    "cv2.destroyAllWindows()"
+    "cv2.destroyAllWindows()\n",
+    "\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 107,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# cv2.imshow(\"test\", mf.ResizeWithAspectRatio(outs[30], width=1000))\n",
+    "# cv2.waitKey(0)\n",
+    "# cv2.destroyAllWindows()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# results = tempfunc(outs[30])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# cv2.imshow(\"test\", mf.ResizeWithAspectRatio(results, width=1000))\n",
+    "# cv2.waitKey(0)\n",
+    "# cv2.destroyAllWindows()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
@ -59,7 +516,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
@ -68,7 +525,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
--- a/code/textextractor/temp.ipynb
+++ b/code/textextractor/temp.ipynb
@ -1,104 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import cv2\n",
-    "import numpy as np\n",
-    "\n",
-    "import myfunctions as mf\n",
-    "\n",
-    "\n",
-    "import scipy.stats as st\n",
-    "import math\n",
-    "\n",
-    "import matplotlib.pyplot as plt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "img = cv2.imread('./test_images/IMG_7594.jpg')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "out = mf.houghlineprocessing(img)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "cv2.imshow(\"result2\", mf.ResizeWithAspectRatio(out, height=1000))\n",
-    "cv2.waitKey(0)\n",
-    "cv2.destroyAllWindows()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# https://medium.com/@vatvenger/extracting-lines-from-ocr-a8f410448fc\n",
-    "# https://www.width.ai/post/the-best-ways-to-extract-text-from-images-without-tesseract-python"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## Potential Next Steps. Isolate a line of text and then feed that into the OCR Model to extract the text."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# #IDEA:\n",
-    "# 1. Isolate lines into rectangles\n",
-    "# 2. feed that rectangle portion of the image into an OCR model\n",
-    "# 3. append that to the final output string with the end character for nextline\n",
-    "# 4. give the whole final string to a model which gives the outputs"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/docker/textextractordockerfile
+++ b/docker/textextractordockerfile
@ -10,14 +10,14 @@ RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 # -y automatically answers "yes" to the confirmation prompts apt-get shows when installing packages
 RUN apt-get update && \
-    apt-get install -y build-essential cmake git gdb pkg-config valgrind systemd-coredump python3 python3-opencv libopencv-dev python3-pip python3-dev && \ 
+    apt-get install -y build-essential cmake git gdb pkg-config valgrind systemd-coredump python3-opencv libopencv-dev python3-pip python3-dev && \ 
    apt-get -y clean && apt-get -y autoremove

 RUN python3 -m pip install --upgrade pip

 RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117

-RUN pip3 install datasets && pip3 install jupyter notebook && pip3 install matplotlib
+RUN pip3 install datasets && pip3 install jupyter notebook && pip3 install matplotlib && pip3 install deskew

 RUN pip3 install easyocr && pip3 uninstall -y opencv-python-headless