Working on dataset making/preprocessing
Signed-off-by: Ethan Wellenreiter <ewellenreiter@gmail.com>
This commit is contained in:
parent
b7ebbb21bd
commit
527362ac0f
12
code/libraries/process.py
Normal file
12
code/libraries/process.py
Normal file
@ -0,0 +1,12 @@
|
||||
|
||||
|
||||
|
||||
def relabel(datasetpath):
    """Print the contents of the dataset's original-name mapping file.

    The mapping file is expected at
    ``<datasetpath>/baseimages/unaugmentednames/mapping.txt``.

    Args:
        datasetpath: Root directory of the dataset, as a ``str`` or any
            ``os.PathLike`` object. The fixed sub-path is appended by plain
            string concatenation, so a trailing slash on ``datasetpath`` is
            harmless.

    Returns:
        None. The file's text is written to standard output via ``print``.

    Raises:
        OSError: If the mapping file does not exist or cannot be read
            (``FileNotFoundError`` is the usual case).
    """
    import os

    mappingpathwithindataset = "/baseimages/unaugmentednames/mapping.txt"
    # os.fspath lets callers pass pathlib.Path as well as str; for a str it
    # returns the argument unchanged, so existing callers see the same path.
    mappingfilepath = os.fspath(datasetpath) + mappingpathwithindataset

    # The context manager closes the handle even when read() raises, which
    # the previous open()/close() pair did not guarantee.
    with open(mappingfilepath, 'r') as mappingfile:
        maptext = mappingfile.read()

    print(maptext)
|
||||
|
||||
|
||||
287
code/libraries/testprocessing.ipynb
Normal file
287
code/libraries/testprocessing.ipynb
Normal file
@ -0,0 +1,287 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import pathlib\n",
|
||||
"import shutil\n",
|
||||
"import cv2\n",
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.insert(0, '/mnt/code/autocropper')\n",
|
||||
"import myfunctions as mf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.getcwd()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# filenames = next(os.walk(\"/mnt/dataset/baseimages/unaugmentednames/\"), (None, None, []))[2]\n",
|
||||
"# filenames.remove(\"mapping.txt\")\n",
|
||||
"# print(filenames)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"imagefileextensions = [\".jpg\", \".png\"]\n",
|
||||
"\n",
|
||||
"def parsemaptext(text):\n",
|
||||
" lineseperated = text.split('\\n')\n",
|
||||
" # if (lineseperated[-1] == ''):\n",
|
||||
" # lineseperated = lineseperated[:-1]\n",
|
||||
" # print(lineseperated)\n",
|
||||
" mappingdict = {}\n",
|
||||
" for line in lineseperated:\n",
|
||||
" if line == '':\n",
|
||||
" continue\n",
|
||||
" splitline = line.split(\" | \")\n",
|
||||
" if splitline[0] not in mappingdict:\n",
|
||||
" mappingdict[splitline[0]] = splitline[1]\n",
|
||||
" # print(splitline)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" return mappingdict\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def readmapfiletodict(mapfilepath):\n",
|
||||
" if (not os.path.isfile(mapfilepath)):\n",
|
||||
" # f = open(mapfilepath, \"x\")\n",
|
||||
" # f.close()\n",
|
||||
" return {}\n",
|
||||
" mappingfile = open(mapfilepath, 'r')\n",
|
||||
" maptext = mappingfile.read()\n",
|
||||
" mappingfile.close()\n",
|
||||
" \n",
|
||||
" mappingdict = parsemaptext(maptext)\n",
|
||||
" return mappingdict\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"def writemapdicttofile(mapfilepath, mappingdict):\n",
|
||||
" starting = False\n",
|
||||
" if (not os.path.isfile(mapfilepath) or os.stat(mapfilepath).st_size == 0):\n",
|
||||
" file = open(mapfilepath, \"w\")\n",
|
||||
" starting = True\n",
|
||||
" # f.close()\n",
|
||||
" # return {}\n",
|
||||
" else:\n",
|
||||
" file = open(mapfilepath, 'a')\n",
|
||||
" for key in mappingdict:\n",
|
||||
" if starting:\n",
|
||||
" file.write(key+\" | \"+mappingdict[key])\n",
|
||||
" starting = False\n",
|
||||
" else:\n",
|
||||
" file.write(\"\\n\"+key+\" | \"+mappingdict[key])\n",
|
||||
" file.close()\n",
|
||||
"\n",
|
||||
"def renameoriginals(datasetpath):\n",
|
||||
" pathtooriginals = \"baseimages/unaugmentednames/\"\n",
|
||||
" mappingfilename = \"mapping.txt\"\n",
|
||||
" mappingpathwithindataset = pathtooriginals+mappingfilename\n",
|
||||
" mappingfilepath = datasetpath+mappingpathwithindataset\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" mappingdict = readmapfiletodict(mappingfilepath)\n",
|
||||
" print(mappingdict)\n",
|
||||
" blacklistednumbers = []\n",
|
||||
" for key in mappingdict:\n",
|
||||
" value = mappingdict[key]\n",
|
||||
" suffix = pathlib.Path(value).suffix\n",
|
||||
" # print(pathlib.Path(value).name)\n",
|
||||
" valnum = value[:-len(suffix)]\n",
|
||||
" blacklistednumbers.append(int(valnum))\n",
|
||||
" print(blacklistednumbers)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" \n",
|
||||
" filenames = next(os.walk(datasetpath+pathtooriginals), (None, None, []))[2]\n",
|
||||
" if (mappingfilename in filenames):\n",
|
||||
" filenames.remove(mappingfilename)\n",
|
||||
" # print(filenames)\n",
|
||||
" \n",
|
||||
" mappeddict = {}\n",
|
||||
" filenamecounter = 0\n",
|
||||
" for filename in filenames:\n",
|
||||
" suffix = pathlib.Path(filename).suffix\n",
|
||||
" if (suffix not in imagefileextensions):\n",
|
||||
" print(\"Not a valid image \"+filename)\n",
|
||||
" continue\n",
|
||||
" if filename in mappingdict:\n",
|
||||
" continue\n",
|
||||
" while filenamecounter in blacklistednumbers:\n",
|
||||
" filenamecounter += 1\n",
|
||||
" shutil.copyfile(datasetpath+pathtooriginals+filename, datasetpath+\"baseimages/\"+str(filenamecounter)+suffix)\n",
|
||||
" mappeddict[filename] = str(filenamecounter)+suffix\n",
|
||||
" filenamecounter += 1\n",
|
||||
" print(mappeddict)\n",
|
||||
" writemapdicttofile(mappingfilepath, mappeddict)\n",
|
||||
" \n",
|
||||
" # print(maptext)\n",
|
||||
" \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def autocrop(datasetpath):\n",
|
||||
" subpathtobasefiles = \"baseimages/\"\n",
|
||||
" subpathtoaugmentedfiles = \"autocropped/\"\n",
|
||||
" imagespath = datasetpath + subpathtobasefiles\n",
|
||||
" \n",
|
||||
" filenames = next(os.walk(imagespath), (None, None, []))[2]\n",
|
||||
" \n",
|
||||
" for filename in filenames:\n",
|
||||
" suffix = pathlib.Path(filename).suffix\n",
|
||||
" if (suffix not in imagefileextensions):\n",
|
||||
" print(\"Not a valid image \"+filename)\n",
|
||||
" continue\n",
|
||||
" print(imagespath+filename)\n",
|
||||
" if (not os.path.isfile(imagespath+filename)):\n",
|
||||
" print(\"hi\")\n",
|
||||
" continue\n",
|
||||
" img = cv2.imread(imagespath+filename)\n",
|
||||
" # print(img)\n",
|
||||
" autocropped = mf.houghlineprocessing(img)\n",
|
||||
" cv2.imwrite(datasetpath+subpathtoaugmentedfiles+filename, autocropped)\n",
|
||||
" \n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def showimgs(imgs):\n",
|
||||
" if (isinstance(imgs, np.ndarray)):\n",
|
||||
" if (imgs.shape[0] > imgs.shape[1]):\n",
|
||||
" cv2.imshow(\"test\", mf.ResizeWithAspectRatio(imgs, height=1350))\n",
|
||||
" else:\n",
|
||||
" cv2.imshow(\"test\", mf.ResizeWithAspectRatio(imgs, width=1000))\n",
|
||||
" else:\n",
|
||||
" for i, out in enumerate(imgs):\n",
|
||||
" if (out.shape[0] > out.shape[1]):\n",
|
||||
" cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, height=1350))\n",
|
||||
" else:\n",
|
||||
" cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, width=1000))\n",
|
||||
" cv2.waitKey(0)\n",
|
||||
" cv2.destroyAllWindows()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'IMG_7736.jpg': '0.jpg', 'IMG_7737.jpg': '1.jpg', 'IMG_7738.jpg': '2.jpg', 'IMG_7739.jpg': '3.jpg', 'IMG_7740.jpg': '4.jpg', 'IMG_7741.jpg': '5.jpg', 'IMG_7742.jpg': '6.jpg', 'IMG_7743.jpg': '7.jpg', 'IMG_7744.jpg': '8.jpg', 'IMG_7745.jpg': '9.jpg', 'IMG_7747.jpg': '10.jpg', 'IMG_7748.jpg': '11.jpg'}\n",
|
||||
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n",
|
||||
"{}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"renameoriginals(\"/mnt/dataset/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "error",
|
||||
"evalue": "OpenCV(4.5.4) ./modules/imgproc/src/resize.cpp:4051: error: (-215:Assertion failed) !ssize.empty() in function 'resize'\n",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31merror\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32m/mnt/code/libraries/testprocessing.ipynb Cell 9\u001b[0m line \u001b[0;36m2\n\u001b[1;32m <a href='vscode-notebook-cell://attached-container%2B7b22636f6e7461696e65724e616d65223a222f72696c6962726172696573646576656e76227d/mnt/code/libraries/testprocessing.ipynb#X11sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m img \u001b[39m=\u001b[39m cv2\u001b[39m.\u001b[39mimread(\u001b[39m'\u001b[39m\u001b[39m/mnt/dataset/baseimages/1.jpg\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m----> <a href='vscode-notebook-cell://attached-container%2B7b22636f6e7461696e65724e616d65223a222f72696c6962726172696573646576656e76227d/mnt/code/libraries/testprocessing.ipynb#X11sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m out \u001b[39m=\u001b[39m mf\u001b[39m.\u001b[39;49mhoughlineprocessing(img)\n",
|
||||
"File \u001b[0;32m/mnt/code/autocropper/myfunctions.py:1042\u001b[0m, in \u001b[0;36mhoughlineprocessing\u001b[0;34m(image)\u001b[0m\n\u001b[1;32m 1041\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mhoughlineprocessing\u001b[39m(image):\n\u001b[0;32m-> 1042\u001b[0m croppedanddeskewed, _ \u001b[39m=\u001b[39m houghlinedeskewandcrop(image)\n\u001b[1;32m 1043\u001b[0m \u001b[39m##IF IT DOESN'T CHANGE THE IMAGE (CHANGE THE _ TO SOMETHING USEFUL), THEN CROPCLARIFYING SHOULD JUST DO THE TEXT ISOLATION SECTION AND NOT TRY AND WHITE OUT ANY BACKGROUND.\u001b[39;00m\n\u001b[1;32m 1044\u001b[0m \u001b[39m## IF THERE'S NO CROPPING, MAYBE EVEN JUMP RIGHT TO USING THE EXTERNAL DESKEW FIRST BEFORE TOSSING IT INTO CROPCLARIFYING\u001b[39;00m\n\u001b[1;32m 1046\u001b[0m postprocessed \u001b[39m=\u001b[39m cropclarifying(croppedanddeskewed)\n",
|
||||
"File \u001b[0;32m/mnt/code/autocropper/myfunctions.py:452\u001b[0m, in \u001b[0;36mhoughlinedeskewandcrop\u001b[0;34m(image)\u001b[0m\n\u001b[1;32m 446\u001b[0m rotationangle \u001b[39m=\u001b[39m houghlinedeskewangle(dst1)\n\u001b[1;32m 448\u001b[0m \u001b[39m# -----------------end of finding angle to deskew-----------------\u001b[39;00m\n\u001b[1;32m 449\u001b[0m \n\u001b[1;32m 450\u001b[0m \u001b[39m## -----------------deskewing and then cropping-----------------\u001b[39;00m\n\u001b[0;32m--> 452\u001b[0m \u001b[39mreturn\u001b[39;00m houghlinedeskewthencrop(croppedogimage, dst1, rotationangle)\n",
|
||||
"File \u001b[0;32m/mnt/code/autocropper/myfunctions.py:420\u001b[0m, in \u001b[0;36mhoughlinedeskewthencrop\u001b[0;34m(baseimage, preppedimage, rotationangle)\u001b[0m\n\u001b[1;32m 414\u001b[0m scaledrect \u001b[39m=\u001b[39m (\u001b[39mint\u001b[39m(rect[\u001b[39m0\u001b[39m]\u001b[39m*\u001b[39msizemultiplier), \u001b[39mint\u001b[39m(rect[\u001b[39m1\u001b[39m]\u001b[39m*\u001b[39msizemultiplier), \u001b[39mint\u001b[39m(rect[\u001b[39m2\u001b[39m]\u001b[39m*\u001b[39msizemultiplier), \u001b[39mint\u001b[39m(rect[\u001b[39m3\u001b[39m]\u001b[39m*\u001b[39msizemultiplier))\n\u001b[1;32m 416\u001b[0m croppedbaseimage \u001b[39m=\u001b[39m rotatedbaseimage[scaledrect[\u001b[39m1\u001b[39m]:scaledrect[\u001b[39m3\u001b[39m], scaledrect[\u001b[39m0\u001b[39m]:scaledrect[\u001b[39m2\u001b[39m], :]\n\u001b[0;32m--> 420\u001b[0m shrunkencbi, sizemultiplier \u001b[39m=\u001b[39m ResizeWithAspectRatio(croppedbaseimage, width\u001b[39m=\u001b[39;49m\u001b[39m1000\u001b[39;49m, retscale\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 421\u001b[0m gray \u001b[39m=\u001b[39m cv2\u001b[39m.\u001b[39mcvtColor(shrunkencbi, cv2\u001b[39m.\u001b[39mCOLOR_BGR2GRAY)\n\u001b[1;32m 422\u001b[0m thresh \u001b[39m=\u001b[39m cv2\u001b[39m.\u001b[39mthreshold(gray, \u001b[39m200\u001b[39m, \u001b[39m255\u001b[39m, cv2\u001b[39m.\u001b[39mTHRESH_BINARY)[\u001b[39m1\u001b[39m]\n",
|
||||
"File \u001b[0;32m/mnt/code/autocropper/myfunctions.py:27\u001b[0m, in \u001b[0;36mResizeWithAspectRatio\u001b[0;34m(image, width, height, inter, retscale)\u001b[0m\n\u001b[1;32m 23\u001b[0m dim \u001b[39m=\u001b[39m (width, \u001b[39mint\u001b[39m(h \u001b[39m*\u001b[39m r))\n\u001b[1;32m 25\u001b[0m \u001b[39mif\u001b[39;00m (retscale \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m):\n\u001b[1;32m 26\u001b[0m \u001b[39m# print(\"hi\")\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[39mreturn\u001b[39;00m (cv2\u001b[39m.\u001b[39;49mresize(image, dim, interpolation\u001b[39m=\u001b[39;49minter), \u001b[39m1\u001b[39m\u001b[39m/\u001b[39mr)\n\u001b[1;32m 28\u001b[0m \u001b[39mreturn\u001b[39;00m cv2\u001b[39m.\u001b[39mresize(image, dim, interpolation\u001b[39m=\u001b[39minter)\n",
|
||||
"\u001b[0;31merror\u001b[0m: OpenCV(4.5.4) ./modules/imgproc/src/resize.cpp:4051: error: (-215:Assertion failed) !ssize.empty() in function 'resize'\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"img = cv2.imread('/mnt/dataset/baseimages/1.jpg')\n",
|
||||
"out = mf.houghlineprocessing(img)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"showimgs(out)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# autocrop(\"/mnt/dataset/\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@ -12,3 +12,9 @@ RUN apt-get update && \
|
||||
apt-get install -y build-essential cmake git gdb pkg-config valgrind systemd-coredump python3-opencv libopencv-dev python3-pip python3-dev && \
|
||||
apt-get -y clean && apt-get -y autoremove
|
||||
|
||||
RUN pip3 install jupyter notebook
|
||||
|
||||
|
||||
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
|
||||
RUN pip3 install matplotlib && pip3 install deskew
|
||||
|
||||
|
||||
4
run.sh
4
run.sh
@ -115,7 +115,9 @@ for branch in ${branches[@]}; do
|
||||
# ${imagename}"
|
||||
|
||||
# echo "hi"
|
||||
docker run --rm --mount type=bind,source="$(pwd)"/code,target=/mnt/code -w "//mnt/code" -it -d --name "${dockercontainername}" ${DISPLAYFLAGS} \
|
||||
docker run --rm --mount type=bind,source="$(pwd)"/code,target=/mnt/code --mount type=bind,source="$(pwd)"/customreceiptdataset,target=/mnt/dataset \
|
||||
-w "//mnt/code" \
|
||||
-it -d --name "${dockercontainername}" ${DISPLAYFLAGS} \
|
||||
--memory=8g --cpus=6 \
|
||||
${extrarunflags} ${imagename}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user