Separating text extractor and text classifier.
Exactly what the title says. Signed-off-by: Ethan Wellenreiter <ewellenreiter@gmail.com>
This commit is contained in:
parent
a4d75fc6bd
commit
83306830ac
104
code/textdataretriever/textextractor/temp.ipynb
Normal file
104
code/textdataretriever/textextractor/temp.ipynb
Normal file
@ -0,0 +1,104 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import cv2\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"import myfunctions as mf\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"import scipy.stats as st\n",
|
||||
"import math\n",
|
||||
"\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"img = cv2.imread('./test_images/IMG_7594.jpg')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"out = mf.houghlineprocessing(img)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cv2.imshow(\"result2\", mf.ResizeWithAspectRatio(out, height=1000))\n",
|
||||
"cv2.waitKey(0)\n",
|
||||
"cv2.destroyAllWindows()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://medium.com/@vatvenger/extracting-lines-from-ocr-a8f410448fc\n",
|
||||
"# https://www.width.ai/post/the-best-ways-to-extract-text-from-images-without-tesseract-python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Potential Next Steps. Isolate a line of text and then feed that into the OCR Model to extract the text."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# #IDEA:\n",
|
||||
"# 1. Isolate lines into rectangles\n",
|
||||
"# 2. feed that rectangle portion of the image into an OCR model\n",
|
||||
"# 3. append that to the final output string with the end character for nextline\n",
|
||||
"# 4. give the whole final string to a model which gives the outputs"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user