receipt_indexer/code/autocropper/notebooks/helper_notebooks/saveaspng.ipynb
Ethan Wellenreiter 423b511dd9 Cleanup commit
Moved the testing notebooks around. Autocropping is essentially done,
with the exception of any new versions or converting the code to C.

Signed-off-by: Ethan Wellenreiter <ewellenreiter@gmail.com>
2023-10-18 22:48:24 -04:00

160 lines
4.2 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# from datasets import load_dataset, Image\n",
"import datasets as ds\n",
"import PIL\n",
"import torchvision.transforms.functional as tvf\n",
"from torchvision.transforms import v2\n",
"import random\n",
"import numpy as np\n",
"\n",
"import torchvision.utils as utils\n",
"\n",
"from tqdm.autonotebook import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"original_dataset = ds.load_dataset(\"aharley/rvl_cdip\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Create our own dataset from the images of the original dataset, but make each label the float value of the rotation.\n",
"# Apply the random rotation to all of the training images; the labels for the validation and test images should/can be 0.\n",
"trainblacklist = []\n",
"testblacklist = [33669] # index 33669 is just corrupted\n",
"validationblacklist = []\n",
"og_training_dataset = original_dataset['train'].select([i for i in range(len(original_dataset['train'])) if i not in trainblacklist])\n",
"og_testing_dataset = original_dataset['test'].select([i for i in range(len(original_dataset['test'])) if i not in testblacklist])\n",
"og_validation_dataset = original_dataset['validation'].select([i for i in range(len(original_dataset['validation'])) if i not in validationblacklist])\n",
"\n",
"tensorize = v2.Compose([v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
"\n",
"og_training_dataset.set_transform(tensorize)\n",
"og_testing_dataset.set_transform(tensorize)\n",
"og_validation_dataset.set_transform(tensorize)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "755255ae5bea49cc866c96f0d291b570",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/39999 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pbar = tqdm(og_testing_dataset)\n",
"\n",
"for i, entry in enumerate(pbar):\n",
" index = i\n",
" if (i >= 33669):\n",
" index = index + 1\n",
" utils.save_image(entry['image'], \"./datasetimages/test/\"+str(index)+\".jpg\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "88288b649a64430bb52e2ae5720e4b1f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/320000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pbar = tqdm(og_training_dataset)\n",
"\n",
"for i, entry in enumerate(pbar):\n",
" utils.save_image(entry['image'], \"./datasetimages/train/\"+str(i)+\".jpg\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ed6ce8bc3d224f278df6723fc0c41d72",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/40000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pbar = tqdm(og_validation_dataset)\n",
"\n",
"for i, entry in enumerate(pbar):\n",
" utils.save_image(entry['image'], \"./datasetimages/validation/\"+str(i)+\".jpg\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}