Start work on the Go backend

Implemented password hashing and hash verification (with salting). The database
connection still needs to be added.
Also implemented the basic settings ingestion for connecting to Redis as a session-token store.

Signed-off-by: Ethan Wellenreiter <ewellenreiter@gmail.com>
This commit is contained in:
Ethan Wellenreiter 2025-04-03 15:59:40 -04:00
parent 7fe6adad99
commit f718158a0c
109 changed files with 413 additions and 23375 deletions

13
backend/config.yaml Normal file
View File

@ -0,0 +1,13 @@
# Connection settings for the Redis session-token store.
# The Go loader skips empty values and keeps its built-in defaults
# (host "localhost", port 6379, empty password, DB 0, protocol 2).
redis:
  # NOTE(review): the Go loader in this commit switches on the key "host";
  # confirm that "address" is the name it is meant to consume.
  address:
  # Must parse as a 16-bit unsigned integer; values below 3000 are rejected.
  port:
  # Parsed as an unsigned integer. The loader matches the lower-case key "db" —
  # presumably relying on case-insensitive key handling in viper; verify.
  DB:
  # Parsed as an unsigned integer.
  protocol:
  password:
# Connection settings for the application database. The commit message states
# that the database connection is not implemented yet.
database:
  address:
  port:
  user:
  password:

31
backend/src/go.mod Normal file
View File

@ -0,0 +1,31 @@
module backend
go 1.24.2
replace signin => ./session_processing
require (
github.com/spf13/viper v1.20.1
signin v0.0.0-00010101000000-000000000000
)
require (
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/fsnotify/fsnotify v1.8.0 // indirect
github.com/go-viper/mapstructure/v2 v2.2.1 // indirect
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/redis/go-redis/v9 v9.7.3 // indirect
github.com/sagikazarmark/locafero v0.7.0 // indirect
github.com/sourcegraph/conc v0.3.0 // indirect
github.com/spf13/afero v1.12.0 // indirect
github.com/spf13/cast v1.7.1 // indirect
github.com/spf13/pflag v1.0.6 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
go.uber.org/atomic v1.9.0 // indirect
go.uber.org/multierr v1.9.0 // indirect
golang.org/x/crypto v0.36.0 // indirect
golang.org/x/sys v0.31.0 // indirect
golang.org/x/text v0.23.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

64
backend/src/go.sum Normal file
View File

@ -0,0 +1,64 @@
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/go-viper/mapstructure/v2 v2.2.1 h1:ZAaOCxANMuZx5RCeg0mBdEZk7DZasvvZIxtHqx8aGss=
github.com/go-viper/mapstructure/v2 v2.2.1/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M=
github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM=
github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/sagikazarmark/locafero v0.7.0 h1:5MqpDsTGNDhY8sGp0Aowyf0qKsPrhewaLSsFaodPcyo=
github.com/sagikazarmark/locafero v0.7.0/go.mod h1:2za3Cg5rMaTMoG/2Ulr9AwtFaIppKXTRYnozin4aB5k=
github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo=
github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0=
github.com/spf13/afero v1.12.0 h1:UcOPyRBYczmFn6yvphxkn9ZEOY65cpwGKb5mL36mrqs=
github.com/spf13/afero v1.12.0/go.mod h1:ZTlWwG4/ahT8W7T0WQ5uYmjI9duaLQGy3Q2OAl4sk/4=
github.com/spf13/cast v1.7.1 h1:cuNEagBQEHWN1FnbGEjCXL2szYEXqfJPbP2HNUaca9Y=
github.com/spf13/cast v1.7.1/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo=
github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o=
github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.20.1 h1:ZMi+z/lvLyPSCoNtFCpqjy0S4kPbirhpTMwl8BkW9X4=
github.com/spf13/viper v1.20.1/go.mod h1:P9Mdzt1zoHIG8m2eZQinpiBjo6kCmZSKBClNNqjJvu4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU=
go.uber.org/atomic v1.9.0 h1:ECmE8Bn/WFTYwEW/bpKD3M8VtR/zQVbavAoalC1PYyE=
go.uber.org/atomic v1.9.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/multierr v1.9.0 h1:7fIwc/ZtS0q++VgcfqFDxSBZVv/Xo49/SYnDFupUwlI=
go.uber.org/multierr v1.9.0/go.mod h1:X2jQV1h+kxSjClGpnseKVIxpmcjrj7MNnI0bnlfKTVQ=
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

48
backend/src/main.go Normal file
View File

@ -0,0 +1,48 @@
package main
import (
"fmt"
"signin"
"github.com/spf13/viper"
)
// setupConfig tells viper to look for a YAML file named "config", registering
// ".." and "." as search paths, and then loads it.
//
// A missing file is tolerated: nothing is loaded and the function returns
// quietly. Any other failure (the file was found but could not be read or
// parsed) is printed instead of being silently discarded, so that running on
// an unintended configuration is visible to the operator.
func setupConfig() {
	viper.SetConfigName("config")
	viper.SetConfigType("yaml")
	viper.AddConfigPath("..")
	viper.AddConfigPath(".")

	err := viper.ReadInConfig()
	if err == nil {
		return
	}
	if _, notFound := err.(viper.ConfigFileNotFoundError); notFound {
		// Running without a config file is allowed.
		return
	}
	// The file exists but is unusable; surface the reason rather than hide it.
	fmt.Println("reading config:", err)
}
// main loads the configuration, passes its "redis" section to
// signin.InitializeRedis and prints the returned error value.
func main() {
	// Manual password-hashing checks kept from development:
	//   params := signin.DefaultArgon2Params
	//   test, _ := params.GeneratePassEncoding("hello")
	//   fmt.Println(test)
	//   fmt.Println(signin.CheckPasswordAgainstEncoding("hello", test))
	//   fmt.Println(signin.CheckPasswordAgainstEncoding("hello1", test))

	setupConfig()

	// Manual configuration inspection kept from development:
	//   fmt.Println(viper.AllKeys())
	//   fmt.Println(viper.Get("redis"))

	redisConf := viper.GetStringMapString("redis")
	initErr := signin.InitializeRedis(redisConf)
	fmt.Println(initErr)
}

View File

@ -0,0 +1,14 @@
module signin
go 1.24.2
require (
github.com/redis/go-redis/v9 v9.7.3
golang.org/x/crypto v0.36.0
)
require (
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
golang.org/x/sys v0.31.0 // indirect
)

View File

@ -0,0 +1,14 @@
github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM=
github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA=
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik=
golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=

View File

@ -0,0 +1,128 @@
package signin
// credit to https://www.alexedwards.net/blog/how-to-hash-and-verify-passwords-with-argon2-in-go
import (
"fmt"
"runtime"
"strings"
"golang.org/x/crypto/argon2"
"crypto/rand"
"crypto/subtle"
"encoding/base64"
"errors"
)
// PUBLIC METHODS AND STRUCTURES
// Argon2params groups the inputs of one argon2id derivation. The values are
// handed to argon2.IDKey and, apart from the two lengths, written into the
// encoded hash string.
type Argon2params struct {
	// Memory is the memory argument of argon2.IDKey (encoded as "m=");
	// Iterations is its time argument (encoded as "t=").
	Memory, Iterations uint32

	// Parallelism is the threads argument of argon2.IDKey (encoded as "p=").
	Parallelism uint8

	// SaltLength is the number of random salt bytes to generate;
	// KeyLength is the number of derived key bytes to request.
	SaltLength, KeyLength uint32
}
// DefaultArgon2Params is the parameter set offered to callers that do not
// want to choose their own: memory 64*1024, one iteration, a 16-byte salt, a
// 32-byte key, and one thread per logical CPU (capped at 255).
var DefaultArgon2Params = &Argon2params{
	Memory:      64 * 1024,
	Iterations:  1,
	Parallelism: defaultParallelism(),
	SaltLength:  16,
	KeyLength:   32,
}

// defaultParallelism returns the logical CPU count clamped to the uint8 range.
// A plain uint8 conversion would wrap on hosts with 256 or more CPUs (256
// becomes 0), yielding a meaningless thread count.
func defaultParallelism() uint8 {
	const maxThreads = 255
	n := runtime.NumCPU()
	if n > maxThreads {
		return maxThreads
	}
	return uint8(n)
}
// GeneratePassEncoding derives an argon2id key from password using the
// receiver's parameters and a freshly generated salt of p.SaltLength bytes.
//
// The returned string has the layout
//
//	$argon2id$v=<version>$m=<memory>,t=<iterations>,p=<parallelism>$<salt>$<hash>
//
// with salt and hash in unpadded standard base64. The only error path is a
// failure to generate the salt.
func (p *Argon2params) GeneratePassEncoding(password string) (encoding string, err error) {
	var salt []byte
	if salt, err = generateRandomBytes(p.SaltLength); err != nil {
		return "", err
	}

	key := argon2.IDKey([]byte(password), salt, p.Iterations, p.Memory, p.Parallelism, p.KeyLength)

	// Salt and key share the same unpadded base64 alphabet.
	b64 := base64.RawStdEncoding
	return fmt.Sprintf(
		"$argon2id$v=%d$m=%d,t=%d,p=%d$%s$%s",
		argon2.Version, p.Memory, p.Iterations, p.Parallelism,
		b64.EncodeToString(salt), b64.EncodeToString(key),
	), nil
}
// CheckPasswordAgainstEncoding reports whether password produces the hash
// stored in encodedHash.
//
// The parameters and salt are recovered from encodedHash itself, so the check
// works for any parameter set the string was created with. A decoding failure
// is returned as err with match == false; a clean mismatch is (false, nil).
func CheckPasswordAgainstEncoding(password string, encodedHash string) (match bool, err error) {
	params, salt, want, err := decodeHash(encodedHash)
	if err != nil {
		return false, err
	}

	// Re-derive the key from the candidate password under identical settings.
	got := argon2.IDKey([]byte(password), salt, params.Iterations, params.Memory, params.Parallelism, params.KeyLength)

	// Compare in constant time to help prevent timing attacks.
	return subtle.ConstantTimeCompare(want, got) == 1, nil
}
// PRIVATE STUFF
// Errors reported while decoding an encoded hash string.
var (
	// ErrInvalidHash signals that the encoded hash does not have the
	// expected "$"-separated layout.
	ErrInvalidHash = errors.New("the encoded hash is not in the correct format")

	// ErrIncompatibleVersion signals that the hash was produced by a
	// different argon2 version than the one this package links against.
	ErrIncompatibleVersion = errors.New("incompatible version of argon2")
)
// decodeHash splits an encoded hash of the form
//
//	$argon2id$v=<version>$m=<memory>,t=<iterations>,p=<parallelism>$<salt>$<hash>
//
// into its parameters, raw salt and raw hash. SaltLength and KeyLength of the
// returned parameters are taken from the decoded byte lengths.
//
// ErrInvalidHash is returned when the string does not have exactly this
// layout, names a variant other than argon2id, or carries zero iterations,
// zero parallelism or an empty hash. ErrIncompatibleVersion is returned when
// the version differs from argon2.Version. Errors from number parsing and
// base64 decoding are returned unchanged. On any error all other results are nil.
func decodeHash(encodedHash string) (p *Argon2params, salt []byte, hash []byte, err error) {
	vals := strings.Split(encodedHash, "$")
	// A leading "$" yields an empty first element; the second names the variant.
	// Only argon2id is accepted because that is the only variant this package
	// derives keys with.
	if len(vals) != 6 || vals[0] != "" || vals[1] != "argon2id" {
		return nil, nil, nil, ErrInvalidHash
	}

	var version int
	if _, err = fmt.Sscanf(vals[2], "v=%d", &version); err != nil {
		return nil, nil, nil, err
	}
	if version != argon2.Version {
		return nil, nil, nil, ErrIncompatibleVersion
	}

	p = &Argon2params{}
	if _, err = fmt.Sscanf(vals[3], "m=%d,t=%d,p=%d", &p.Memory, &p.Iterations, &p.Parallelism); err != nil {
		return nil, nil, nil, err
	}
	// Reject cost values that cannot describe a real derivation before they
	// are handed to the key-derivation call by the caller.
	if p.Iterations == 0 || p.Parallelism == 0 {
		return nil, nil, nil, ErrInvalidHash
	}

	salt, err = base64.RawStdEncoding.Strict().DecodeString(vals[4])
	if err != nil {
		return nil, nil, nil, err
	}
	p.SaltLength = uint32(len(salt))

	hash, err = base64.RawStdEncoding.Strict().DecodeString(vals[5])
	if err != nil {
		return nil, nil, nil, err
	}
	// An empty stored hash has nothing to compare against.
	if len(hash) == 0 {
		return nil, nil, nil, ErrInvalidHash
	}
	p.KeyLength = uint32(len(hash))

	return p, salt, hash, nil
}
// generateRandomBytes returns saltLen bytes read from crypto/rand. If the
// read fails, the error is returned and the slice is nil.
func generateRandomBytes(saltLen uint32) ([]byte, error) {
	buf := make([]byte, saltLen)
	if _, err := rand.Read(buf); err != nil {
		return nil, err
	}
	return buf, nil
}

View File

@ -0,0 +1,101 @@
package signin
import (
"errors"
"strconv"
"time"
"github.com/redis/go-redis/v9"
)
// TODO: implement a periodic function to clean up the redis database of old tokens.

// redis_client is the package-wide Redis handle. It is nil until
// InitializeRedis has processed its settings successfully and assigned it.
var redis_client *redis.Client
// session_data describes one session record: the token, the user name it
// belongs to and the time at which it expires. Nothing visible in this file
// creates or reads it yet.
type session_data struct {
	token, username string
	expiresAt       time.Time
}
// redisSettings holds the connection parameters that InitializeRedis hands to
// the Redis client.
type redisSettings struct {
	host     string
	port     uint16
	password string
	db       uint64
	protocol uint64
}

// Errors returned by processSettingsMap when a setting cannot be used.
var (
	// ErrInvalidRedisPort: the port is not a 16-bit unsigned integer or is below 3000.
	ErrInvalidRedisPort = errors.New("Invalid Redis Port")
	// ErrInvalidRedisDB: the DB value is not an unsigned integer.
	ErrInvalidRedisDB = errors.New("Invalid Redis DB mode")
	// ErrInvalidRedisProtocol: the protocol value is not an unsigned integer.
	ErrInvalidRedisProtocol = errors.New("Invalid Redis Protocol")
)

// processSettingsMap overlays the non-empty entries of settings onto
// setting_struct, leaving every field without a usable entry untouched.
//
// Recognised keys are "host" (or "address", the name used in config.yaml),
// "port", "password", "db" and "protocol"; unknown keys are ignored. When both
// "host" and "address" are present and non-empty, "host" wins so the outcome
// does not depend on map iteration order.
//
// It returns ErrInvalidRedisPort, ErrInvalidRedisDB or ErrInvalidRedisProtocol
// for a value that fails validation. Fields assigned before the failing entry
// was reached stay modified.
func processSettingsMap(settings map[string]string, setting_struct *redisSettings) error {
	for key, val := range settings {
		if val == "" {
			continue
		}
		switch key {
		case "host":
			setting_struct.host = val
		case "address":
			// Alias for "host"; an explicit non-empty "host" takes precedence.
			if settings["host"] == "" {
				setting_struct.host = val
			}
		case "port":
			port, err := strconv.ParseUint(val, 10, 16)
			if err != nil || port < 3000 {
				return ErrInvalidRedisPort
			}
			setting_struct.port = uint16(port)
		case "password":
			setting_struct.password = val
		case "db":
			db, err := strconv.ParseUint(val, 10, 64)
			if err != nil {
				return ErrInvalidRedisDB
			}
			setting_struct.db = db
		case "protocol":
			protocol, err := strconv.ParseUint(val, 10, 64)
			if err != nil {
				return ErrInvalidRedisProtocol
			}
			// Store into protocol; writing this into db would clobber the
			// database index and leave the protocol at its default.
			setting_struct.protocol = protocol
		}
	}
	return nil
}
// Login is a placeholder with an empty body; it currently does nothing.
func Login() { // TODO: add a session entry into the Redis store
}
// ValidateSession is intended to check token against the sessions held in
// Redis. It is currently a stub: token is ignored and false is always returned.
func ValidateSession(token string) bool { // TODO: check if it's a valid session against the redis database
	return false
}
// InitializeRedis builds the package-wide Redis client from settings.
//
// It starts from the defaults host "localhost", port 6379, empty password,
// DB 0 and protocol 2, lets processSettingsMap override them, and stores the
// resulting client in redis_client. If a setting is rejected, that error is
// returned and redis_client is left as it was.
//
// The function only constructs the client; it issues no command to the server
// itself. The address is formed as host + ":" + port, so the host is used
// verbatim (an IPv6 literal would have to be supplied already bracketed).
func InitializeRedis(settings map[string]string) error {
	cfg := redisSettings{
		host:     "localhost",
		port:     6379,
		password: "",
		db:       0,
		protocol: 2,
	}
	if err := processSettingsMap(settings, &cfg); err != nil {
		return err
	}

	addr := cfg.host + ":" + strconv.FormatUint(uint64(cfg.port), 10)
	opts := &redis.Options{
		Addr:     addr,
		Password: cfg.password,
		DB:       int(cfg.db),
		Protocol: int(cfg.protocol),
	}
	redis_client = redis.NewClient(opts)
	return nil
}

View File

@ -1,17 +0,0 @@
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**"
],
"defines": [],
"compilerPath": "/usr/bin/gcc",
// "cStandard": "c17",
// "cppStandard": "gnu++17",
"intelliSenseMode": "linux-gcc-x64",
"configurationProvider": "ms-vscode.cmake-tools"
}
],
"version": 4
}

View File

@ -1,65 +0,0 @@
{
"C_Cpp.errorSquiggles": "enabled",
"files.associations": {
"array": "cpp",
"atomic": "cpp",
"bit": "cpp",
"*.tcc": "cpp",
"cctype": "cpp",
"chrono": "cpp",
"clocale": "cpp",
"cmath": "cpp",
"compare": "cpp",
"complex": "cpp",
"concepts": "cpp",
"condition_variable": "cpp",
"cstdarg": "cpp",
"cstddef": "cpp",
"cstdint": "cpp",
"cstdio": "cpp",
"cstdlib": "cpp",
"cstring": "cpp",
"ctime": "cpp",
"cwchar": "cpp",
"cwctype": "cpp",
"deque": "cpp",
"list": "cpp",
"map": "cpp",
"set": "cpp",
"string": "cpp",
"unordered_map": "cpp",
"vector": "cpp",
"exception": "cpp",
"algorithm": "cpp",
"functional": "cpp",
"iterator": "cpp",
"memory": "cpp",
"memory_resource": "cpp",
"numeric": "cpp",
"random": "cpp",
"ratio": "cpp",
"string_view": "cpp",
"system_error": "cpp",
"tuple": "cpp",
"type_traits": "cpp",
"utility": "cpp",
"fstream": "cpp",
"initializer_list": "cpp",
"iomanip": "cpp",
"iosfwd": "cpp",
"iostream": "cpp",
"istream": "cpp",
"limits": "cpp",
"mutex": "cpp",
"new": "cpp",
"numbers": "cpp",
"ostream": "cpp",
"semaphore": "cpp",
"sstream": "cpp",
"stdexcept": "cpp",
"stop_token": "cpp",
"streambuf": "cpp",
"thread": "cpp",
"typeinfo": "cpp"
},
}

View File

@ -1,24 +0,0 @@
# Build script for the autocropper project: a single C++ executable,
# CropperEx, linked against OpenCV.
cmake_minimum_required(VERSION 3.22)
project(autocropper
VERSION 0.1
DESCRIPTION "Autocrops Receipt Pictures"
LANGUAGES CXX)
# Globbing: collect every .cpp file under src/ recursively into SOURCE_FILES.
file(GLOB_RECURSE SOURCE_FILES src/*.cpp)
# The executable is built from main.cpp plus the globbed sources.
add_executable(CropperEx main.cpp ${SOURCE_FILES})
# Earlier explicit source list, kept for reference:
# add_executable(CropperEx main.cpp
# src/dog.cpp
# src/operations.cpp)
# Require C++20 for this target.
target_compile_features(CropperEx PRIVATE cxx_std_20)
# OpenCV must be installed; configuration fails otherwise (REQUIRED).
find_package(OpenCV REQUIRED)
target_link_libraries(CropperEx ${OpenCV_LIBS})
# Header search paths: the project's include/ directory, the stbimagehelpers
# directory one level above this one, and the OpenCV headers.
target_include_directories(CropperEx
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../externallibraries/stbimagehelpers
PRIVATE ${OpenCV_INCLUDE_DIRS})

View File

@ -1,721 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n",
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n"
]
}
],
"source": [
"import cv2\n",
"import myfunctions as mf\n",
"import numpy as np\n",
"import math\n",
"import scipy.stats as st"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pathlib\n",
"import time\n",
"\n",
"def removeextensionandnumeric(filename):\n",
" suffix = pathlib.Path(filename).suffix\n",
" num = filename[:-len(suffix)]\n",
" numint = int(num)\n",
" return numint\n",
" \n",
"\n",
"def testondataset(pathtodataset, function):\n",
" imagefileextensions = [\".jpg\", \".png\"]\n",
" filenames = next(os.walk(pathtodataset), (None, None, []))[2]\n",
" \n",
" filenames.sort(key=removeextensionandnumeric)\n",
" # print(filenames)\n",
" outs = []\n",
" tdiffs = []\n",
" for filename in filenames:\n",
" suffix = pathlib.Path(filename).suffix\n",
" if (suffix not in imagefileextensions):\n",
" print(\"Not a valid image \"+filename)\n",
" continue\n",
" img = cv2.imread(pathtodataset+filename)\n",
" t1 = time.time()\n",
" outs.append(function(img))\n",
" tdiffs.append(time.time() - t1)\n",
" tdiffs = np.array(tdiffs)\n",
" print(\"average time: \" + str(np.mean(tdiffs))+\"(s)\")\n",
" return outs\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def showimgs(imgs):\n",
" if (isinstance(imgs, np.ndarray)):\n",
" if (imgs.shape[0] > imgs.shape[1]):\n",
" cv2.imshow(\"test\", mf.ResizeWithAspectRatio(imgs, height=1350))\n",
" else:\n",
" cv2.imshow(\"test\", mf.ResizeWithAspectRatio(imgs, width=1000))\n",
" else:\n",
" for i, out in enumerate(imgs):\n",
" if (out.shape[0] > out.shape[1]):\n",
" cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, height=1350))\n",
" else:\n",
" cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, width=1000))\n",
" cv2.waitKey(0)\n",
" cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def writeimgs(directorypath, imgs):\n",
" if (isinstance(imgs, np.ndarray)):\n",
" cv2.imwrite(directorypath+\"test.png\", imgs)\n",
" else:\n",
" for i, out in enumerate(imgs):\n",
" cv2.imwrite(directorypath+\"test\"+str(i)+\".png\", out)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"img = cv2.imread('/mnt/dataset/baseimages/12.jpg')\n",
"# img = cv2.imread('/mnt/code/autocropper/test_images/IMG_7605.jpg')\n",
"testall = False"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"## NEED TO FIX THE EARLIER PARTS SO THAT IT DOESN'T HAVE THOSE BLACK SECTIONS AFTER THE ROTATION\n",
"\n",
"\n",
"def whiteoutbackground(image):\n",
" ogshape = image.shape\n",
" shrunkdim=1000\n",
" if (image.shape[1] > image.shape[0]):\n",
" shrunkimg, scaler = mf.ResizeWithAspectRatio(image, width=shrunkdim, retscale=True)\n",
" else:\n",
" shrunkimg, scaler = mf.ResizeWithAspectRatio(image, height=shrunkdim, retscale=True)\n",
" \n",
" mainimage = shrunkimg\n",
" \n",
" sdim = int(min(mainimage.shape[0], mainimage.shape[1])/5)\n",
" srkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (sdim, sdim))\n",
" skernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (sdim, sdim))\n",
" \n",
" \n",
" lab = cv2.cvtColor(mainimage, cv2.COLOR_BGR2LAB)\n",
" \n",
" imglist = []\n",
" # imglist.append(mainimage)\n",
" \n",
" labl = lab[:,:,0]\n",
" # imglist.append(labl)\n",
" # imglist.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))\n",
" laba = lab[:,:,1]\n",
" # imglist.append(laba)\n",
" labb = lab[:,:,2]\n",
" # imglist.append(labb)\n",
" \n",
" \n",
" # canny = cv2.Canny(labl, 0, 500)\n",
" threshl = cv2.threshold(labl, 0, 255, cv2.THRESH_OTSU)[1]\n",
" # return threshl\n",
" \n",
" \n",
" dim = int(min(mainimage.shape[0], mainimage.shape[1])/100)\n",
" # dim = 2\n",
" # dim = dotsize\n",
" kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (dim, dim))\n",
" kernelell = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dim, dim))\n",
" \n",
" paddedl = mf.padWithColour(threshl, sdim*2, sdim*2, fill=0)\n",
" # return paddedl\n",
" \n",
" \n",
" # morphedl = 255-cv2.morphologyEx(255-threshl, cv2.MORPH_OPEN, kernel, iterations=3)\n",
" morphedl = paddedl\n",
" # morphedl = cv2.morphologyEx(morphedl, cv2.MORPH_ERODE, kernel, iterations=1)\n",
" morphed1l = cv2.morphologyEx(morphedl, cv2.MORPH_ERODE, kernelell, iterations=1)\n",
"\n",
" # return morphedl\n",
" \n",
" contours, heirarchy = cv2.findContours(morphed1l, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
" biggestcontour = max(contours, key=cv2.contourArea)\n",
" \n",
" \n",
" blank = np.full(labl.shape, 255, dtype=np.uint8)\n",
" mask1 = blank.copy()\n",
" mask1 = mf.padWithColour(mask1, sdim*2, sdim*2, fill=255)\n",
" mask1 = cv2.drawContours(mask1, [biggestcontour], -1, 0, thickness=cv2.FILLED)\n",
" \n",
" \n",
" mask1 = cv2.morphologyEx(mask1, cv2.MORPH_DILATE, kernelell, iterations=2)\n",
" \n",
" \n",
" # mask1 = mask1[(sdim*2):-(sdim*2), (sdim*2):-(sdim*2)]\n",
" # return mask1\n",
" \n",
" # morphed2l = mf.padWithColour(morphedl, sdim*2, sdim*2, fill=255)\n",
" morphed2l = cv2.morphologyEx(morphedl, cv2.MORPH_OPEN, kernel, iterations=1)\n",
" # morphed2l = morphed2l[(sdim*2):-(sdim*2), (sdim*2):-(sdim*2)]\n",
" \n",
" # return morphed2l\n",
" # print(mask1.shape)\n",
" # print(morphed2l.shape)\n",
" morphed2l = cv2.bitwise_or(morphed2l, 255-mask1)\n",
" # return morphed2l\n",
" \n",
" morphed2l = morphed2l[(sdim*2):-(sdim*2), (sdim*2):-(sdim*2)]\n",
" temp_final = cv2.bitwise_or(threshl, 255-morphed2l)\n",
" return temp_final\n",
" \n",
" canny = cv2.Canny(morphed2l, 0, 500)\n",
" # return canny\n",
"\n",
" vminlength = mainimage.shape[0]//10\n",
" vmaxgap = mainimage.shape[0]//50\n",
" vlinesP = cv2.HoughLinesP(canny, 1, np.pi / 180, 10, None, vminlength, vmaxgap)\n",
" \n",
" hminlength = mainimage.shape[1]//15\n",
" hmaxgap = mainimage.shape[1]//40\n",
" hlinesP = cv2.HoughLinesP(canny, 1, np.pi / 180, 10, None, hminlength, hmaxgap)\n",
" # print(linesP)\n",
" \n",
" vmarginlines = mf.WithinXDegrees(vlinesP, 15)\n",
" hmarginlines = mf.WithinXDegrees(hlinesP, 15, baseangle=90)\n",
" \n",
" marginlines = np.append(vmarginlines, hmarginlines, axis=0)\n",
" # marginlines = marginlines.astype(int)\n",
" # # print(marginlines)\n",
" # reshaped = np.reshape(marginlines, (-1,1, 2))\n",
" # # reshaped = cv2.convexHull(reshaped)\n",
" # # print(reshaped)\n",
" \n",
" \n",
" \n",
" colourdst = cv2.cvtColor(morphedl, cv2.COLOR_GRAY2BGR)\n",
" # out = cv2.drawContours(colourdst, [reshaped], -1, (0,255,0), thickness=3)\n",
" # return out\n",
" \n",
" \n",
" #### NEW IDEA: MERGE THE WHITEOUT BACKGROUND AND TEXT CLARIFICATION STEP BECAUSE DOING THE OTSU THRESHOLD SEEMS TO WORK PRETTY WELL AND IF I JUST WHITE OUT THE OUTER AREA (ACTUALLY WHITE)\n",
" # THEN I HAVE JUST THE TEXT\n",
" \n",
"\n",
" if marginlines is not None:\n",
" for l in marginlines:\n",
" cv2.line(colourdst, (int(l[0]), int(l[1])), (int(l[2]), int(l[3])), (0,0,255), 3, cv2.LINE_AA)\n",
" return colourdst\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" ## IDEA:\n",
" # MASK OUT THE WORDS USING OUR MASKS MADE FROM THE STUFF BELOW. THEN WHEN CANNY IS DONE TO IT, IT SHOULDN'T HAVE A WHOLE BUNCH OF SHIT IN THE CENTER. STILL NEED TO FIGURE OUT HOW TO LINK THE HOUGH LINES AROUND THE RECEIPT\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" # morphedl = 255-cv2.morphologyEx(255-threshl, cv2.MORPH_OPEN, kernel, iterations=3)\n",
" morphedl = paddedl\n",
" morphedl = cv2.morphologyEx(morphedl, cv2.MORPH_ERODE, kernel, iterations=1)\n",
" morphedl = cv2.morphologyEx(morphedl, cv2.MORPH_ERODE, kernelell, iterations=1)\n",
"\n",
" # return morphedl\n",
" \n",
" contours, heirarchy = cv2.findContours(morphedl, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
" # print(contours[0].shape)\n",
" print(contours[0])\n",
" biggestcontour = max(contours, key=cv2.contourArea)\n",
" return canny\n",
" \n",
" \n",
" blank = np.full(labl.shape, 255, dtype=np.uint8)\n",
" mask1 = blank.copy()\n",
" mask1 = mf.padWithColour(mask1, sdim*2, sdim*2, fill=255)\n",
" mask1 = cv2.drawContours(mask1, [biggestcontour], -1, 0, thickness=cv2.FILLED)\n",
" \n",
" \n",
" mask1 = mask1[(sdim*2):-(sdim*2), (sdim*2):-(sdim*2)]\n",
" \n",
" \n",
" # resizemask = cv2.resize(mask1, (ogshape[1], ogshape[0]))\n",
" # return resizemask\n",
" maskc = cv2.cvtColor(mask1, cv2.COLOR_GRAY2BGR)\n",
" # print(maskc.shape)\n",
" # print(image.shape)\n",
" whitedbackground = cv2.bitwise_or(mainimage, maskc)\n",
" # return whitedbackground\n",
" \n",
" \n",
" lab2 = cv2.cvtColor(whitedbackground, cv2.COLOR_BGR2LAB)\n",
" \n",
" lab2l = lab2[:,:,0]\n",
" \n",
" \n",
" otsu2 = cv2.threshold(lab2l, 0, 255, cv2.THRESH_OTSU)[1]\n",
" \n",
" expandedmask1 = cv2.morphologyEx(mask1, cv2.MORPH_DILATE, kernel, iterations=1)\n",
" expandedmask1 = cv2.morphologyEx(expandedmask1, cv2.MORPH_DILATE, kernelell, iterations=1)\n",
" # return expandedmask1\n",
" \n",
" maskmerge = cv2.bitwise_and(otsu2, 255-expandedmask1)\n",
" return mask1\n",
" return maskmerge\n",
" \n",
" # return otsu2\n",
" \n",
" mpad = mf.padWithColour(maskmerge, sdim*2, sdim*2, fill=0)\n",
" return mpad\n",
" \n",
" #MORPHOLOGIES \n",
" morphed2 = cv2.morphologyEx(mpad, cv2.MORPH_ERODE, kernel, iterations=1)\n",
" morphed2 = cv2.morphologyEx(morphed2, cv2.MORPH_ERODE, kernelell, iterations=1)\n",
" return morphed2\n",
" \n",
" contours, heirarchy = cv2.findContours(morphed2, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
" biggestcontour = max(contours, key=cv2.contourArea)\n",
" \n",
" \n",
" mask2 = blank.copy()\n",
" mask2 = mf.padWithColour(mask2, sdim*2, sdim*2, fill=255)\n",
" mask2 = cv2.drawContours(mask2, [biggestcontour], -1, 0, thickness=cv2.FILLED)\n",
" \n",
" \n",
" mask2 = mask2[(sdim*2):-(sdim*2), (sdim*2):-(sdim*2)]\n",
" \n",
" return mask2\n",
" \n",
" test = cv2.inpaint(whitedbackground, resizemask, 3, cv2.INPAINT_TELEA)\n",
" \n",
" return test\n",
" \n",
" contours, heirarchy = cv2.findContours(255-labl, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)\n",
" \n",
" imgout = cv2.drawContours(mainimage, contours, -1, (0,255,0), thickness=3)\n",
" return imgout\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def textleaver(image):\n",
" ogshape = image.shape\n",
" shrunkdim=1000\n",
" if (image.shape[1] > image.shape[0]):\n",
" shrunkimg, scaler = mf.ResizeWithAspectRatio(image, width=shrunkdim, retscale=True)\n",
" else:\n",
" shrunkimg, scaler = mf.ResizeWithAspectRatio(image, height=shrunkdim, retscale=True)\n",
" \n",
" mainimage = shrunkimg\n",
" \n",
" sdim = int(min(mainimage.shape[0], mainimage.shape[1])/5)\n",
" srkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (sdim, sdim))\n",
" skernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (sdim, sdim))\n",
" \n",
" oglab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)\n",
" lab = cv2.cvtColor(mainimage, cv2.COLOR_BGR2LAB)\n",
" \n",
" imglist = []\n",
" # imglist.append(mainimage)\n",
" \n",
" labl = lab[:,:,0]\n",
" oglabl = oglab[:,:,0]\n",
" # # imglist.append(labl)\n",
" # # imglist.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))\n",
" # laba = lab[:,:,1]\n",
" # # imglist.append(laba)\n",
" # labb = lab[:,:,2]\n",
" # # imglist.append(labb)\n",
" \n",
" divisor = 1.5\n",
" window = int(min(labl.shape)/divisor)\n",
" window = window if window%2 == 1 else window + 1\n",
" # canny = cv2.Canny(labl, 0, 500)\n",
" ethreshl = cv2.threshold(labl, 0, 255, cv2.THRESH_OTSU)[1]\n",
" threshl = cv2.threshold(labl, 0, 255, cv2.THRESH_OTSU)[1]\n",
" # threshl = cv2.adaptiveThreshold(labl, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, window, 35)\n",
" \n",
" \n",
" ogwindow = int(min(oglabl.shape)/divisor)\n",
" ogwindow = window if window%2 == 1 else window + 1\n",
" print(ogwindow)\n",
" ogthreshl = cv2.threshold(oglabl, 0, 255, cv2.THRESH_TRIANGLE)[1]\n",
" return ogthreshl\n",
" # ogthreshl = cv2.adaptiveThreshold(oglabl, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, ogwindow, 35)\n",
" # return threshl\n",
" \n",
" colourthresh = cv2.cvtColor(threshl, cv2.COLOR_GRAY2BGR)\n",
" \n",
" dim = int(min(mainimage.shape[0], mainimage.shape[1])/100)\n",
" # dim = 2\n",
" # dim = dotsize\n",
" dim = max(3,dim)\n",
" kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (dim, dim))\n",
" kernelell = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dim, dim))\n",
" \n",
" # paddedl = mf.padWithColour(threshl, sdim*2, sdim*2, fill=0)\n",
" paddedl = threshl\n",
" # return paddedl\n",
" \n",
" \n",
" # morphedl = 255-cv2.morphologyEx(255-threshl, cv2.MORPH_OPEN, kernel, iterations=3)\n",
" morphedl = paddedl\n",
" morphed1l = cv2.morphologyEx(morphedl, cv2.MORPH_ERODE, kernel, iterations=1)\n",
" # morphed1l = cv2.morphologyEx(morphed1l, cv2.MORPH_OPEN, kernel, iterations=1)\n",
" # morphed1l = cv2.morphologyEx(morphed1l, cv2.MORPH_OPEN, kernel, iterations=1)\n",
" # morphed1l = cv2.morphologyEx(morphedl, cv2.MORPH_ERODE, kernelell, iterations=2)\n",
" \n",
" emorphed1l = cv2.morphologyEx(ethreshl, cv2.MORPH_ERODE, kernel, iterations=1)\n",
"\n",
" # return morphedl\n",
" \n",
" contours, heirarchy = cv2.findContours(morphed1l, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
" biggestcontour = max(contours, key=cv2.contourArea)\n",
" \n",
" # temp = cv2.drawContours(colourthresh, [biggestcontour], -1, (0,255,0), thickness=1)\n",
" # return temp\n",
" \n",
" \n",
" blank = np.full(labl.shape, 255, dtype=np.uint8)\n",
" mask1 = blank.copy()\n",
" # mask1 = mf.padWithColour(mask1, sdim*2, sdim*2, fill=255)\n",
" mask1 = cv2.drawContours(mask1, [biggestcontour], -1, 0, thickness=cv2.FILLED)\n",
" ## need to change the erosion so that if the paper goes to the edge, it doesn't get eroded in (because that means the paper is right to the edge and writing may be close)\n",
" \n",
" contours, heirarchy = cv2.findContours(morphed1l, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
" biggestcontour = max(contours, key=cv2.contourArea)\n",
" \n",
" emask1 = blank.copy()\n",
" emask1 = cv2.drawContours(emask1, [biggestcontour], -1, 0, thickness=cv2.FILLED)\n",
" \n",
" mask1 = 255-cv2.morphologyEx(255-mask1, cv2.MORPH_ERODE, kernel, iterations=2)\n",
" \n",
" emask1 = 255-cv2.morphologyEx(255-emask1, cv2.MORPH_ERODE, kernel, iterations=2)\n",
" \n",
" \n",
" # mask1 = mask1[(sdim*2):-(sdim*2), (sdim*2):-(sdim*2)]\n",
" # return mask1\n",
" \n",
" # morphed2l = mf.padWithColour(morphedl, sdim*2, sdim*2, fill=255)\n",
" morphed2l = cv2.morphologyEx(morphedl, cv2.MORPH_OPEN, kernel, iterations=1)\n",
" morphed2l = cv2.morphologyEx(morphedl, cv2.MORPH_ERODE, kernel, iterations=1)\n",
" # morphed2l = morphed2l[(sdim*2):-(sdim*2), (sdim*2):-(sdim*2)]\n",
" \n",
" # return morphed2l\n",
" # print(mask1.shape)\n",
" # print(morphed2l.shape)\n",
" morphed2l = cv2.bitwise_or(morphed2l, 255-mask1)\n",
" # return morphed2l\n",
"\n",
" # paddedthreshl = mf.padWithColour(morphed2l, sdim*2, sdim*2, fill=255)\n",
" # temp = cv2.drawContours(colourthresh, [biggestcontour], -1, (0,255,0), thickness=1)\n",
" # return temp\n",
"\n",
"\n",
" morphed2l = cv2.morphologyEx(morphed2l, cv2.MORPH_ERODE, kernel, iterations=1)\n",
" morphed2l = cv2.morphologyEx(morphed2l, cv2.MORPH_ERODE, kernelell, iterations=1)\n",
" # return morphed2l\n",
" # morphed2l = cv2.bitwise_or(morphed2l, 255-emask1)\n",
" \n",
" # morphed2l = morphed2l[(sdim*2):-(sdim*2), (sdim*2):-(sdim*2)]\n",
" \n",
" resizedmask = cv2.resize(255-morphed2l, (ogshape[1], ogshape[0]))\n",
" temp_final = cv2.bitwise_or(ogthreshl, resizedmask)\n",
" \n",
" dim=3\n",
" kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dim, dim))\n",
" temp_final = cv2.morphologyEx(temp_final, cv2.MORPH_OPEN, kernel)\n",
" temp_final = cv2.morphologyEx(temp_final, cv2.MORPH_OPEN, kernel)\n",
" # temp_final = cv2.morphologyEx(temp_final, cv2.MORPH_CLOSE, kernel)\n",
" # temp_final = cv2.morphologyEx(temp_final, cv2.MORPH_OPEN, kernel)\n",
" return temp_final"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def cropclarifying(image):\n",
"    \"\"\"Return the result of `textleaver(image)`.\n",
"\n",
"    Only the `textleaver` stage is active. The statements that used to follow\n",
"    the return could never execute, so the planned follow-up steps are kept\n",
"    below as notes instead of dead code.\n",
"    \"\"\"\n",
"    # Disabled earlier stages:\n",
"    #   whitedbackground = whiteoutbackground(image)\n",
"    #   textrefined = mf.textClarifying(whitedbackground)\n",
"    textrefined = textleaver(image)\n",
"\n",
"    # TODO: maybe this is where the line-removing function goes:\n",
"    #   lineout = mf.removeLinesFromText(textrefined)\n",
"    #   return lineout\n",
"    # TODO: implement a function that's called refine text\n",
"    return textrefined"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def houghlineprocessing(image):\n",
"    \"\"\"Run `mf.houghlinedeskewandcrop` on `image` and return the image it produces.\n",
"\n",
"    The helper returns an (image, angle) pair; the angle is not used here.\n",
"    Every later stage that was tried (cropclarifying, mf.croptoblack, the\n",
"    GRAY2BGR conversion, mf.externaldeskew / mf.receipttextdeskew and the\n",
"    debug imshow windows) is currently switched off, so the helper's image is\n",
"    returned unchanged.\n",
"    \"\"\"\n",
"    deskewedcrop, _unusedangle = mf.houghlinedeskewandcrop(image)\n",
"    return deskewedcrop"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# print(img.shape)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0\n"
]
}
],
"source": [
"# prepped, scaler, hp, vp = mf.squareandthenresize(img, fill=255, width=1000, returnscalerinfo=True)\n",
"outs = houghlineprocessing(img)\n",
"# outs = prepimageforhoughline(img, returnrect=True)\n",
"# print(img.shape)\n",
"# outs = houghlinedeskewandcrop(img)\n",
"# outs = outs[0]\n",
"# print(croprect)\n",
"# TODO: fix premorphCrop; it removes too much."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# shrunk, scaler, hp, vp = mf.squareandthenresize(img, fill=255, width=1000, returnscalerinfo=True)\n",
"# shrunk1, croprect = mf.premorphCrop(shrunk)\n",
"# print(croprect)\n",
"# print(int(30*4.032 - 0))\n",
"# # temp = img[100:, :, :]\n",
"# temp = shrunk[croprect[1]:croprect[1]+croprect[3], croprect[0]:croprect[0]+croprect[2], :]\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# cv2.imshow(\"temp\", mf.ResizeWithAspectRatio(out, height=1000))\n",
"# # cv2.imshow(\"shrunk1\", mf.ResizeWithAspectRatio(shrunk1, height=1000))\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"testall = True"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"if not testall:\n",
" showimgs(outs)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# # for out in outs:\n",
"# # if (out.shape[0] > out.shape[1]):\n",
"# # cv2.imshow(\"test1\", mf.ResizeWithAspectRatio(out, height=1000))\n",
"# # else:\\\n",
"# # cv2.imshow(\"test1\", mf.ResizeWithAspectRatio(out, width=1000))\n",
"# # key = cv2.waitKey(0)\n",
"# # cv2.destroyAllWindows()\n",
"# # if (key == 107):\n",
"# # break\n",
"# if (isinstance(outs, np.ndarray)):\n",
"# if (outs.shape[0] > outs.shape[1]):\n",
"# cv2.imshow(\"test\", mf.ResizeWithAspectRatio(outs, height=1350))\n",
"# else:\n",
"# cv2.imshow(\"test\", mf.ResizeWithAspectRatio(outs, width=1000))\n",
"# else:\n",
"# for i, out in enumerate(outs):\n",
"# if (out.shape[0] > out.shape[1]):\n",
"# cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, height=1350))\n",
"# else:\n",
"# cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, width=1000))\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9740282517223996\n",
"-2.0053522829578814\n",
"-0.9740282517223996\n",
"0.0\n",
"0.9740282517223996\n",
"-0.9740282517223996\n",
"-0.011669615052326776\n",
"2.0053522829578814\n",
"0.0\n",
"0.0\n",
"0.0\n",
"-2.979380534680281\n",
"0.0\n",
"0.0\n",
"-2.0053522829578814\n",
"-11.000789666511807\n",
"average time: 0.19967518746852875(s)\n"
]
}
],
"source": [
"if testall:\n",
" results = testondataset(\"/mnt/dataset/baseimages/\", houghlineprocessing)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# if testall:\n",
"# showimgs(results)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# print(results[0])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"if testall:\n",
" writeimgs(\"/mnt/code/autocropper/result_images/\", results)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,11 +0,0 @@
#ifndef CROPPER_H
#define CROPPER_H
#include <opencv2/opencv.hpp>
// Declaration of the cropper entry point.
//   src         - input image.
//   dst         - output array that receives the result.
//   fastsearch  - optional flag, defaults to true.
//   imageHeight - optional integer, defaults to 700.
// NOTE(review): the implementation is not visible from this header; presumably
// the bool result reports success and imageHeight is a working height in
// pixels -- confirm against the definition.
bool crop(cv::InputArray src, cv::OutputArray dst, bool fastsearch = true, int imageHeight = 700);
#endif //CROPPER_H

View File

@ -1 +0,0 @@
#define DEBUG 1

View File

@ -1,43 +0,0 @@
#include <cropper.h>
#include <opencv2/opencv.hpp>
// PLAN:
// Implement selective search
// Implement Canny edge detection and then find a good rectangle
// Do L2 loss with the corners of the rectangle and choose the selective search rectangle with the lowest loss
// For testing only; delete later.
#include <iostream>
// Test driver: crops the image named by argv[1], previews the result at a
// fixed height and saves that preview to ../testing_space/cropped.jpg.
// Returns 0 on success and -1 on bad usage, an unreadable input image, an
// empty crop output, or a failed save.
int main(int argc, char** argv) {
    // argv[0] is not guaranteed to be set; fall back to a fixed name for usage text.
    const char* program = (argc > 0 && argv[0] != nullptr) ? argv[0] : "cropper";

    if (argc < 2) {
        // Say what is actually wrong instead of printing an opaque marker.
        std::cerr << "Usage: " << program << " <Input image>" << std::endl;
        return -1;
    }

    cv::Mat imOut, result;
    imOut = cv::imread(argv[1]);
    if (imOut.empty()) {
        // Diagnostics belong on stderr so they do not mix with regular output.
        std::cerr << "Could not open or find the image!\n" << std::endl;
        std::cerr << "Usage: " << program << " <Input image>" << std::endl;
        return -1;
    }

    // NOTE(review): crop() returns a bool whose meaning is not visible from
    // this file, so only the output matrix is validated here.
    crop(imOut, result, true, 1000);
    if (result.empty()) {
        // Protects the division by result.rows below and the resize of an empty Mat.
        std::cerr << "Cropping produced an empty image." << std::endl;
        return -1;
    }

    // Scale to a fixed preview height while keeping the aspect ratio.
    const int imageHeight = 800;
    int newWidth = result.cols * imageHeight / result.rows;
    if (newWidth < 1) {
        newWidth = 1;  // cv::resize rejects a zero-sized target
    }
    cv::resize(result, result, cv::Size(newWidth, imageHeight));

    cv::imshow("banana", result);

    // imwrite reports failure through its return value; surface it instead of ignoring it.
    const bool saved = cv::imwrite("../testing_space/cropped.jpg", result);
    if (!saved) {
        std::cerr << "Could not write ../testing_space/cropped.jpg" << std::endl;
    }

    cv::waitKey();
    return saved ? 0 : -1;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,430 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# https://docs.opencv.org/3.4/d9/db0/tutorial_hough_lines.html\n",
"# https://medium.com/@9sphere/machine-vision-recipes-deskewing-document-images-e17827894c34\n",
"# https://towardsdatascience.com/pre-processing-in-ocr-fc231c6035a7"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#initially for deskewing and cropping. moving to a doc for just cropping now that deskewing"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import numpy as np\n",
"import math\n",
"import myfunctions as mf\n",
"\n",
"import scipy.stats as st"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# def ResizeWithAspectRatio(image, width=None, height=None, inter=cv2.INTER_AREA, retscale=False):\n",
"# dim = None\n",
"# (h, w) = image.shape[:2]\n",
"\n",
"# if width is None and height is None:\n",
"# if (retscale == True):\n",
"# return (image, 1)\n",
"# return image\n",
"# if width is None:\n",
"# r = height / float(h)\n",
"# dim = (int(w * r), height)\n",
"# else:\n",
"# r = width / float(w)\n",
"# dim = (width, int(h * r))\n",
"\n",
"# if (retscale == True):\n",
"# # print(\"hi\")\n",
"# return (cv2.resize(image, dim, interpolation=inter), 1/r)\n",
"# return cv2.resize(image, dim, interpolation=inter)\n",
"\n",
"\n",
"# class SquarePad:\n",
"# def __init__(self, fill):\n",
"# self.fill = fill\n",
" \n",
"# def __call__(self, image):\n",
"# w, h = image.shape[1], image.shape[0]\n",
"# max_wh = np.max([w, h])\n",
"# hp = int((max_wh - w) / 2)\n",
"# vp = int((max_wh - h) / 2)\n",
"# padding = (hp, vp, hp, vp)\n",
"# return cv2.copyMakeBorder(image, vp, vp, hp, hp, cv2.BORDER_CONSTANT, self.fill)\n",
" \n",
" \n",
" \n",
"# def rotate(img, angle):\n",
"# rows,cols = img.shape[0], img.shape[1]\n",
"# M = cv2.getRotationMatrix2D((cols/2,rows/2),angle,1)\n",
"# dst = cv2.warpAffine(img,M,(cols,rows))\n",
"# return dst"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# def morphologyCrop(image):\n",
"# # convert to grayscale\n",
"# gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)\n",
"\n",
"# # threshold\n",
"# thresh = cv2.threshold(gray, 170, 255, cv2.THRESH_BINARY)[1]\n",
"\n",
"# # apply morphology\n",
"# kernel = np.ones((7,7), np.uint8)\n",
"# morph = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)\n",
"# kernel = np.ones((9,9), np.uint8)\n",
"# morph = cv2.morphologyEx(morph, cv2.MORPH_ERODE, kernel)\n",
"\n",
"# # get largest contour\n",
"# contours = cv2.findContours(morph, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)\n",
"# contours = contours[0] if len(contours) == 2 else contours[1]\n",
"# area_thresh = 0\n",
"# for c in contours:\n",
"# area = cv2.contourArea(c)\n",
"# if area > area_thresh:\n",
"# area_thresh = area\n",
"# big_contour = c\n",
"\n",
"\n",
"# # get bounding box\n",
"# x,y,w,h = cv2.boundingRect(big_contour)\n",
"\n",
"# # draw filled contour on black background\n",
"# mask = np.zeros_like(gray)\n",
"# mask = cv2.merge([mask,mask,mask])\n",
"# cv2.drawContours(mask, [big_contour], -1, (255,255,255), cv2.FILLED)\n",
"\n",
"# # apply mask to input\n",
"# result1 = image.copy()\n",
"# result1 = cv2.bitwise_and(result1, mask)\n",
"\n",
"# # crop result\n",
"# result2 = result1[y:y+h, x:x+w]\n",
"# return result2"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# x = -2*np.pi/3\n",
"# print(x)\n",
"# print(np.pi/3)\n",
"# print(x % np.pi)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# def lineAngle(line):\n",
"# # print(line)\n",
"# angle = (math.atan2(line[3] - line[1], line[2] - line[0]) % np.pi) - (np.pi/2)\n",
"# return angle\n",
" \n",
"# def WithinXDegrees(lines, margin):\n",
"# # outlines = np.array([[]])\n",
"# outlines = np.empty((0, 4))\n",
"# # print(outlines.shape)\n",
"# for line in lines:\n",
"# # print(type(line))\n",
"# # print(abs(lineAngle(line[0])))\n",
"# if (np.rad2deg(abs(lineAngle(line[0]))) <= margin):\n",
"# outlines = np.append(outlines, [line[0]], axis=0)\n",
"# return outlines\n",
"\n",
"# def lineBoundingRect(lines):\n",
"# maxvals = lines.max(0)\n",
"# minvals = lines.min(0)\n",
"# boundingrect = (min(minvals[0],minvals[2]), min(minvals[1],minvals[3]), max(maxvals[0],maxvals[2]),max(maxvals[1],maxvals[3]))\n",
"# return boundingrect\n",
"# # print(lines.max(0))\n",
"# # print(type(lines))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"imagepath = './test_images/IMG_7605.jpg'\n",
"img = cv2.imread(imagepath)\n",
"# cv2.imread reports a missing or unreadable file by returning None, not by raising.\n",
"if img is None:\n",
"    raise FileNotFoundError(f'could not read image: {imagepath}')\n",
"# Pad to a square with white, rotate by 54 degrees, then resize and morphology-crop.\n",
"img = mf.SquarePad(fill=255)(img)\n",
"img = mf.rotate(img, 54)\n",
"img = mf.morphologyCrop(mf.ResizeWithAspectRatio(img,1000))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"# img = cv2.threshold(img, 200, 255, cv2.THRESH_BINARY)[1]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"Detected Lines (in red) - Standard Hough Line Transform\", mf.ResizeWithAspectRatio(mf.SquarePad(fill=255)(img), 1000))\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"resizedimg = mf.ResizeWithAspectRatio(mf.SquarePad(fill=255)(img), 500)\n",
"\n",
"# cv2.imshow(\"Detected Lines (in red) - Standard Hough Line Transform\", img)\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()\n",
"\n",
"gray = cv2.cvtColor(resizedimg ,cv2.COLOR_BGR2GRAY)\n",
"gray = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)[1]\n",
"cdst = resizedimg.copy()\n",
"\n",
"\n",
"dst = cv2.Canny(gray, 50, 200, None, 3)\n",
"lines = cv2.HoughLines(dst, 1, np.pi/180, 150, None, 0, 0)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# cv2.HoughLines returns None when it finds no line, so the count may only be\n",
"# taken after the None check (len(None) raises TypeError).\n",
"angles = np.zeros(0 if lines is None else len(lines))\n",
"if lines is not None:\n",
"    for i, line in enumerate(lines):\n",
"        # Each entry is [[rho, theta]]: the line in polar form.\n",
"        rho = line[0][0]\n",
"        theta = line[0][1]\n",
"        a = math.cos(theta)\n",
"        b = math.sin(theta)\n",
"        # (x0, y0) is the point on the line closest to the origin.\n",
"        x0 = a * rho\n",
"        y0 = b * rho\n",
"        # Step 1000 px along the line in both directions to get two endpoints.\n",
"        unroundedpt1 = (x0 + 1000*(-b), y0 + 1000*(a))\n",
"        unroundedpt2 = (x0 - 1000*(-b), y0 - 1000*(a))\n",
"        pt1 = (int(unroundedpt1[0]), int(unroundedpt1[1]))\n",
"        pt2 = (int(unroundedpt2[0]), int(unroundedpt2[1]))\n",
"        # print(math.atan2(unroundedpt2[1] - unroundedpt1[1], unroundedpt2[0] - unroundedpt1[0]) % np.pi)\n",
"        # print(lineAngle((unroundedpt1[0], unroundedpt1[1], unroundedpt2[0], unroundedpt2[1])))\n",
"        # angles[i] = math.atan2(unroundedpt2[1] - unroundedpt1[1], unroundedpt2[0] - unroundedpt1[0]) % np.pi\n",
"        # The angle uses the unrounded endpoints; the rounded ones are only for drawing.\n",
"        angles[i] = mf.lineAngle((unroundedpt1[0], unroundedpt1[1], unroundedpt2[0], unroundedpt2[1]))\n",
"        cv2.line(cdst, pt1, pt2, (0,0,255), 3, cv2.LINE_AA)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-56.7228217179515\n",
"-56.7228217179515\n"
]
}
],
"source": [
"# print(st.mode(np.around(angles, decimals=1)))\n",
"mode = st.mode(np.around(angles, decimals=2))[0]\n",
"print(np.rad2deg(mode))\n",
"# slope = math.tan(np.deg2rad(mode))\n",
"# print(slope)\n",
"# myy0 = 0\n",
"# p1 = [0,myy0]\n",
"# p2 = [0,myy0]\n",
"# while (math.dist(p1, p2) < 5000):\n",
"# p2[0] += 0.5\n",
"# p2[1] += 0.5*slope*1000\n",
"# p2[1] = int(p2[1])\n",
"# print(p2)\n",
"# cv2.line(cdst, p1, p2, (0,255,0), 3, cv2.LINE_AA)\n",
"# rotationangle = np.rad2deg(mode)-90\n",
"rotationangle = np.rad2deg(mode)\n",
"print(rotationangle)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"Detected Lines (in red) - Standard Hough Line Transform\", cdst)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# cv2.imshow(\"Detected Lines (in red) - Standard Hough Line Transform\", rotate(cdst,rotationangle))\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"rotatedimg = mf.SquarePad(fill=255)(mf.rotate(img, rotationangle))\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# cv2.imshow(\"Rotated Image\", ResizeWithAspectRatio(rotatedimg, 1000))\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"resizedrotatedimg = mf.ResizeWithAspectRatio(rotatedimg, 500)\n",
"gray1 = cv2.cvtColor(resizedrotatedimg, cv2.COLOR_BGR2GRAY)\n",
"dst1 = cv2.Canny(gray1, 0, 500, None, 3)\n",
"cdstP = resizedrotatedimg.copy()\n",
"cdstPmargin = cdstP.copy()\n",
"linesP = cv2.HoughLinesP(dst1, 1, np.pi / 180, 30, None, 100, 30)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"if linesP is not None:\n",
" for i in range(0, len(linesP)):\n",
" l = linesP[i][0]\n",
" cv2.line(cdstP, (l[0], l[1]), (l[2], l[3]), (0,0,255), 3, cv2.LINE_AA)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"Detected Lines (in red) - Standard Hough Line Transform\", cdstP)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# print(linesP)\n",
"marginlines = mf.WithinXDegrees(linesP, 2)\n",
"# print(marginlines)\n",
"if marginlines is not None:\n",
" for i in range(0, len(marginlines)):\n",
" l = marginlines[i]\n",
" cv2.line(cdstPmargin, (int(l[0]), int(l[1])), (int(l[2]), int(l[3])), (0,0,255), 3, cv2.LINE_AA)\n",
" \n",
"# boundingrectout = mf.lineBoundingRect(marginlines)\n",
"# # print(boundingrectout)\n",
"# cdstPmargin = cv2.rectangle(cdstPmargin,(int(boundingrectout[0]),int(boundingrectout[1])),(int(boundingrectout[2]),int(boundingrectout[3])),(0,255,0),2)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"Detected Lines (in red) - Standard Hough Line Transform\", cdstPmargin)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,385 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"version=2.0\n",
"cachepath=\"../.cache/\"\n",
"savepath=\"./savespot/\""
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch.utils.data import DataLoader\n",
"import torch.nn as nn\n",
"import torch.nn.functional as fn\n",
"import torch.optim as optim\n",
"import torchvision.transforms.functional as tvf\n",
"import torchvision.transforms.v2 as v2\n",
"import torchvision.models as models\n",
"import torchvision.transforms as t\n",
"\n",
"\n",
"from PIL import Image\n",
"\n",
"import datasets as ds\n",
"from tqdm.autonotebook import tqdm\n",
"\n",
"import random\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import numpy as np\n",
"\n",
"\n",
"torch.cuda.empty_cache()\n",
"\n",
"\n",
"import os\n",
"import cv2"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
"# array = np.load(\"./testing_space/outputarray.npy\")\n",
"# counter = np.load(\"./testing_space/counter.npy\")"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
"# print(array)\n",
"# print(counter)"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [],
"source": [
"class RotationDeterminer(nn.Module):\n",
"    \"\"\"ResNet-18 backbone followed by a small MLP head that outputs one scalar per image.\n",
"\n",
"    Args:\n",
"        new: forwarded to `models.resnet18(pretrained=...)`.\n",
"\n",
"    `forward` runs `self.imageprep` on its input, moves the result to\n",
"    `self.device`, and returns a 1-D tensor with one guessed rotation per image.\n",
"    `loss` is the MSE between a guess and the true answer.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, new=False):\n",
"        super(RotationDeterminer,self).__init__()\n",
"\n",
"        torch.cuda.empty_cache()\n",
"\n",
"        # is_available is a function and has to be called: the bare attribute is\n",
"        # always truthy, which selected cuda:0 even on machines without a GPU.\n",
"        self.device = torch.device(\"cpu\")\n",
"        if torch.cuda.is_available():\n",
"            self.device = torch.device(\"cuda:0\")\n",
"\n",
"        # Random augmentations, each applied with probability 0.25 (not used by forward).\n",
"        self.appliers = [v2.RandomApply(transforms=[v2.RandomPosterize(bits=1)], p=0.25),\n",
"                         v2.RandomApply(transforms=[v2.ElasticTransform(alpha=25.0)], p=0.25), # maybe add fill=appliedFill\n",
"                         v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5,9), sigma=(0.1,2.))],p=0.25),\n",
"                         v2.RandomApply(transforms=[v2.RandomEqualize()],p=0.25)]\n",
"\n",
"        # Earlier hand-written convolution stack, replaced by ResNet-18 below:\n",
"        # self.conv = nn.Sequential(nn.Conv2d(3, 9, kernel_size=11,stride=3), # 1100 x 1100 => 201 x 201\n",
"        #                           nn.ReLU(inplace=True),\n",
"        #                           nn.Conv2d(9, 18, kernel_size=5,stride=1),\n",
"        #                           nn.ReLU(inplace=True),\n",
"        #                           nn.MaxPool2d(kernel_size=4, stride=2),\n",
"        #                           nn.Conv2d(18, 36, kernel_size=3,stride=2),\n",
"        #                           nn.BatchNorm2d(36),\n",
"        #                           nn.ReLU(inplace=True),\n",
"        #                           nn.Conv2d(36, 72, kernel_size=3,stride=2),\n",
"        #                           nn.ReLU(inplace=True),\n",
"        #                           nn.AvgPool2d(kernel_size=5, stride=3),\n",
"        #                           nn.Conv2d(72, 144, kernel_size=3,stride=1),\n",
"        #                           nn.ReLU(inplace=True),\n",
"        #                           nn.Conv2d(144, 288, kernel_size=5,stride=1),\n",
"        #                           nn.ReLU(inplace=True),\n",
"        #                           nn.MaxPool2d(kernel_size=4, stride=1),\n",
"        #                           nn.Conv2d(288, 192, kernel_size=3,stride=1),\n",
"        #                           nn.ReLU(inplace=True),\n",
"        #                           nn.Conv2d(192, 192, kernel_size=3,stride=1), # => 1\n",
"        #                           nn.ReLU(inplace=True))\n",
"        # print(\"hi\")\n",
"        self.conv = models.resnet18(pretrained=new)\n",
"\n",
"        # The head consumes the 1000 outputs of resnet18 and regresses a single value.\n",
"        self.classifier = nn.Sequential(nn.Linear(1000, 4096),\n",
"                                        nn.ReLU(inplace=True),\n",
"                                        nn.Linear(4096,1))\n",
"\n",
"        self.lossfunc = nn.MSELoss()\n",
"\n",
"        # Preprocessing applied to every input of forward().\n",
"        self.imageprep = v2.Compose([self.SquarePad(),v2.Resize(512),v2.Grayscale(num_output_channels=3),v2.CenterCrop(512),v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
"\n",
"\n",
"    class SquarePad:\n",
"        \"\"\"Callable transform that edge-pads a tensor image using its last two dimensions.\"\"\"\n",
"\n",
"        def __call__(self, image):\n",
"            # print(\"hi type:\", type(image))\n",
"            temp = image.size()\n",
"            # NOTE(review): for a (..., H, W) tensor size()[-2] is the height and\n",
"            # size()[-1] the width, so `w` and `h` look swapped relative to the\n",
"            # (left, top, right, bottom) padding order below. Left unchanged because\n",
"            # saved weights may depend on this preprocessing -- confirm.\n",
"            w = temp[-2]\n",
"            h = temp[-1]\n",
"            max_wh = max([w, h])\n",
"            hp = int((max_wh - w) / 2)\n",
"            vp = int((max_wh - h) / 2)\n",
"            padding = (hp, vp, hp, vp)\n",
"            return tvf.pad(image, padding, 0, 'edge')\n",
"\n",
"\n",
"    def forward(self, image):\n",
"        \"\"\"Preprocess `image` (3-D single image or 4-D batch) and return a 1-D tensor of guesses.\n",
"\n",
"        Raises:\n",
"            Exception: if the preprocessed tensor is neither 3-D nor 4-D.\n",
"        \"\"\"\n",
"\n",
"        transformedimage = self.imageprep(image)\n",
"        transformedimage = transformedimage.to(self.device)\n",
"\n",
"        if (len(transformedimage.shape) != 4 and len(transformedimage.shape) != 3):\n",
"            raise Exception(\"Sorry, Dimension of image is incorrect (\", len(transformedimage.shape),\"). Expected a 3D (single image) or 4D (batch of images) tensor\")\n",
"\n",
"        # A single image gets a leading batch dimension so the backbone sees 4-D input.\n",
"        if (len(transformedimage.shape) == 3):\n",
"            x = transformedimage.unsqueeze(0)\n",
"        else:\n",
"            x = transformedimage\n",
"\n",
"        x = self.conv(x)\n",
"        # print(x.shape)\n",
"        # x = nn.Flatten(start_dim=-1)(x)\n",
"        # print(x.shape)\n",
"        x = self.classifier(x)\n",
"        # print(x.shape)\n",
"        # Collapse (batch, 1) to (batch,).\n",
"        guessRotation = nn.Flatten(start_dim=0)(x)\n",
"\n",
"        return guessRotation\n",
"\n",
"    def loss(self, guess, trueAnswer):\n",
"        \"\"\"Return the MSE loss between `guess` and `trueAnswer`.\"\"\"\n",
"        return self.lossfunc(guess, trueAnswer)"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"model = RotationDeterminer(new=True)\n",
"# is_available must be called: the bare function object is always truthy,\n",
"# which forced the CUDA branch even on CPU-only machines.\n",
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
"model = model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"# def ResizeWithAspectRatio(image, width=None, height=None, inter=cv2.INTER_AREA):\n",
"# dim = None\n",
"# (h, w) = image.shape[:2]\n",
"\n",
"# if width is None and height is None:\n",
"# return image\n",
"# if width is None:\n",
"# r = height / float(h)\n",
"# dim = (int(w * r), height)\n",
"# else:\n",
"# r = width / float(w)\n",
"# dim = (width, int(h * r))\n",
"\n",
"# return cv2.resize(image, dim, interpolation=inter)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 4032, 3024])\n",
"torch.Size([3, 4032, 3024])\n",
"0.7532281875610352\n"
]
}
],
"source": [
"working_dataset = ds.load_from_disk(cachepath + \"datasets/customrotation/\")\n",
"prepimage = v2.Compose([v2.Grayscale(num_output_channels=3),v2.Resize(512), v2.CenterCrop(512),v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
"tensorize = v2.Compose([v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
"grayscaler = v2.Grayscale(num_output_channels=3)\n",
"working_dataset.set_transform(prepimage)\n",
"# counter.npy holds the value used in the checkpoint file name (modelsave<counter>epochs).\n",
"counter = np.load(savepath + \"/v\"+str(version)+\"/counter.npy\")\n",
"# This notebook only runs inference, so put BatchNorm layers into evaluation mode\n",
"# (running statistics instead of per-batch statistics).\n",
"model.eval()\n",
"# map_location lets a checkpoint that was saved on a GPU load onto whichever device is in use.\n",
"model.load_state_dict(torch.load(savepath + \"/v\"+str(version)+\"/modelsave\" + str(counter) +\"epochs\", map_location=device))"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 800, 723])\n",
"torch.Size([3, 800, 723])\n",
"-1.3860492706298828\n"
]
}
],
"source": [
"filereadimage = cv2.imread(\"./testing_space/cropped.jpg\", 0)\n",
"# print(type(filereadimage))\n",
"tensorizedimage = torch.unsqueeze(torch.from_numpy(filereadimage),0)\n",
"print(tensorizedimage.shape)\n",
"adjustedtensorizedimage = tensorize(grayscaler(t.ToPILImage()(tensorizedimage)))\n",
"print(adjustedtensorizedimage.shape)\n",
"rotation = model(adjustedtensorizedimage).item()\n",
"print(rotation)\n",
"rotatedimage = t.Resize(size=1000)(tvf.rotate(adjustedtensorizedimage, rotation))\n",
"# imS = mf.ResizeWithAspectRatio(filereadimage, 1000)\n",
"# imS = cv2.resize(filereadimage, (960, 540)) \n",
"open_cv_image = np.array(t.ToPILImage()(rotatedimage))\n",
"cv2.imshow(f'image', open_cv_image)\n",
"key = cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"index = 0\n",
"active_dataset = working_dataset['test']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plt.imshow(t.ToPILImage()(working_dataset['test'][3]['image']), cmap='gray', vmin=0, vmax=255)\n",
"# plt.show()\n",
"# rotationapplier = model(working_dataset['test'][3]['image']).item()\n",
"# print(rotationapplier)\n",
"# img = tvf.rotate(working_dataset['test'][3]['image'], rotationapplier)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plt.imshow(t.ToPILImage()(img), cmap='gray', vmin=0, vmax=255)\n",
"# plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # To call the model on a bunch of the images and rotate them back\n",
"\n",
"# while(True):\n",
"# activeimage = active_dataset[index]['image']\n",
"# # img = cv2.imread(active_dataset[index]['image'], 0)\n",
"# activeimage = tvf.rotate(activeimage, model(activeimage).item())\n",
"# open_cv_image = np.array(t.ToPILImage()(activeimage))\n",
"# print(index)\n",
"# cv2.imshow(f'current image', open_cv_image)\n",
"# key = cv2.waitKey(0)\n",
"\n",
"# if key == ord('c'):\n",
"# print(\"\\tCopying this one\")\n",
"# elif key == ord('x'):\n",
"# index -= 1\n",
"# elif key == ord('v'):\n",
"# index +=1\n",
"# elif key == ord('q'):\n",
"# break\n",
"\n",
"# cv2.destroyAllWindows()\n",
"# cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # for trying to call the model on the picture repeatedly to see if it will just get more and more straight if it's called multiple times\n",
"\n",
"# currentimage = working_dataset['test'][3]['image']\n",
"# while(True):\n",
"# rotationapplier = model(currentimage).item()\n",
"# print(rotationapplier)\n",
"# img = tvf.rotate(currentimage, rotationapplier)\n",
"# open_cv_image = np.array(t.ToPILImage()(img))\n",
"# cv2.imshow(f'current image', open_cv_image)\n",
"# key = cv2.waitKey(0)\n",
" \n",
"# if key == ord('q'):\n",
"# break\n",
"# elif key == ord('v'):\n",
"# currentimage = img\n",
"# # cv2.destroyAllWindows()\n",
"# cv2.destroyAllWindows()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,417 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# from datasets import load_dataset, Image\n",
"import datasets as ds\n",
"import PIL\n",
"import torchvision.transforms.functional as tvf\n",
"from torchvision.transforms import v2\n",
"import random\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"original_dataset = ds.load_dataset(\"aharley/rvl_cdip\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Create our own dataset from the images of the original dataset, but make each label the float value of the rotation. Do the random rotation on all of the training images; the labels for the validation and test images should/can be 0.\n",
"trainblacklist = [5, 102664, 102667, 277943]\n",
"testblacklist = [6, 11, 14, 18, 27, 35, 37, 54, 33669] # 33669 is a corrupt image\n",
"validationblacklist = []\n",
"og_training_dataset = original_dataset['train'].select([i for i in range(len(original_dataset['train'])) if i not in trainblacklist])\n",
"og_testing_dataset = original_dataset['test'].select([i for i in range(len(original_dataset['test'])) if i not in testblacklist])\n",
"og_validation_dataset = original_dataset['validation'].select([i for i in range(len(original_dataset['validation'])) if i not in validationblacklist])\n",
"\n",
"# type(og_testing_dataset)\n",
"\n",
"# print(type(transform_picture(og_testing_dataset[0], params)))\n",
"# out = transform_picture(og_testing_dataset[0], params)\n",
"# print(out['image'])\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"319998\n"
]
}
],
"source": [
"print(len(og_training_dataset))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def has_valid_image(ex):\n",
" print(type(ex))\n",
" try:\n",
" PIL.Image.open(ex[\"image\"][\"path\"])\n",
" except Exception:\n",
" print(\"hi\")\n",
" return False\n",
" return True\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# dataset = original_dataset.cast_column(\"image\", ds.Image(decode=False))\n",
"# dataset = dataset.filter(has_valid_image)\n",
"# filtered_dataset = dataset.cast_column(\"image\", ds.Image(decode=True))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Parameter Declaration\n",
"minRotation=-180\n",
"maxRotation=180\n",
"minTranslation=0\n",
"maxTranslation=150\n",
"minScale = 0.4\n",
"maxScale = 1\n",
"minShear = 0\n",
"maxShear = 0\n",
"\n",
"minFill=255\n",
"maxFill=255\n",
"\n",
"params = {\"minRotation\":minRotation,\"maxRotation\":maxRotation,\"minTranslation\":minTranslation,\"maxTranslation\":maxTranslation,\"minScale\":minScale,\"maxScale\":maxScale,\"minShear\":minShear,\"maxShear\":maxShear,\"minFill\":minFill,\"maxFill\":maxFill}"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"class SquarePad:\n",
" def __init__(self, fill):\n",
" self.fill = fill\n",
" \n",
" def __call__(self, image):\n",
" w, h = image.size\n",
" max_wh = np.max([w, h])\n",
" hp = int((max_wh - w) / 2)\n",
" vp = int((max_wh - h) / 2)\n",
" padding = (hp, vp, hp, vp)\n",
" return tvf.pad(image, padding,fill=self.fill, padding_mode='constant')\n",
"\n",
"\n",
"\n",
"\n",
"def transform_picture(image_label, parameters):\n",
" image = image_label['image']\n",
"\n",
" appliedRotation = random.uniform(parameters['minRotation'], parameters['maxRotation'])\n",
" appliedXTranslation = random.uniform(parameters['minTranslation'], parameters['maxTranslation'])\n",
" appliedYTranslation = random.uniform(parameters['minTranslation'], parameters['maxTranslation'])\n",
" appliedScale = random.uniform(parameters['minScale'], parameters['maxScale'])\n",
" appliedFill = random.uniform(parameters['minFill'], parameters['maxFill'])\n",
" appliedXShear = random.uniform(parameters['minShear'], parameters['maxShear'])\n",
" appliedYShear = random.uniform(parameters['minShear'], parameters['maxShear'])\n",
" \n",
" appliers = v2.Compose([v2.RandomApply(transforms=[v2.RandomPosterize(bits=1)], p=0.25),\n",
" v2.RandomApply(transforms=[v2.ElasticTransform(alpha=25.0, fill=appliedFill)], p=0.25), # maybe add fill=appliedFill\n",
" v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5,9), sigma=(0.1,2.))],p=0.25),\n",
" v2.RandomApply(transforms=[v2.RandomEqualize()],p=0.25),\n",
" SquarePad(fill=appliedFill),v2.Resize(1100)])\n",
" \n",
" adjustedimage = tvf.affine(image, appliedRotation, [appliedXTranslation,appliedYTranslation], appliedScale, [appliedXShear, appliedYShear], fill=appliedFill)\n",
"\n",
" adjustedimage = appliers(adjustedimage)\n",
"\n",
" \n",
" return {'image':adjustedimage,'rotation':appliedRotation}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bd95dc0201c2419e982f8167e16db6b5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map (num_proc=4): 0%| | 0/39999 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"new_testing_dataset = og_testing_dataset.map(transform_picture, fn_kwargs={'parameters':params}, num_proc=4)\n",
"#33669 has bad EXIF data so it is ignored at load time"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "17c8b6a170ae4072b385b6d3e965d9e8",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map (num_proc=4): 0%| | 0/40000 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"new_validation_dataset = og_validation_dataset.map(transform_picture, fn_kwargs={'parameters':params}, num_proc=4)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "32d7adecb479420cb8a4b3eee898ec1b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map (num_proc=4): 0%| | 0/320000 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"new_training_dataset = og_training_dataset.map(transform_picture, fn_kwargs={'parameters':params}, num_proc=4)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def setlabelname(entry):\n",
" return {'image':entry['image'], 'rotation':entry['label']}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# new_testing_dataset = new_testing_dataset.map(setlabelname, num_proc=4, batch_size=700, batched=True, writer_batch_size=700)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# new_training_dataset = new_training_dataset.remove_columns(\"label\")\n",
"# new_testing_dataset = new_testing_dataset.remove_columns(\"label\")\n",
"# new_validation_dataset = new_validation_dataset.remove_columns(\"label\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"new_dataset = ds.DatasetDict({'train': new_training_dataset,'test': new_testing_dataset, 'validation': new_validation_dataset})\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['image', 'label', 'rotation'],\n",
" num_rows: 320000\n",
" })\n",
" test: Dataset({\n",
" features: ['image', 'label', 'rotation'],\n",
" num_rows: 39999\n",
" })\n",
" validation: Dataset({\n",
" features: ['image', 'label', 'rotation'],\n",
" num_rows: 40000\n",
" })\n",
"})\n"
]
}
],
"source": [
"print(new_dataset)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"new_dataset = new_dataset.remove_columns(\"label\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DatasetDict({\n",
" train: Dataset({\n",
" features: ['image', 'rotation'],\n",
" num_rows: 320000\n",
" })\n",
" test: Dataset({\n",
" features: ['image', 'rotation'],\n",
" num_rows: 39999\n",
" })\n",
" validation: Dataset({\n",
" features: ['image', 'rotation'],\n",
" num_rows: 40000\n",
" })\n",
"})\n"
]
}
],
"source": [
"print(new_dataset)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a614c8b5206649f0b774dc25909bca75",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/65 shards): 0%| | 0/320000 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bc7984d6eafe443aa4980e43350fee03",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/9 shards): 0%| | 0/39999 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ab4cc0e859bd46599345129fd70bcb37",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Saving the dataset (0/9 shards): 0%| | 0/40000 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"new_dataset.save_to_disk(\"../.cache/huggingfaces/datasets/customrotation/\", max_shard_size=\"500MB\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,159 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# from datasets import load_dataset, Image\n",
"import datasets as ds\n",
"import PIL\n",
"import torchvision.transforms.functional as tvf\n",
"from torchvision.transforms import v2\n",
"import random\n",
"import numpy as np\n",
"\n",
"import torchvision.utils as utils\n",
"\n",
"from tqdm.autonotebook import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"original_dataset = ds.load_dataset(\"aharley/rvl_cdip\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Create our own dataset from the images of the original dataset, but make each label the float value of the rotation. Do the random rotation on all of the training images; the labels for the validation and test images should/can be 0.\n",
"trainblacklist = []\n",
"testblacklist = [33669] # index 33669 is just corrupted\n",
"validationblacklist = []\n",
"og_training_dataset = original_dataset['train'].select([i for i in range(len(original_dataset['train'])) if i not in trainblacklist])\n",
"og_testing_dataset = original_dataset['test'].select([i for i in range(len(original_dataset['test'])) if i not in testblacklist])\n",
"og_validation_dataset = original_dataset['validation'].select([i for i in range(len(original_dataset['validation'])) if i not in validationblacklist])\n",
"\n",
"tensorize = v2.Compose([v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
"\n",
"og_training_dataset.set_transform(tensorize)\n",
"og_testing_dataset.set_transform(tensorize)\n",
"og_validation_dataset.set_transform(tensorize)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "755255ae5bea49cc866c96f0d291b570",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/39999 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pbar = tqdm(og_testing_dataset)\n",
"\n",
"for i, entry in enumerate(pbar):\n",
" index = i\n",
" if (i >= 33669):\n",
" index = index + 1\n",
" utils.save_image(entry['image'], \"./datasetimages/test/\"+str(index)+\".jpg\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "88288b649a64430bb52e2ae5720e4b1f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/320000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pbar = tqdm(og_training_dataset)\n",
"\n",
"for i, entry in enumerate(pbar):\n",
" utils.save_image(entry['image'], \"./datasetimages/train/\"+str(i)+\".jpg\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ed6ce8bc3d224f278df6723fc0c41d72",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/40000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"pbar = tqdm(og_validation_dataset)\n",
"\n",
"for i, entry in enumerate(pbar):\n",
" utils.save_image(entry['image'], \"./datasetimages/validation/\"+str(i)+\".jpg\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,430 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## ORIGINAL FILE FOR SELECTIVE SEGMENTATION SEARCH"
]
},
{
"cell_type": "code",
"execution_count": 350,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import numpy as np\n",
"from queue import PriorityQueue\n",
"import myfunctions as mf\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import random"
]
},
{
"cell_type": "code",
"execution_count": 351,
"metadata": {},
"outputs": [],
"source": [
"# def ResizeWithAspectRatio(image, width=None, height=None, inter=cv2.INTER_AREA):\n",
"# dim = None\n",
"# (h, w) = image.shape[:2]\n",
"\n",
"# if width is None and height is None:\n",
"# return image\n",
"# if width is None:\n",
"# r = height / float(h)\n",
"# dim = (int(w * r), height)\n",
"# else:\n",
"# r = width / float(w)\n",
"# dim = (width, int(h * r))\n",
"\n",
"# return cv2.resize(image, dim, interpolation=inter)"
]
},
{
"cell_type": "code",
"execution_count": 352,
"metadata": {},
"outputs": [],
"source": [
"import heapq as hq\n",
"\n",
"class MaxHeapObj(object):\n",
" def __init__(self, val): self.val = val\n",
" def __lt__(self, other): return self.val > other.val\n",
" def __eq__(self, other): return self.val == other.val\n",
" def __str__(self): return str(self.val)\n",
" \n",
"class MinHeap(object):\n",
" def __init__(self): self.h = []\n",
" def heappush(self, x): heapq.heappush(self.h, x)\n",
" def heappop(self): return heapq.heappop(self.h)\n",
" def __getitem__(self, i): return self.h[i]\n",
" def __len__(self): return len(self.h)\n",
" \n",
"class MaxHeap(MinHeap):\n",
" def heappush(self, x): heapq.heappush(self.h, MaxHeapObj(x))\n",
" def heappop(self): return heapq.heappop(self.h).val\n",
" def __getitem__(self, i): return self.h[i].val"
]
},
{
"cell_type": "code",
"execution_count": 353,
"metadata": {},
"outputs": [],
"source": [
"# def clip(n, lower, upper):\n",
"# return max(lower, min(n, upper))\n",
"\n",
"# def colourscaler(n, min, max):\n",
"# temp = n-min\n",
"# diff = abs(max - min)\n",
"# return clip((temp/diff)*255, 0, 255)"
]
},
{
"cell_type": "code",
"execution_count": 354,
"metadata": {},
"outputs": [],
"source": [
"# inline double clip(double n, double lower, double upper) {\n",
"# return std::max(lower, std::min(n, upper));\n",
"# };\n",
"\n",
"# inline double colourscaler(double n, double min, double max) {\n",
"# double temp = n - min;\n",
"# double diff = std::abs(max - min);\n",
"# return clip((temp / diff) * 255, 0, 255);\n",
"# };"
]
},
{
"cell_type": "code",
"execution_count": 355,
"metadata": {},
"outputs": [],
"source": [
"# ## Test this code for the masking/colour squishing. It can essentially just speed up clipping the edges.\n",
"# #!/usr/local/bin/python3\n",
"# import cv2 as cv\n",
"# import numpy as np\n",
"\n",
"# # Load the aerial image and convert to HSV colourspace\n",
"# image = cv.imread(\"aerial.png\")\n",
"# hsv=cv.cvtColor(image,cv.COLOR_BGR2HSV)\n",
"\n",
"# # Define lower and upper limits of what we call \"brown\"\n",
"# brown_lo=np.array([10,0,0])\n",
"# brown_hi=np.array([20,255,255])\n",
"\n",
"# # Mask image to only select browns\n",
"# mask=cv.inRange(hsv,brown_lo,brown_hi)\n",
"\n",
"# # Change image to red where we found brown\n",
"# image[mask>0]=(0,0,255)\n",
"\n",
"# cv.imwrite(\"result.png\",image)\n",
"\n",
"#CAN ALSO TRY USING NUMPY VECTORIZATION"
]
},
{
"cell_type": "code",
"execution_count": 356,
"metadata": {},
"outputs": [],
"source": [
"# def rotate(img, angle):\n",
"# rows,cols = img.shape[0], img.shape[1]\n",
"# M = cv2.getRotationMatrix2D((cols/2,rows/2),angle,1)\n",
"# dst = cv2.warpAffine(img,M,(cols,rows))\n",
"# return dst"
]
},
{
"cell_type": "code",
"execution_count": 357,
"metadata": {},
"outputs": [],
"source": [
"def crop(image, lower = 100, upper = 255, threshold1 = 50, threshold2 = 350):\n",
" lower = max(0,lower)\n",
" upper = min(255, upper)\n",
" gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)\n",
"\n",
" scaled_gray = np.zeros(gray.shape, gray.dtype)\n",
" \n",
" # for y in range(0,gray.shape[0]):\n",
" # for x in range(0,gray.shape[1]):\n",
" # scaled_gray[y][x] = colourscaler(gray[y][x], lower, upper)\n",
" scaled_gray = gray\n",
" \n",
" blurred = cv2.GaussianBlur(scaled_gray, (15,15),0)\n",
" # blurred = scaled_gray\n",
" edged = cv2.Canny(blurred, threshold1, threshold2)\n",
" # meangrayscale = cv2.mean(scaled_gray)[0]\n",
" # print(meangrayscale)\n",
" # edged = cv2.Canny(blurred, int(meangrayscale*2), int(meangrayscale*4))\n",
" return edged\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 358,
"metadata": {},
"outputs": [],
"source": [
"def selectiveSearchSegmentationImp(image):\n",
" ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()\n",
" ss.setBaseImage(image)\n",
" ss.switchToSelectiveSearchFast()\n",
" return ss.process()"
]
},
{
"cell_type": "code",
"execution_count": 359,
"metadata": {},
"outputs": [],
"source": [
"img = cv2.imread('./testing_space/final.jpg')"
]
},
{
"cell_type": "code",
"execution_count": 360,
"metadata": {},
"outputs": [],
"source": [
"# def rectArea(rect):\n",
"# # print(rect)\n",
"# return rect[2]*rect[3]\n",
"\n",
"# def biggestRects(n, rects):\n",
"# dict = {}\n",
"# # outrects = np.zeros(shape=(n, 4))\n",
"# for rect in rects:\n",
"# dict[tuple(rect)] = mf.rectArea(rect)\n",
"# # maxh.heappush(mf.rectArea(rect))\n",
"# # print(maxh[0])\n",
" \n",
" \n",
"# heap = [(-value, key) for key,value in dict.items()]\n",
"# largest = hq.nsmallest(n, heap)\n",
" \n",
"\n",
"# # hq.heapify(list(dict.items()))\n",
"# # for i in range(0,n):\n",
"# # outrects[i] = maxh.heappop()\n",
"# # print(outrects)\n",
"# return [key for value, key in largest]\n",
"\n",
"# def overlapRect(rects):\n",
"# leftwall = -1\n",
"# rightwall = -1\n",
"# topwall = -1\n",
"# bottomwall = -1\n",
"# for (x, y, w, h) in rects:\n",
"# if (leftwall == -1):\n",
"# leftwall = x\n",
"# rightwall = x + w\n",
"# topwall = y\n",
"# bottomwall = y + h\n",
"# continue\n",
"# leftwall = max(leftwall, x)\n",
"# rightwall = min(rightwall, x+w)\n",
"# topwall = max(topwall, y)\n",
"# bottomwall = min(bottomwall, y+h)\n",
" \n",
"# if (topwall >= bottomwall or leftwall >= rightwall):\n",
"# return (-1, -1, -1, -1)\n",
"# return (leftwall, topwall, rightwall-leftwall, bottomwall-topwall)"
]
},
{
"cell_type": "code",
"execution_count": 344,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(-1, -1, -1, -1)\n"
]
}
],
"source": [
"# rect = crop(img)\n",
"\n",
"# _, thresholded = cv2.threshold(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), 200, 255, cv2.THRESH_BINARY)\n",
"\n",
"rects = selectiveSearchSegmentationImp(cv2.GaussianBlur(ResizeWithAspectRatio(img,300), (15,15),0))\n",
"# mf.rectArea(rects[0])\n",
"bigRects = mf.biggestRects(20, rects)\n",
"# print(bigRects)\n",
"\n",
"finalrect = mf.overlapRect(bigRects)\n",
"print(finalrect)\n",
"output = ResizeWithAspectRatio(img,300)\n",
"for (x, y, w, h) in [finalrect]:\n",
"\t\t# draw the region proposal bounding box on the image\n",
"\t\tcolor = [random.randint(0, 255) for j in range(0, 3)]\n",
"\t\tcv2.rectangle(output, (x, y), (x + w, y + h), color, 2)\n",
"\n",
"# edges = cv2.Canny(cv2.GaussianBlur(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), (15,15),0),255 / 4, 255)\n",
"\n",
"# plt.imshow(edges, cmap='gray', vmin=0, vmax=255)\n",
"# plt.show()\n",
"\n",
"cv2.imshow(\"banana\", output)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()\n",
"\n",
"\n",
"# print(range(0,img.shape[1]))\n",
"# for i in range(0,img.shape[1]):\n",
"# print(i)"
]
},
{
"cell_type": "code",
"execution_count": 389,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1\n"
]
}
],
"source": [
"temp = ResizeWithAspectRatio(crop(img, threshold1=150, threshold2=350),500)\n",
"contours, _ = cv2.findContours(temp, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
"# print(type(contours))\n",
"# max(cv2.contourArea(contours))\n",
"# areas = list(map(cv2.contourArea, contours))\n",
"# print(areas)\n",
"contourindex = np.argmax(list(map(cv2.contourArea, contours)))\n",
"temp = cv2.drawContours(temp, contours, contourindex, (255,0,0), 2)\n",
"cv2.imshow(\"banana\", temp)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()\n",
"print(contourindex)\n",
"rect = cv2.boundingRect(contours[contourindex])\n",
"color = (random.randint(0,256), random.randint(0,256), random.randint(0,256))\n",
"result = cv2.rectangle(ResizeWithAspectRatio(img,500), rect, color, 3)"
]
},
{
"cell_type": "code",
"execution_count": 362,
"metadata": {},
"outputs": [],
"source": [
"# print(contourindex)"
]
},
{
"cell_type": "code",
"execution_count": 371,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"banana\", result)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 348,
"metadata": {},
"outputs": [],
"source": [
"# HSV = cv2.cvtColor(ResizeWithAspectRatio(img,500), cv2.COLOR_BGR2HSV)\n",
"# low = np.array([0,0,10])\n",
"# high = np.array([179,10,255])\n",
"\n",
"# mask = cv2.inRange(HSV,low,high)\n",
"\n",
"# cv2.imshow(\"banana\", mask)\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 349,
"metadata": {},
"outputs": [],
"source": [
" # cv::Mat gray, scaled_gray, blurred, edged;\n",
"\n",
" # lower = std::max(lower, 0);\n",
" # upper = std::min(upper, 255);\n",
"\n",
" # cv::cvtColor(src, gray, cv::COLOR_BGR2GRAY);\n",
" # scaled_gray = cv::Mat::zeros(gray.size(), gray.type());\n",
"\n",
" # for (int y = 0; y < gray.rows; y++) {\n",
" # for (int x = 0; x < gray.cols; x++) {\n",
" # scaled_gray.at<uchar>(y, x) =\n",
" # cv::saturate_cast<uchar>(colourscaler(gray.at<uchar>(y, x), lower, upper));\n",
" # }\n",
" # }\n",
"\n",
" # cv::GaussianBlur(scaled_gray, blurred, cv::Size(15, 15), 0);\n",
" # cv::Canny(blurred, edged, threshold1, threshold2);\n",
"\n",
" # std::vector<std::vector<cv::Point>> contours;\n",
" # std::vector<cv::Vec4i> heirarchy;\n",
" # cv::Mat approx;\n",
"\n",
" # cv::findContours(edged, contours, heirarchy, cv::RETR_TREE, cv::CHAIN_APPROX_SIMPLE);\n",
"\n",
" # cv::cvtColor(gray, gray, cv::COLOR_GRAY2BGR);\n",
"\n",
" # std::sort(contours.begin(), contours.end(), [](std::vector<cv::Point> a, std::vector<cv::Point> b) {\n",
" # return cv::arcLength(a, false) > cv::arcLength(b, false); });\n",
"\n",
" # int numContours = contours.size();\n",
"\n",
"\n",
" # return cv::boundingRect(contours[0]);"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,94 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import numpy as np\n",
"\n",
"import torch\n",
"import torchvision.transforms.functional as tvf\n",
"import torchvision.transforms.v2 as v2\n",
"import torchvision.transforms as t\n",
"import myfunctions as mf\n",
"\n",
"from skimage import io\n",
"from matplotlib import pyplot as plt\n",
"import time\n",
"\n",
"import myfunctions as mf"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# read image in colour (cv2.imread without a flag returns 3-channel BGR, not grayscale)\n",
"img = cv2.imread('./test_images/IMG_7594.jpg')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cropped = mf.morphologyCrop(img)\n",
"# rotated = deskew(cropped)\n",
"# cropped2 = morphologyCrop(rotated)\n",
"# cropped2 = selectiveSearchCrop(rotated)\n",
"# cropped3 = cannyEdgeCrop(cropped2)\n",
"cv2.imwrite(\"./testing_space/final.jpg\", cropped)\n",
"# final = rotate(cropped2, 180) # need to implement the code to determine if a doc is upside down"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"### Deskew seems to work \n",
"# Note licencing for the deskew package and "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,316 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This notebook can probably be deleted or moved elsewhere. It was the original code for the rowsumdeskew."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"src = 255 - cv2.imread('./testing_space/cropped1.jpg',0)\n",
"scores = []\n",
"\n",
"h,w = src.shape\n",
"small_dimention = min(h,w)\n",
"src = src[:small_dimention, :small_dimention]\n",
"\n",
"out = cv2.VideoWriter('./temp/video.avi',\n",
" cv2.VideoWriter_fourcc('M','J','P','G'),\n",
" 15, (320,320))\n",
"\n",
"src = cv2.threshold(src, 100, 255, cv2.THRESH_BINARY)[1]"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"def rotate(img, angle):\n",
" rows,cols = img.shape\n",
" M = cv2.getRotationMatrix2D((cols/2,rows/2),angle,1)\n",
" dst = cv2.warpAffine(img,M,(cols,rows))\n",
" return dst\n",
"\n",
"def sum_rows(img):\n",
" # Create a list to store the row sums\n",
" row_sums = []\n",
" # Iterate through the rows\n",
" for r in range(img.shape[0]-1):\n",
" # Sum the row\n",
" row_sum = sum(sum(img[r:r+1,:]))\n",
" # Add the sum to the list\n",
" row_sums.append(row_sum)\n",
" # Normalize range to (0,255)\n",
" row_sums = (row_sums/max(row_sums)) * 255\n",
" # Return\n",
" return row_sums\n",
"\n",
"def display_data(roi, row_sums, buffer): \n",
" # Create background to draw transform on\n",
" bg = np.zeros((buffer*2, buffer*2), np.uint8) \n",
" # Iterate through the rows and draw on the background\n",
" for row in range(roi.shape[0]-1):\n",
" row_sum = row_sums[row]\n",
" bg[row:row+1, :] = row_sum\n",
" left_side = int(buffer/3)\n",
" bg[:, left_side:] = roi[:,left_side:] \n",
" cv2.imshow('bg1', bg)\n",
" k = cv2.waitKey(1)\n",
" out.write(cv2.cvtColor(cv2.resize(bg, (320,320)), cv2.COLOR_GRAY2BGR))\n",
" return k\n"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"count = 0\n",
"othercount = 0\n",
"goodangle = 0"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# cv2.imshow('bg1', src)\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"found optimal rotation\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n",
"found optimal rotation\n"
]
}
],
"source": [
"# Rotate the image around in a circle\n",
"angle = 0\n",
"while angle <= 360:\n",
" # Rotate the source image\n",
" img = rotate(src, angle) \n",
" # Crop the center 1/3rd of the image (roi is filled with text)\n",
" h,w = img.shape\n",
" buffer = min(h, w) - int(min(h,w)/1.5)\n",
" roi = img[int(h/2-buffer):int(h/2+buffer), int(w/2-buffer):int(w/2+buffer)]\n",
" # Create background to draw transform on\n",
" bg = np.zeros((buffer*2, buffer*2), np.uint8)\n",
" # Compute the sums of the rows\n",
" row_sums = sum_rows(roi)\n",
" # High score --> Zebra stripes\n",
" score = np.count_nonzero(row_sums)\n",
" scores.append(score)\n",
" othercount = othercount + 1\n",
" # Image has best rotation\n",
" if score <= min(scores):\n",
" count = count + 1\n",
" # Save the rotated image\n",
" print('found optimal rotation')\n",
" best_rotation = img.copy()\n",
" goodangle = angle\n",
" k = display_data(roi, row_sums, buffer)\n",
" if k == 27: break\n",
" # Increment angle and try again\n",
" angle += .75\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"25\n",
"481\n",
"349.5\n"
]
}
],
"source": [
"print(count)\n",
"print(othercount)\n",
"print(goodangle)\n",
"cv2.imshow('bg1', best_rotation)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"start\")\n",
"\n",
"# Rotate the image around in a circle\n",
"angle = 0\n",
"while angle <= 360:\n",
" # Rotate the source image\n",
" img = rotate(src, angle) \n",
" # Crop the center 1/3rd of the image (roi is filled with text)\n",
" h,w = img.shape\n",
" buffer = min(h, w) - int(min(h,w)/1.5)\n",
" #roi = img.copy()\n",
" roi = img[int(h/2-buffer):int(h/2+buffer), int(w/2-buffer):int(w/2+buffer)]\n",
" # Create background to draw transform on\n",
" bg = np.zeros((buffer*2, buffer*2), np.uint8)\n",
" # Threshold image\n",
" _, roi = cv2.threshold(roi, 140, 255, cv2.THRESH_BINARY)\n",
" # Compute the sums of the rows\n",
" row_sums = sum_rows(roi)\n",
" # High score --> Zebra stripes\n",
" score = np.count_nonzero(row_sums)\n",
" if sum(row_sums) < 100000: scores.append(angle)\n",
" k = display_data(roi, row_sums, buffer)\n",
" if k == 27: break\n",
" # Increment angle and try again\n",
" angle += .5\n",
" print(\"loop\")\n",
"cv2.destroyAllWindows()\n",
"\n",
"print(\"endofrotate\")\n",
"\n",
"# Create images for display purposes\t\n",
"display = src.copy()\n",
"# Create an image that contains bins. \n",
"bins_image = np.zeros_like(display)\n",
"for angle in scores:\n",
" # Rotate the image and draw a line on it\n",
" display = rotate(display, angle) \n",
" cv2.line(display, (0,int(h/2)), (w,int(h/2)), 255, 1)\n",
" display = rotate(display, -angle)\n",
" # Rotate the bins image\n",
" bins_image = rotate(bins_image, angle)\n",
" # Draw a line on a temporary image\n",
" temp = np.zeros_like(bins_image)\n",
" cv2.line(temp, (0,int(h/2)), (w,int(h/2)), 50, 1)\n",
" # 'Fill' up the bins\n",
" bins_image += temp\n",
" bins_image = rotate(bins_image, -angle)\n",
" \n",
"print(\"endofbins\")\n",
"\n",
"# Find the most filled bin\n",
"for col in range(bins_image.shape[0]-1):\n",
"\tcolumn = bins_image[:, col:col+1]\n",
"\tif np.amax(column) == np.amax(bins_image): x = col\n",
"for col in range(bins_image.shape[0]-1):\n",
"\tcolumn = bins_image[:, col:col+1]\n",
"\tif np.amax(column) == np.amax(bins_image): y = col\n",
"# Draw circles showing the most filled bin\n",
"cv2.circle(display, (x,y), 560, 255, 5)\n",
"\n",
"print(\"plotting\")\n",
"\n",
"# Plot with Matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.image as mpimg\n",
"f, axarr = plt.subplots(1,3, sharex=True)\n",
"axarr[0].imshow(src)\n",
"axarr[1].imshow(display)\n",
"axarr[2].imshow(bins_image)\n",
"axarr[0].set_title('Source Image')\n",
"axarr[1].set_title('Output')\n",
"axarr[2].set_title('Bins Image')\n",
"axarr[0].axis('off')\n",
"axarr[1].axis('off')\n",
"axarr[2].axis('off')\n",
"plt.show()\n",
"\n",
"cv2.waitKey()\n",
"cv2.destroyAllWindows()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,777 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n",
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n"
]
}
],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as fn\n",
"import torch.optim as optim\n",
"import torchvision.transforms.functional as tvf\n",
"from torchvision.transforms import v2\n",
"from torch.utils.data import DataLoader\n",
"\n",
"from PIL import Image\n",
"\n",
"import datasets as ds\n",
"from tqdm.autonotebook import tqdm\n",
"\n",
"import random\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# original_dataset = ds.load_dataset(\"aharley/rvl_cdip\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"working_dataset = ds.load_from_disk(\"../.cache/huggingfaces/datasets/customrotation/\")\n",
"prepimage = v2.Compose([v2.Grayscale(num_output_channels=3),v2.Resize(1100), v2.CenterCrop(1100),v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
"working_dataset.set_transform(prepimage)\n",
"torch.cuda.empty_cache()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Parameter Declaration\n",
"minRotation=-180\n",
"maxRotation=180\n",
"minTranslation=0\n",
"maxTranslation=150\n",
"minScale = 0.4\n",
"maxScale = 1\n",
"minShear = 0\n",
"maxShear = 0\n",
"\n",
"minFill=0\n",
"maxFill=255\n",
"\n",
"params = {\"minRotation\":minRotation,\"maxRotation\":maxRotation,\"minTranslation\":minTranslation,\"maxTranslation\":maxTranslation,\"minScale\":minScale,\"maxScale\":maxScale,\"minShear\":minShear,\"maxShear\":maxShear,\"minFill\":minFill,\"maxFill\":maxFill}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def transform_picture(image_label, parameters):\n",
" image = image_label['image']\n",
"\n",
" appliedRotation = random.uniform(parameters['minRotation'], parameters['maxRotation'])\n",
" appliedXTranslation = random.uniform(parameters['minTranslation'], parameters['maxTranslation'])\n",
" appliedYTranslation = random.uniform(parameters['minTranslation'], parameters['maxTranslation'])\n",
" appliedScale = random.uniform(parameters['minScale'], parameters['maxScale'])\n",
" appliedFill = random.uniform(parameters['minFill'], parameters['maxFill'])\n",
" appliedXShear = random.uniform(parameters['minShear'], parameters['maxShear'])\n",
" appliedYShear = random.uniform(parameters['minShear'], parameters['maxShear'])\n",
" \n",
" appliers = [v2.RandomApply(transforms=[v2.RandomPosterize(bits=1)], p=0.25),\n",
" v2.RandomApply(transforms=[v2.ElasticTransform(alpha=25.0, fill=appliedFill)], p=0.25), # maybe add fill=appliedFill\n",
" v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5,9), sigma=(0.1,2.))],p=0.25),\n",
" v2.RandomApply(transforms=[v2.RandomEqualize()],p=0.25)]\n",
" \n",
" adjustedimage = tvf.affine(image, appliedRotation, [appliedXTranslation,appliedYTranslation], appliedScale, [appliedXShear, appliedYShear], fill=appliedFill)\n",
"\n",
" for applier in appliers:\n",
" adjustedimage = applier(adjustedimage)\n",
"\n",
" \n",
" adjustedimage = tvf.resize(adjustedimage, size=[1100,1100])\n",
" \n",
" return {'image':adjustedimage,'label':appliedRotation}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# # Create our own dataset from the images of the original dataset, but make each label the float value of the applied rotation. Apply the random rotation to all of the training images; the labels for the validation and test images should/can be 0.\n",
"# og_training_dataset = original_dataset['train']\n",
"# og_testing_dataset = original_dataset['test']\n",
"# og_validation_dataset = original_dataset['validation']\n",
"\n",
"# type(og_testing_dataset[0]['label'])\n",
"\n",
"# # type(transform_picture(og_testing_dataset[0], params))\n",
"# new_testing_dataset = og_testing_dataset.map(transform_picture, fn_kwargs={'parameters':params})"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# class WorkaroundDataset(torch.utils.data.Dataset):\n",
"# def __init__(self, dataset):\n",
"# self._dataset = dataset\n",
"\n",
"# def __len__(self):\n",
"# return len(self._dataset)\n",
"\n",
"# def __getitem__(self, idx):\n",
"# return v2.Compose([v2.ToImageTensor(), v2.ConvertImageDtype()])(self._dataset[idx]['image'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# # type(image_dataset['train'][0]['image'])\n",
"# # print(image_dataset['train'][0]['image'])\n",
"# img = image_dataset['train'][2]['image']\n",
"# # img\n",
"# # print(img.size)\n",
"# crop = tvf.resize(img, size=[500])\n",
"# # crop\n",
"# # print(crop.size)\n",
"# newimg = tvf.affine(crop, 180, [0,0], 0.7, 0)\n",
"# newimg"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# appliedRotation = random.uniform(minRotation, maxRotation)\n",
"# appliedXTranslation = random.uniform(minTranslation, maxTranslation)\n",
"# appliedYTranslation = random.uniform(minTranslation, maxTranslation)\n",
"# appliedScale = random.uniform(minScale, maxScale)\n",
"# appliedFill = random.uniform(minFill, maxFill)\n",
"\n",
"\n",
"\n",
"# newimg = tvf.affine(crop, appliedRotation, [appliedXTranslation,appliedYTranslation], appliedScale, shear, fill=appliedFill)\n",
"# newimg\n",
"\n",
"# appliers = [v2.RandomApply(transforms=[v2.RandomPosterize(bits=1)], p=0.25),\n",
"# v2.RandomApply(transforms=[v2.ElasticTransform(alpha=25.0, fill=appliedFill)], p=0.25),\n",
"# v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5,9), sigma=(0.1,2.))],p=0.25),\n",
"# v2.RandomApply(transforms=[v2.RandomEqualize()],p=0.25)]\n",
"\n",
"# for applier in appliers:\n",
"# newimg = applier(newimg)\n",
" \n",
"# # newimg\n",
"# newimg= tvf.resize(newimg, size=[1000,1000])\n",
"# newimg\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# class SquarePad:\n",
"# \tdef __call__(self, image):\n",
"# \t\tw, h = image.size\n",
"# \t\tmax_wh = np.max([w, h])\n",
"# \t\thp = int((max_wh - w) / 2)\n",
"# \t\tvp = int((max_wh - h) / 2)\n",
"# \t\tpadding = (hp, vp, hp, vp)\n",
"# \t\treturn tvf.pad(image, padding, 0, 'constant')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"\n",
"class RotationDeterminer(nn.Module):\n",
" def __init__(self):\n",
" super().__init__()\n",
" \n",
" torch.cuda.empty_cache()\n",
" \n",
" self.device = torch.device(\"cpu\")\n",
" if torch.cuda.is_available:\n",
" self.device = torch.device(\"cuda:0\")\n",
" \n",
" \n",
" self.appliers = [v2.RandomApply(transforms=[v2.RandomPosterize(bits=1)], p=0.25),\n",
" v2.RandomApply(transforms=[v2.ElasticTransform(alpha=25.0)], p=0.25), # maybe add fill=appliedFill\n",
" v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5,9), sigma=(0.1,2.))],p=0.25),\n",
" v2.RandomApply(transforms=[v2.RandomEqualize()],p=0.25)]\n",
" \n",
" \n",
" self.conv = nn.Sequential(nn.Conv2d(3, 9, kernel_size=11,stride=3), # 1100 x 1100 => 201 x 201\n",
" nn.ReLU(inplace=True),\n",
" nn.Conv2d(9, 18, kernel_size=5,stride=1),\n",
" nn.ReLU(inplace=True),\n",
" nn.MaxPool2d(kernel_size=4, stride=2),\n",
" nn.Conv2d(18, 36, kernel_size=3,stride=2),\n",
" nn.ReLU(inplace=True),\n",
" nn.Conv2d(36, 72, kernel_size=3,stride=2),\n",
" nn.ReLU(inplace=True),\n",
" nn.AvgPool2d(kernel_size=5, stride=3),\n",
" nn.Conv2d(72, 144, kernel_size=3,stride=1),\n",
" nn.ReLU(inplace=True),\n",
" nn.Conv2d(144, 288, kernel_size=5,stride=1),\n",
" nn.ReLU(inplace=True),\n",
" nn.MaxPool2d(kernel_size=4, stride=1),\n",
" nn.Conv2d(288, 192, kernel_size=3,stride=1),\n",
" nn.ReLU(inplace=True),\n",
" nn.Conv2d(192, 192, kernel_size=3,stride=1), # => 1\n",
" nn.ReLU(inplace=True))\n",
" \n",
" self.classifier = nn.Sequential(nn.Dropout(),\n",
" nn.Linear(192, 2048),\n",
" nn.ReLU(inplace=True),\n",
" nn.Dropout(),\n",
" nn.Linear(2048,2048),\n",
" nn.ReLU(inplace=True),\n",
" nn.Linear(2048,1))\n",
" \n",
" self.lossfunc = nn.MSELoss()\n",
" \n",
" self.imageprep = v2.Compose([self.SquarePad(),v2.Resize(1100),v2.Grayscale(num_output_channels=3),v2.CenterCrop(1100),v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
" \n",
" \n",
" class SquarePad:\n",
" def __call__(self, image):\n",
" # print(\"hi type:\", type(image))\n",
" temp = image.size()\n",
" w = temp[-2]\n",
" h = temp[-1]\n",
" max_wh = max([w, h])\n",
" hp = int((max_wh - w) / 2)\n",
" vp = int((max_wh - h) / 2)\n",
" padding = (hp, vp, hp, vp)\n",
" return tvf.pad(image, padding, 0, 'edge')\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" def forward(self, image):\n",
"\n",
" transformedimage = self.imageprep(image)\n",
" transformedimage = transformedimage.to(self.device)\n",
"\n",
" x = self.conv(transformedimage)\n",
" x = nn.Flatten(start_dim=-3)(x)\n",
" x = self.classifier(x)\n",
" guessRotation = nn.Flatten(start_dim=0)(x)\n",
" \n",
" return guessRotation\n",
" \n",
" def loss(self, guess, trueAnswer):\n",
" return self.lossfunc(guess, trueAnswer)\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# def batchmaker(entries, batchsize):\n",
"# random.shuffle(entries)\n",
"# listing = []\n",
"# for i in range(0,len(entries), batchsize):\n",
"# listing.append(entries[i:i+batchsize])\n",
"# return listing"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# print(type(v2.Compose([v2.ToImageTensor(), v2.ConvertImageDtype()])(image_dataset['train'][0]['image'])))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# a, b, x = working_dataset['train'][0]['image'].size()\n",
"# print(x)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def train(model, dataset, batchsize, num_epochs, stepsize, totalnumiters = -1):\n",
" device = torch.device(\"cpu\")\n",
" if torch.cuda.is_available:\n",
" device = torch.device(\"cuda:0\")\n",
" model = model.cuda()\n",
" optimizer = optim.Adam(model.parameters(), lr=stepsize)\n",
" \n",
" counter = totalnumiters\n",
" model = model.train()\n",
" \n",
" breakearly = True\n",
" if totalnumiters == -1:\n",
" print(\"hi\")\n",
" breakearly = False\n",
" totalnumiters = len(dataset) + 1\n",
" \n",
" for e in range(num_epochs):\n",
" \n",
" train_dataloader = DataLoader(dataset, batch_size=batchsize, shuffle=True)\n",
" \n",
" pbar = tqdm(train_dataloader)\n",
" \n",
" for i, batch in enumerate(pbar):\n",
" torch.cuda.empty_cache()\n",
" images, truerotations = batch['image'], batch['rotation']\n",
" images = images.to(device)\n",
" truerotations = truerotations.to(device)\n",
"\n",
" optimizer.zero_grad()\n",
" \n",
" guessRotation = model(images)\n",
" \n",
" truerotations = truerotations.float()\n",
" \n",
" loss = model.loss(guessRotation, truerotations)\n",
" \n",
" loss.backward()\n",
" \n",
" optimizer.step()\n",
" counter = counter - batchsize\n",
" if counter <= 0 and breakearly:\n",
" print(\"endearly\")\n",
" return\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"testimage = working_dataset['train'][10]['image']\n",
"\n",
"# testimage = v2.Compose([v2.Grayscale(num_output_channels=3),v2.ToTensor(),])(testimage)\n",
"# testimage.size()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# plt.imshow(testimage)\n",
"# plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# temp = testimage.size()\n",
"# print(temp[-3])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"model = RotationDeterminer()\n",
"device = torch.device(\"cpu\")\n",
"if torch.cuda.is_available:\n",
" device = torch.device(\"cuda:0\")\n",
" model = model.cuda()\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# output = model(testimage)\n",
"# print(output)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# train_dataloader = DataLoader(working_dataset['test'], batch_size=100, shuffle=True)\n",
"# hold = next(iter(train_dataloader))\n",
"# images1, labels1 = hold['image'], hold['rotation']\n",
"# # print(images1)\n",
"# print(labels1.size())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/functional.py:1603: UserWarning: The default value of the antialias parameter of all the resizing transforms (Resize(), RandomResizedCrop(), etc.) will change from None to True in v0.17, in order to be consistent across the PIL and Tensor backends. To suppress this warning, directly pass antialias=True (recommended, future default), antialias=None (current default, which means False for Tensors and True for PIL), or antialias=False (only works on Tensors - PIL will still use antialiasing). This also applies if you are using the inference transforms from the models weights: update the call to weights.transforms(antialias=True).\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"hi\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "faff1411ea0d485b9321271ebe6820db",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12800 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d26b6872bee74eaab8be6e7cfe53b190",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12800 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"outputarray = np.array([working_dataset['train'][10]['rotation']])\n",
"model = model.eval()\n",
"output = model(testimage)\n",
"outputarray = np.append(outputarray, output.detach().cpu().numpy())\n",
"counter = 0\n",
"\n",
"\n",
"\n",
"train(model, working_dataset['train'], 25, 2, 5e-3)\n",
"\n",
"model = model.eval()\n",
"\n",
"counter = 2 + counter\n",
"output = model(testimage)\n",
"outputarray = np.append(outputarray, output.detach().cpu().numpy())\n",
"np.save(\"./testing_space/outputarray\", outputarray)\n",
"np.save(\"./testing_space/counter\", counter)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-164.93280103082208\n",
"8.194759720936418e-05\n",
"-0.1751984804868698\n"
]
}
],
"source": [
"print(outputarray[0])\n",
"print(outputarray[1])\n",
"print(outputarray[2])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(outputarray)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), \"./testing_space/modelsave\" + str(counter) +\" epochs\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"#load model\n",
"# model.load_state_dict(torch.load(\"./testing_space/modelsave2epochs\"))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hi\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e144d16317094603b328e2db88a4853a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12800 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bb6cf6e77ed34628bdfa6ed2a64ef284",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12800 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"train(model, working_dataset['train'], 25, 2, 1e-3)\n",
"counter = 2 + counter"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# outputarray = []"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"model = model.eval()\n",
"output = model(testimage)\n",
"outputarray = np.append(outputarray, output.detach().cpu().numpy())\n",
"np.save(\"./testing_space/outputarray\", outputarray)\n",
"np.save(\"./testing_space/counter\", counter)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), \"./testing_space/modelsave\" + str(counter) +\" epochs\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hi\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c270b991cacd4abc996c602748e742f7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12800 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1515223a29da4cfea86be156155fd06e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12800 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"train(model, working_dataset['train'], 25, 2, 1e-2)\n",
"counter = 2 + counter"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"model = model.eval()\n",
"output = model(testimage)\n",
"outputarray = np.append(outputarray, output.detach().cpu().numpy())\n",
"np.save(\"./testing_space/outputarray\", outputarray)\n",
"np.save(\"./testing_space/counter\", counter)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"torch.save(model.state_dict(), \"./testing_space/modelsave\" + str(counter) +\" epochs\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[-1.64932801e+02 8.19475972e-05 -1.75198480e-01 -2.21363053e-01\n",
" -2.17262208e-01]\n"
]
}
],
"source": [
"print(outputarray)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,645 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"version=2.0\n",
"cachepath=\"../.cache/\"\n",
"savepath=\"./savespot/\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n",
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n"
]
}
],
"source": [
"import torch\n",
"from torch.utils.data import DataLoader\n",
"import torch.nn as nn\n",
"import torch.nn.functional as fn\n",
"import torch.optim as optim\n",
"import torchvision.transforms.functional as tvf\n",
"import torchvision.transforms.v2 as v2\n",
"import torchvision.models as models\n",
"\n",
"\n",
"from PIL import Image\n",
"\n",
"import datasets as ds\n",
"from tqdm.autonotebook import tqdm\n",
"\n",
"import random\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"import numpy as np\n",
"\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# models.list_models()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"torch.cuda.empty_cache()\n",
"working_dataset = ds.load_from_disk(cachepath + \"datasets/customrotation/\")\n",
"prepimage = v2.Compose([v2.Grayscale(num_output_channels=3),v2.Resize(512), v2.CenterCrop(512),v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
"working_dataset.set_transform(prepimage)\n",
"testsample = working_dataset['train'][10]\n",
"testimage = testsample['image']\n",
"torch.cuda.empty_cache()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# print(models.resnet18(pretrained=True))\n",
"# temp = models.resnet18(pretrained=True)\n",
"# print(temp(testimage.unsqueeze(0)).shape)\n",
"# device = torch.device(\"cpu\")\n",
"# if torch.cuda.is_available:\n",
"# device = torch.device(\"cuda:0\")\n",
"# temp = temp.to(device)\n",
"\n",
"#to be deleted"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# print(temp(testimage).shape)\n",
"#to be deleted"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"class RotationDeterminer(nn.Module):\n",
" def __init__(self, new=False):\n",
" super(RotationDeterminer,self).__init__()\n",
" \n",
" torch.cuda.empty_cache()\n",
" \n",
" self.device = torch.device(\"cpu\")\n",
" if torch.cuda.is_available:\n",
" self.device = torch.device(\"cuda:0\")\n",
" \n",
" \n",
" self.appliers = [v2.RandomApply(transforms=[v2.RandomPosterize(bits=1)], p=0.25),\n",
" v2.RandomApply(transforms=[v2.ElasticTransform(alpha=25.0)], p=0.25), # maybe add fill=appliedFill\n",
" v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5,9), sigma=(0.1,2.))],p=0.25),\n",
" v2.RandomApply(transforms=[v2.RandomEqualize()],p=0.25)]\n",
" \n",
" \n",
" # self.conv = nn.Sequential(nn.Conv2d(3, 9, kernel_size=11,stride=3), # 1100 x 1100 => 201 x 201\n",
" # nn.ReLU(inplace=True),\n",
" # nn.Conv2d(9, 18, kernel_size=5,stride=1),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.MaxPool2d(kernel_size=4, stride=2),\n",
" # nn.Conv2d(18, 36, kernel_size=3,stride=2),\n",
" # nn.BatchNorm2d(36),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.Conv2d(36, 72, kernel_size=3,stride=2),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.AvgPool2d(kernel_size=5, stride=3),\n",
" # nn.Conv2d(72, 144, kernel_size=3,stride=1),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.Conv2d(144, 288, kernel_size=5,stride=1),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.MaxPool2d(kernel_size=4, stride=1),\n",
" # nn.Conv2d(288, 192, kernel_size=3,stride=1),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.Conv2d(192, 192, kernel_size=3,stride=1), # => 1\n",
" # nn.ReLU(inplace=True))\n",
" # print(\"hi\")\n",
" self.conv = models.resnet18(pretrained=new)\n",
" \n",
" self.classifier = nn.Sequential(nn.Linear(1000, 4096),\n",
" nn.ReLU(inplace=True),\n",
" nn.Linear(4096,1))\n",
" \n",
" self.lossfunc = nn.MSELoss()\n",
" \n",
" self.imageprep = v2.Compose([self.SquarePad(),v2.Resize(512),v2.Grayscale(num_output_channels=3),v2.CenterCrop(512),v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
" \n",
" \n",
" class SquarePad:\n",
" def __call__(self, image):\n",
" # print(\"hi type:\", type(image))\n",
" temp = image.size()\n",
" w = temp[-2]\n",
" h = temp[-1]\n",
" max_wh = max([w, h])\n",
" hp = int((max_wh - w) / 2)\n",
" vp = int((max_wh - h) / 2)\n",
" padding = (hp, vp, hp, vp)\n",
" return tvf.pad(image, padding, 0, 'edge')\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" def forward(self, image):\n",
"\n",
" transformedimage = self.imageprep(image)\n",
" transformedimage = transformedimage.to(self.device)\n",
"\n",
" if (len(transformedimage.shape) != 4 and len(transformedimage.shape) != 3):\n",
" raise Exception(\"Sorry, Dimension of image is incorrect (\", len(transformedimage.shape),\"). Expected a 3D (single image) or 4D (batch of images) tensor\")\n",
"\n",
" if (len(transformedimage.shape) == 3):\n",
" x = transformedimage.unsqueeze(0)\n",
" else:\n",
" x = transformedimage\n",
" \n",
" x = self.conv(x)\n",
" # print(x.shape)\n",
" # x = nn.Flatten(start_dim=-1)(x)\n",
" # print(x.shape)\n",
" x = self.classifier(x)\n",
" # print(x.shape)\n",
" guessRotation = nn.Flatten(start_dim=0)(x)\n",
" \n",
" return guessRotation\n",
" \n",
" def loss(self, guess, trueAnswer):\n",
" return self.lossfunc(guess, trueAnswer)\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def train(model, dataset, batchsize, num_epochs, stepsize, totalnumiters = -1):\n",
" device = torch.device(\"cpu\")\n",
" if torch.cuda.is_available:\n",
" device = torch.device(\"cuda:0\")\n",
" model = model.cuda()\n",
" optimizer = optim.Adam(model.parameters(), lr=stepsize)\n",
" \n",
" counter = totalnumiters\n",
" model = model.train()\n",
" \n",
" breakearly = True\n",
" if totalnumiters == -1:\n",
" print(\"hi\")\n",
" breakearly = False\n",
" totalnumiters = len(dataset) + 1\n",
" \n",
" for e in range(num_epochs):\n",
" \n",
" train_dataloader = DataLoader(dataset, batch_size=batchsize, shuffle=True)\n",
" \n",
" pbar = tqdm(train_dataloader)\n",
" \n",
" for i, batch in enumerate(pbar):\n",
" torch.cuda.empty_cache()\n",
" images, truerotations = batch['image'], batch['rotation']\n",
" images = images.to(device)\n",
" truerotations = truerotations.to(device)\n",
"\n",
" optimizer.zero_grad()\n",
" \n",
" guessRotation = model(images)\n",
" \n",
" truerotations = truerotations.float()\n",
" \n",
" loss = model.loss(guessRotation, truerotations)\n",
" \n",
" loss.backward()\n",
" \n",
" optimizer.step()\n",
" counter = counter - batchsize\n",
" if counter <= 0 and breakearly:\n",
" print(\"endearly\")\n",
" return\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def measure(model, dataset):\n",
" total=0\n",
" within30=0\n",
" within15=0\n",
" within10=0\n",
" within5=0\n",
" within1=0\n",
" withintenth=0\n",
" model = model.eval()\n",
" pbar = tqdm(dataset)\n",
" for i, sample in enumerate(pbar):\n",
" if (i % 100 == 0):\n",
" torch.cuda.empty_cache()\n",
" images, truerotations = sample['image'], sample['rotation']\n",
" output = model(images)\n",
" outputvalue = output.item()\n",
" total = total + 1\n",
" if (abs(outputvalue - truerotations) < 0.1):\n",
" withintenth = withintenth + 1\n",
" within1 = within1 + 1\n",
" within5 = within5 + 1\n",
" within10 = within10 + 1\n",
" within15 = within15 + 1\n",
" within30 = within30 + 1\n",
" elif (abs(outputvalue - truerotations) < 1):\n",
" within1 = within1 + 1\n",
" within5 = within5 + 1\n",
" within10 = within10 + 1\n",
" within15 = within15 + 1\n",
" within30 = within30 + 1\n",
" elif (abs(outputvalue - truerotations) < 5):\n",
" within5 = within5 + 1\n",
" within10 = within10 + 1\n",
" within15 = within15 + 1\n",
" within30 = within30 + 1\n",
" elif (abs(outputvalue - truerotations) < 10):\n",
" within10 = within10 + 1\n",
" within15 = within15 + 1\n",
" within30 = within30 + 1\n",
" elif (abs(outputvalue - truerotations) < 15):\n",
" within15 = within15 + 1\n",
" within30 = within30 + 1\n",
" elif (abs(outputvalue - truerotations) < 30):\n",
" within30 = within30 + 1\n",
" # print(\"Hi\")\n",
" return {\"Within 30 Degrees\": within30/total, \"Within 15 Degrees\": within15/total, \"Within 10 Degrees\": within10/total, \"Within 5 Degrees\": within5/total, \"Within 1 Degree\": within1/total, \"Within 0.1 Degree\": withintenth/total}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"model = RotationDeterminer(new=True)\n",
"device = torch.device(\"cpu\")\n",
"if torch.cuda.is_available:\n",
" device = torch.device(\"cuda:0\")\n",
" model = model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# # used when starting a new model training\n",
"# counter = 0\n",
"# outputarray = np.array([])\n",
"# tempdict = {\"Epochs Done\": counter}\n",
"# tempdict.update(measure(model, working_dataset['validation']))\n",
"# outputarray = np.append(outputarray, tempdict)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# load values\n",
"counter = np.load(savepath + \"/v\"+str(version)+\"/counter.npy\")\n",
"model.load_state_dict(torch.load(savepath + \"/v\"+str(version)+\"/modelsave\" + str(counter) +\"epochs\"))\n",
"outputarray = np.load(savepath + \"/v\"+str(version)+\"/outputarray.npy\", allow_pickle=True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# # used to roll back the model by one training loop\n",
"# counter = 6\n",
"# outputarray = #removed the 7th element, will go from the 6th epoch"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hi\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3048e1546e12444193f99b15781768d9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12800 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/functional.py:1603: UserWarning: The default value of the antialias parameter of all the resizing transforms (Resize(), RandomResizedCrop(), etc.) will change from None to True in v0.17, in order to be consistent across the PIL and Tensor backends. To suppress this warning, directly pass antialias=True (recommended, future default), antialias=None (current default, which means False for Tensors and True for PIL), or antialias=False (only works on Tensors - PIL will still use antialiasing). This also applies if you are using the inference transforms from the models weights: update the call to weights.transforms(antialias=True).\n",
" warnings.warn(\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1a36a12b123e4b24bf00a8eeec2e396a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12800 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# train\n",
"numepochs = 2\n",
"batchsize = 25\n",
"stepsize = 1e-3\n",
"train(model, working_dataset['train'], batchsize, numepochs, stepsize)\n",
"# model = model.eval()\n",
"# output = model(testimage)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# print(output)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6b99a98f480745c4a375bf1e713708ed",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/40000 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# outputarray = np.append(outputarray, output.detach().cpu().numpy())\n",
"counter = numepochs + counter\n",
"tempdict = {\"Epochs Done\": counter}\n",
"tempdict.update(measure(model, working_dataset['validation']))\n",
"outputarray = np.append(outputarray, tempdict)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# save values\n",
"torch.save(model.state_dict(), savepath + \"/v\"+str(version)+\"/modelsave\" + str(counter) +\"epochs\")\n",
"np.save(savepath + \"/v\"+str(version)+\"/outputarray\", outputarray)\n",
"np.save(savepath + \"/v\"+str(version)+\"/counter\", counter)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'Epochs Done': 0, 'Within 30 Degrees': 0.162575, 'Within 15 Degrees': 0.080575, 'Within 10 Degrees': 0.053725, 'Within 5 Degrees': 0.027125, 'Within 1 Degree': 0.00545, 'Within 0.1 Degree': 0.00075}\n",
" {'Epochs Done': 1, 'Within 30 Degrees': 0.7764, 'Within 15 Degrees': 0.65105, 'Within 10 Degrees': 0.538875, 'Within 5 Degrees': 0.322625, 'Within 1 Degree': 0.070375, 'Within 0.1 Degree': 0.00805}\n",
" {'Epochs Done': 5, 'Within 30 Degrees': 0.891675, 'Within 15 Degrees': 0.8042, 'Within 10 Degrees': 0.673275, 'Within 5 Degrees': 0.415725, 'Within 1 Degree': 0.092375, 'Within 0.1 Degree': 0.009275}\n",
" {'Epochs Done': 8, 'Within 30 Degrees': 0.928125, 'Within 15 Degrees': 0.881625, 'Within 10 Degrees': 0.7686, 'Within 5 Degrees': 0.4791, 'Within 1 Degree': 0.102925, 'Within 0.1 Degree': 0.009975}\n",
" {'Epochs Done': 11, 'Within 30 Degrees': 0.9417, 'Within 15 Degrees': 0.91265, 'Within 10 Degrees': 0.86655, 'Within 5 Degrees': 0.633125, 'Within 1 Degree': 0.14265, 'Within 0.1 Degree': 0.01495}\n",
" {'Epochs Done': 13, 'Within 30 Degrees': 0.941575, 'Within 15 Degrees': 0.917375, 'Within 10 Degrees': 0.889125, 'Within 5 Degrees': 0.735525, 'Within 1 Degree': 0.1992, 'Within 0.1 Degree': 0.019875}]\n"
]
}
],
"source": [
"print(outputarray)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d339e6a22ccf4812bdad90dd3d546c68",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/39999 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/functional.py:1603: UserWarning: The default value of the antialias parameter of all the resizing transforms (Resize(), RandomResizedCrop(), etc.) will change from None to True in v0.17, in order to be consistent across the PIL and Tensor backends. To suppress this warning, directly pass antialias=True (recommended, future default), antialias=None (current default, which means False for Tensors and True for PIL), or antialias=False (only works on Tensors - PIL will still use antialiasing). This also applies if you are using the inference transforms from the models weights: update the call to weights.transforms(antialias=True).\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"{'Within 30 Degrees': 0.9433985849646241,\n",
" 'Within 15 Degrees': 0.9174979374484362,\n",
" 'Within 10 Degrees': 0.889422235555889,\n",
" 'Within 5 Degrees': 0.737118427960699,\n",
" 'Within 1 Degree': 0.1995799894997375,\n",
" 'Within 0.1 Degree': 0.020050501262531564}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"measure(model, working_dataset['test'])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# first epoch 25 batchsize, 1e-3 stepsize # GOOD PROGRESS SO FAR\n",
"# epoch 2-11 25 batchsize, 1e-3 stepsize"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# model1 = RotationDeterminer(new=False)\n",
"# device = torch.device(\"cpu\")\n",
"# if torch.cuda.is_available:\n",
"# device = torch.device(\"cuda:0\")\n",
"# model1 = model1.to(device)\n",
"# measurementarray=np.array([])\n",
"# for i in range(counter+1):\n",
"# print(i)\n",
"# if (i == 0 or i == 1 or i == 5 or i == 8 or i == 11):\n",
"# tempdict = {\"Epochs Done\": i}\n",
"# model1.load_state_dict(torch.load(savepath + \"/v\"+str(version)+\"/modelsave\" + str(i) +\"epochs\"))\n",
"# tempdict.update(measure(model1, working_dataset['validation']))\n",
"# measurementarray = np.append(measurementarray, tempdict)\n",
" \n",
"# print(\"hi\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# print(measurementarray)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# np.save(savepath + \"/v\"+str(version)+\"/outputarray\", measurementarray)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# measurementarraycopy = measurementarray\n",
"# tempdict = {\"Epochs Done\": 1}\n",
"# tempdict.update(measurementarraycopy[0])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# print(tempdict)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,144 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"version=2.0\n",
"cachepath=\"../.cache/\"\n",
"savepath=\"./savespot/\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n",
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n"
]
}
],
"source": [
"import torch\n",
"from torch.utils.data import DataLoader\n",
"import torch.nn as nn\n",
"import torch.nn.functional as fn\n",
"import torch.optim as optim\n",
"import torchvision.transforms.functional as tvf\n",
"import torchvision.transforms.v2 as v2\n",
"import torchvision.models as models\n",
"\n",
"\n",
"from PIL import Image\n",
"\n",
"import datasets as ds\n",
"from tqdm.autonotebook import tqdm\n",
"\n",
"import random\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"import cv2\n",
"import numpy as np\n",
"import myfunctions as mf\n",
"\n",
"torch.cuda.empty_cache()\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# array = np.load(\"./testing_space/outputarray.npy\")\n",
"# counter = np.load(\"./testing_space/counter.npy\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# print(array)\n",
"# print(counter)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"img = cv2.imread('./test_images/IMG_7605.jpg')\n",
"# img = mf.ResizeWithAspectRatio(img, 1000)\n",
"# img = mf.ResizeWithAspectRatio(mf.SquarePad(fill=255)(img),1000)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"rotatedimg = mf.houghlinedeskewandcrop(img)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# out = mf.morphologyCrop(img)\n",
"# out = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)\n",
"# out = cv2.threshold(out, 200, 255, cv2.THRESH_BINARY)[1]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"result1\", rotatedimg)\n",
"# cv2.imshow(\"result2\", result2)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,345 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ORIGINAL DOCUMENT FOR MORPHOLOGY CROP can maybe be deleted"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import numpy as np\n",
"\n",
"import torch\n",
"from torch.utils.data import DataLoader\n",
"import torch.nn as nn\n",
"import torch.nn.functional as fn\n",
"import torch.optim as optim\n",
"import torchvision.transforms.functional as tvf\n",
"import torchvision.transforms.v2 as v2\n",
"import torchvision.models as models\n",
"import torchvision.transforms as t\n",
"\n",
"import myfunctions as mf\n",
"\n",
"from PIL import Image"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# read image (cv2.imread loads colour/BGR by default; grayscale conversion happens in a later cell)\n",
"img = cv2.imread('./test_images/IMG_7640.jpg')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# def ResizeWithAspectRatio(image, width=None, height=None, inter=cv2.INTER_AREA):\n",
"# dim = None\n",
"# (h, w) = image.shape[:2]\n",
"\n",
"# if width is None and height is None:\n",
"# return image\n",
"# if width is None:\n",
"# r = height / float(h)\n",
"# dim = (int(w * r), height)\n",
"# else:\n",
"# r = width / float(w)\n",
"# dim = (width, int(h * r))\n",
"\n",
"# return cv2.resize(image, dim, interpolation=inter)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# convert to grayscale\n",
"gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n",
"\n",
"# threshold\n",
"thresh = cv2.threshold(gray, 190, 255, cv2.THRESH_BINARY)[1]\n",
"\n",
"# apply morphology\n",
"kernel = np.ones((7,7), np.uint8)\n",
"morph = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)\n",
"kernel = np.ones((9,9), np.uint8)\n",
"morph = cv2.morphologyEx(morph, cv2.MORPH_ERODE, kernel)\n",
"\n",
"# get largest contour\n",
"contours = cv2.findContours(morph, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)\n",
"contours = contours[0] if len(contours) == 2 else contours[1]\n",
"area_thresh = 0\n",
"for c in contours:\n",
" area = cv2.contourArea(c)\n",
" if area > area_thresh:\n",
" area_thresh = area\n",
" big_contour = c\n",
"\n",
"\n",
"# get bounding box\n",
"x,y,w,h = cv2.boundingRect(big_contour)\n",
"\n",
"# draw filled contour on black background\n",
"mask = np.zeros_like(gray)\n",
"mask = cv2.merge([mask,mask,mask])\n",
"cv2.drawContours(mask, [big_contour], -1, (255,255,255), cv2.FILLED)\n",
"\n",
"# apply mask to input\n",
"result1 = img.copy()\n",
"result1 = cv2.bitwise_and(result1, mask)\n",
"\n",
"# crop result\n",
"result2 = result1[y:y+h, x:x+w]\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# view result\n",
"# cv2.imshow(\"threshold\", thresh)\n",
"# cv2.imshow(\"morph\", morph)\n",
"# cv2.imshow(\"mask\", mask)\n",
"# cv2.imshow(\"result1\", result1)\n",
"resizedresult2 = mf.ResizeWithAspectRatio(result2, 1000)\n",
"cv2.imwrite(\"./testing_space/cropped1.jpg\", resizedresult2)\n",
"cv2.imshow(\"result2\", resizedresult2)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"class RotationDeterminer(nn.Module):\n",
" def __init__(self, new=False):\n",
" super(RotationDeterminer,self).__init__()\n",
" \n",
" torch.cuda.empty_cache()\n",
" \n",
" self.device = torch.device(\"cpu\")\n",
" if torch.cuda.is_available:\n",
" self.device = torch.device(\"cuda:0\")\n",
" \n",
" \n",
" self.appliers = [v2.RandomApply(transforms=[v2.RandomPosterize(bits=1)], p=0.25),\n",
" v2.RandomApply(transforms=[v2.ElasticTransform(alpha=25.0)], p=0.25), # maybe add fill=appliedFill\n",
" v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5,9), sigma=(0.1,2.))],p=0.25),\n",
" v2.RandomApply(transforms=[v2.RandomEqualize()],p=0.25)]\n",
" \n",
" \n",
" # self.conv = nn.Sequential(nn.Conv2d(3, 9, kernel_size=11,stride=3), # 1100 x 1100 => 201 x 201\n",
" # nn.ReLU(inplace=True),\n",
" # nn.Conv2d(9, 18, kernel_size=5,stride=1),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.MaxPool2d(kernel_size=4, stride=2),\n",
" # nn.Conv2d(18, 36, kernel_size=3,stride=2),\n",
" # nn.BatchNorm2d(36),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.Conv2d(36, 72, kernel_size=3,stride=2),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.AvgPool2d(kernel_size=5, stride=3),\n",
" # nn.Conv2d(72, 144, kernel_size=3,stride=1),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.Conv2d(144, 288, kernel_size=5,stride=1),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.MaxPool2d(kernel_size=4, stride=1),\n",
" # nn.Conv2d(288, 192, kernel_size=3,stride=1),\n",
" # nn.ReLU(inplace=True),\n",
" # nn.Conv2d(192, 192, kernel_size=3,stride=1), # => 1\n",
" # nn.ReLU(inplace=True))\n",
" # print(\"hi\")\n",
" self.conv = models.resnet18(pretrained=new)\n",
" \n",
" self.classifier = nn.Sequential(nn.Linear(1000, 4096),\n",
" nn.ReLU(inplace=True),\n",
" nn.Linear(4096,1))\n",
" \n",
" self.lossfunc = nn.MSELoss()\n",
" \n",
" self.imageprep = v2.Compose([self.SquarePad(),v2.Resize(512),v2.Grayscale(num_output_channels=3),v2.CenterCrop(512),v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
" \n",
" \n",
" class SquarePad:\n",
" def __call__(self, image):\n",
" # print(\"hi type:\", type(image))\n",
" temp = image.size()\n",
" w = temp[-2]\n",
" h = temp[-1]\n",
" max_wh = max([w, h])\n",
" hp = int((max_wh - w) / 2)\n",
" vp = int((max_wh - h) / 2)\n",
" padding = (hp, vp, hp, vp)\n",
" return tvf.pad(image, padding, 0, 'edge')\n",
"\n",
"\n",
" \n",
"\n",
" \n",
" def forward(self, image):\n",
"\n",
" transformedimage = self.imageprep(image)\n",
" transformedimage = transformedimage.to(self.device)\n",
"\n",
" if (len(transformedimage.shape) != 4 and len(transformedimage.shape) != 3):\n",
" raise Exception(\"Sorry, Dimension of image is incorrect (\", len(transformedimage.shape),\"). Expected a 3D (single image) or 4D (batch of images) tensor\")\n",
"\n",
" if (len(transformedimage.shape) == 3):\n",
" x = transformedimage.unsqueeze(0)\n",
" else:\n",
" x = transformedimage\n",
" \n",
" x = self.conv(x)\n",
" # print(x.shape)\n",
" # x = nn.Flatten(start_dim=-1)(x)\n",
" # print(x.shape)\n",
" x = self.classifier(x)\n",
" # print(x.shape)\n",
" guessRotation = nn.Flatten(start_dim=0)(x)\n",
" \n",
" return guessRotation\n",
" \n",
" def loss(self, guess, trueAnswer):\n",
" return self.lossfunc(guess, trueAnswer)\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.\n",
" warnings.warn(msg)\n"
]
}
],
"source": [
"model = RotationDeterminer(new=True)\n",
"device = torch.device(\"cpu\")\n",
"if torch.cuda.is_available:\n",
" device = torch.device(\"cuda:0\")\n",
" model = model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 1174, 1000])\n",
"torch.Size([3, 1174, 1000])\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/functional.py:1603: UserWarning: The default value of the antialias parameter of all the resizing transforms (Resize(), RandomResizedCrop(), etc.) will change from None to True in v0.17, in order to be consistent across the PIL and Tensor backends. To suppress this warning, directly pass antialias=True (recommended, future default), antialias=None (current default, which means False for Tensors and True for PIL), or antialias=False (only works on Tensors - PIL will still use antialiasing). This also applies if you are using the inference transforms from the models weights: update the call to weights.transforms(antialias=True).\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"-0.1470905989408493\n"
]
}
],
"source": [
"tensorize = v2.Compose([v2.ToImageTensor(), v2.ConvertImageDtype()])\n",
"grayscaler = v2.Grayscale(num_output_channels=3)\n",
"\n",
"imagetobeprocessed = cv2.cvtColor(resizedresult2,cv2.COLOR_BGR2GRAY)\n",
"\n",
"\n",
"tensorizedimage = torch.unsqueeze(torch.from_numpy(imagetobeprocessed),0)\n",
"print(tensorizedimage.shape)\n",
"adjustedtensorizedimage = tensorize(grayscaler(t.ToPILImage()(tensorizedimage)))\n",
"print(adjustedtensorizedimage.shape)\n",
"rotation = model(adjustedtensorizedimage).item()\n",
"print(rotation)\n",
"rotatedimage = t.Resize(size=1000)(tvf.rotate(adjustedtensorizedimage, rotation))\n",
"# imS = mf.ResizeWithAspectRatio(filereadimage, 1000)\n",
"# imS = cv2.resize(filereadimage, (960, 540)) \n",
"open_cv_image = np.array(t.ToPILImage()(rotatedimage))\n",
"cv2.imshow(f'image', open_cv_image)\n",
"key = cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # save result\n",
"# cv2.imwrite(\"paper_thresh.jpg\", thresh)\n",
"# cv2.imwrite(\"paper_morph.jpg\", morph)\n",
"# cv2.imwrite(\"paper_mask.jpg\", mask)\n",
"# cv2.imwrite(\"paper_result1.jpg\", result1)\n",
"# cv2.imwrite(\"paper_result2.jpg\", result2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,387 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 772,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import numpy as np\n",
"\n",
"import myfunctions as mf\n",
"\n",
"\n",
"import scipy.stats as st\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 773,
"metadata": {},
"outputs": [],
"source": [
"# read image (cv2.imread loads colour/BGR by default; grayscale conversion happens in a later cell)\n",
"img = cv2.imread('./test_images/IMG_7605.jpg')\n",
"# img = mf.ResizeWithAspectRatio(img,1000)\n",
"# img = mf.rotate(img, 54)"
]
},
{
"cell_type": "code",
"execution_count": 774,
"metadata": {},
"outputs": [],
"source": [
"prepped = mf.ResizeWithAspectRatio(mf.SquarePad(fill=255)(img),1000)\n",
"prepped = mf.premorphCrop(prepped)\n",
"prepped = mf.ResizeWithAspectRatio(mf.SquarePad(fill=255)(prepped),1000)\n",
"# kernel = np.ones((5,5), np.uint8)\n",
"# prepped = cv2.dilate(prepped, kernel, iterations=1)\n",
"gray1 = cv2.cvtColor(prepped, cv2.COLOR_BGR2GRAY)\n",
"dst1 = cv2.Canny(gray1, 0, 500, None, 3)\n",
"\n",
"kernel = np.ones((5,5), np.uint8)\n",
"out = cv2.morphologyEx(dst1, cv2.MORPH_DILATE, kernel)\n",
"out = cv2.blur(out, (5,5))\n",
"kernel = np.ones((6,6), np.uint8)\n",
"dst1 = cv2.morphologyEx(out, cv2.MORPH_ERODE, kernel)\n",
"\n",
"dst1 = cv2.Canny(dst1, 0, 500, None, 3)\n",
"\n",
"cdstP = prepped.copy()\n",
"cdstPmargin = cdstP.copy()\n",
"basecdstP = cdstP.copy()\n",
"linesP = cv2.HoughLinesP(dst1, 1, np.pi / 180, 30, None, 90, 30)"
]
},
{
"cell_type": "code",
"execution_count": 779,
"metadata": {},
"outputs": [],
"source": [
"# # testing = dst1.copy()\n",
"# # kernel = np.ones((5,5), np.uint8)\n",
"# # out = cv2.morphologyEx(testing, cv2.MORPH_DILATE, kernel)\n",
"# # out = cv2.blur(out, (5,5))\n",
"# # kernel = np.ones((3,3), np.uint8)\n",
"# # out = cv2.morphologyEx(out, cv2.MORPH_ERODE, kernel)\n",
"cv2.imshow(\"result1\", dst1)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 758,
"metadata": {},
"outputs": [],
"source": [
"angles = np.zeros(len(linesP))\n",
"if linesP is not None:\n",
" for i in range(0, len(linesP)):\n",
" l = linesP[i][0]\n",
" angles[i] = mf.lineAngle(l)\n",
" cv2.line(cdstP, (l[0], l[1]), (l[2], l[3]), (0,0,255), 3, cv2.LINE_AA)"
]
},
{
"cell_type": "code",
"execution_count": 759,
"metadata": {},
"outputs": [],
"source": [
"# cv2.imshow(\"result1\", cdstP)\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 760,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-3.093972093706445\n"
]
}
],
"source": [
"mode = st.mode(np.around(angles, decimals=3))[0]\n",
"rotationangle = np.rad2deg(mode)\n",
"print(rotationangle)"
]
},
{
"cell_type": "code",
"execution_count": 761,
"metadata": {},
"outputs": [],
"source": [
"rotatedcdstP = mf.rotate(basecdstP, rotationangle)"
]
},
{
"cell_type": "code",
"execution_count": 762,
"metadata": {},
"outputs": [],
"source": [
"vmarginlines = mf.WithinXDegrees(linesP, 7, baseangle=rotationangle)\n",
"hmarginlines = mf.WithinXDegrees(linesP, 7, baseangle=90+rotationangle)\n",
"vrect = mf.lineBoundingRect(vmarginlines,asRect=False, returnint=True)\n",
"hmarginlines = mf.lineswithinrange(hmarginlines, (vrect[0], vrect[1]), (vrect[2],vrect[3]), x=True, y=False)\n",
"\n",
"\n",
"if (hmarginlines != []):\n",
" marginlines = np.append(vmarginlines, hmarginlines, axis=0)\n",
"else:\n",
" marginlines = vmarginlines\n",
" \n",
"rect = mf.lineBoundingRect(marginlines,asRect=False, returnint=True)\n",
"cdstP = cv2.rectangle(cdstP, (rect[0],rect[1]), (rect[2],rect[3]), (0,255,0), 3)"
]
},
{
"cell_type": "code",
"execution_count": 763,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"result1\", cdstP)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 764,
"metadata": {},
"outputs": [],
"source": [
"#####NEED TO WORK ON SCORING THE LINES SO IT PICKS THE CORRECT ORIENTATION (horizontal vs vertical) AND SO THAT THE CROPPING RECTANGLE MOVES/GET TRANSFORMED WITH IT"
]
},
{
"cell_type": "code",
"execution_count": 780,
"metadata": {},
"outputs": [],
"source": [
"def rotatePoint(img, pt, angle, returnint=True):\n",
" rotateaxisx = img.shape[0]/2\n",
" rotateaxisy = img.shape[1]/2\n",
" tempx = pt[0] - rotateaxisx\n",
" tempy = pt[1] - rotateaxisy\n",
" rotatedx = tempx*math.cos(np.deg2rad(-angle)) - tempy*math.sin(np.deg2rad(-angle))\n",
" rotatedy = tempx*math.sin(np.deg2rad(-angle)) + tempy*math.cos(np.deg2rad(-angle))\n",
" finalx = rotatedx + rotateaxisx\n",
" finaly = rotatedy + rotateaxisy\n",
" if (returnint):\n",
" finalx = int(finalx)\n",
" finaly = int(finaly)\n",
" return (finalx, finaly)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 766,
"metadata": {},
"outputs": [],
"source": [
"def rotateRect(img, rect, angle, returnint=True, asRect=False):\n",
" if (asRect):\n",
" pt1 = rotatePoint(img, (rect[0],rect[1]), angle, returnint)\n",
" pt2 = rotatePoint(img, (rect[0]+rect[2],rect[1]+rect[3]), angle, returnint)\n",
" return (pt1[0], pt1[1], pt2[0]-pt1[0], pt2[1]-pt1[1])\n",
" else:\n",
" pt1 = rotatePoint(img, (rect[0],rect[1]), angle, returnint)\n",
" pt2 = rotatePoint(img, (rect[2],rect[3]), angle, returnint)\n",
" return (pt1[0], pt1[1], pt2[0], pt2[1])\n",
"\n",
"def rotateLine(img, line, angle, returnint=True):\n",
" pt1 = rotatePoint(img, (line[0],line[1]), angle, returnint)\n",
" pt2 = rotatePoint(img, (line[2],line[3]), angle, returnint)\n",
" return (pt1[0], pt1[1], pt2[0], pt2[1])\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 767,
"metadata": {},
"outputs": [],
"source": [
"# print(linesP.shape)\n",
"rotatedlines = [rotateLine(rotatedcdstP, line[0], rotationangle) for line in linesP]\n",
"rotatedlines = np.reshape(rotatedlines, (len(rotatedlines),1,4))\n",
"# rotatedlines = linesP\n",
"# print(rotatedlines.shape)"
]
},
{
"cell_type": "code",
"execution_count": 768,
"metadata": {},
"outputs": [],
"source": [
"vmarginlines = mf.WithinXDegrees(rotatedlines, 7)\n",
"hmarginlines = mf.WithinXDegrees(rotatedlines, 7, baseangle=90)\n",
"vrect = mf.lineBoundingRect(vmarginlines,asRect=False, returnint=True)\n",
"hmarginlines = mf.lineswithinrange(hmarginlines, (vrect[0], vrect[1]), (vrect[2],vrect[3]), x=True, y=False)\n",
"\n",
"if (hmarginlines != []):\n",
" marginlines = np.append(vmarginlines, hmarginlines, axis=0)\n",
"else:\n",
" marginlines = vmarginlines\n",
" \n",
"rect = mf.lineBoundingRect(marginlines,asRect=False, returnint=True)\n",
"# rect = vrect\n",
"rotatedcdstP = cv2.rectangle(rotatedcdstP, (rect[0],rect[1]), (rect[2],rect[3]), (0,255,0), 3)"
]
},
{
"cell_type": "code",
"execution_count": 769,
"metadata": {},
"outputs": [],
"source": [
"if rotatedlines is not None:\n",
" for i in range(0, len(rotatedlines)):\n",
" l = rotatedlines[i][0]\n",
" cv2.line(rotatedcdstP, (l[0], l[1]), (l[2], l[3]), (0,0,255), 3, cv2.LINE_AA)"
]
},
{
"cell_type": "code",
"execution_count": 771,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"result1\", rotatedcdstP)\n",
"# cv2.imshow(\"result1\", cdstP)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 394,
"metadata": {},
"outputs": [],
"source": [
"vmarginlines = mf.WithinXDegrees(linesP, 7)\n",
"hmarginlines = mf.WithinXDegrees(linesP, 7, baseangle=90)\n",
"vrect = mf.lineBoundingRect(vmarginlines,asRect=False, returnint=True)\n",
"hmarginlines = mf.lineswithinrange(hmarginlines, (vrect[0], vrect[1]), (vrect[2],vrect[3]), x=True, y=False)\n",
"\n",
"\n",
"if (hmarginlines != []):\n",
" marginlines = np.append(vmarginlines, hmarginlines, axis=0)\n",
"else:\n",
" marginlines = vmarginlines\n",
"\n",
"rect = mf.lineBoundingRect(marginlines,asRect=False, returnint=True)\n",
"cdstP = cv2.rectangle(cdstP, (rect[0],rect[1]), (rect[2],rect[3]), (0,255,0), 3)\n",
"\n",
"\n",
"# rotatedrect = rotateRect(cdstP, rect, -rotationangle)\n",
"\n",
"# rotatedcdstP = cv2.rectangle(rotatedcdstP, (rotatedrect[0],rotatedrect[1]), (rotatedrect[2],rotatedrect[3]), (0,255,0), 3)"
]
},
{
"cell_type": "code",
"execution_count": 395,
"metadata": {},
"outputs": [],
"source": [
"###figure out how to rotate rectangle"
]
},
{
"cell_type": "code",
"execution_count": 396,
"metadata": {},
"outputs": [],
"source": [
"cv2.imshow(\"result1\", cdstP)\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 397,
"metadata": {},
"outputs": [],
"source": [
"# vmarginlines = mf.WithinXDegrees(linesP, 7)\n",
"# hmarginlines = mf.WithinXDegrees(linesP, 7, baseangle=90)\n",
"# vrect = mf.lineBoundingRect(vmarginlines,asRect=False, returnint=True)\n",
"# hmarginlines = mf.lineswithinrange(hmarginlines, (vrect[0], vrect[1]), (vrect[2],vrect[3]), x=True, y=False)\n",
"# # print(hmarginlines)\n",
"# if (hmarginlines != []):\n",
"# marginlines = np.append(vmarginlines, hmarginlines, axis=0)\n",
"# else:\n",
"# marginlines = vmarginlines\n",
"\n",
"# # print(marginlines)\n",
"# rect = mf.lineBoundingRect(marginlines,asRect=False, returnint=True)\n",
"# # print(rect)\n",
"# cdstP = cv2.rectangle(cdstP, (rect[0],rect[1]), (rect[2],rect[3]), (0,255,0), 3)\n",
"# # print(cdstP.shape)\n",
"# # cropped = cdstP[rect[1]:rect[3], rect[0]:rect[2],:]\n",
"\n",
"# if marginlines is not None:\n",
"# for i in range(0, len(marginlines)):\n",
"# l = marginlines[i]\n",
"# cv2.line(cdstP, (int(l[0]), int(l[1])), (int(l[2]), int(l[3])), (255,0,0), 3, cv2.LINE_AA)"
]
},
{
"cell_type": "code",
"execution_count": 398,
"metadata": {},
"outputs": [],
"source": [
"# # view result\n",
"# # cv2.imshow(\"threshold\", thresh)\n",
"# # cv2.imshow(\"morph\", morph)\n",
"# # cv2.imshow(\"mask\", mask)\n",
"# cv2.imshow(\"result1\", mf.ResizeWithAspectRatio(cdstP,height=1000))\n",
"# # cv2.imshow(\"result2\", cropped)\n",
"# cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,165 +0,0 @@
#include "cropper.h"
#include <opencv2/ximgproc/segmentation.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
using namespace cv::ximgproc::segmentation;
// Returns the (x, y) corner of `rect`.
inline cv::Point topLeft(cv::Rect rect) {
    return rect.tl();
}
// Returns the (x, y + height) corner of `rect`.
inline cv::Point bottomLeft(cv::Rect rect) {
    const int bottom = rect.y + rect.height;
    return cv::Point(rect.x, bottom);
}
// Returns the (x + width, y) corner of `rect`.
inline cv::Point topRight(cv::Rect rect) {
    const int right = rect.x + rect.width;
    return cv::Point(right, rect.y);
}
// Returns the (x + width, y + height) corner of `rect`.
inline cv::Point bottomRight(cv::Rect rect) {
    return rect.br();
}
// Euclidean distance between two integer points.
inline double distanceBetweenPoints(cv::Point p1, cv::Point p2) {
    const double dx = p1.x - p2.x;
    const double dy = p1.y - p2.y;
    return std::sqrt(dx * dx + dy * dy);
}
// Rescales `r` in place by the ratio originalheight / currentheight, i.e. maps
// a rectangle found on a resized image back onto the original image.
//
// The ratio is computed in floating point: integer division would truncate
// e.g. 3000 / 800 to 3 and place the rectangle in the wrong spot.
// The two corners are scaled and truncated independently so that
// x + width never ends up larger than the scaled right edge (rounding x and
// width separately could overshoot by a pixel).
//
// r              - rectangle to rescale (modified in place).
// originalheight - height of the image the rectangle should be mapped onto.
// currentheight  - height of the image the rectangle was measured on; when it
//                  is not positive the rectangle is left untouched, because no
//                  ratio can be formed.
inline void scaleRect(cv::Rect& r, int originalheight, int currentheight) {
    if (currentheight <= 0) {
        return;
    }
    const double scalingFactor = static_cast<double>(originalheight) / currentheight;
    const int left = static_cast<int>(r.x * scalingFactor);
    const int top = static_cast<int>(r.y * scalingFactor);
    const int right = static_cast<int>((r.x + r.width) * scalingFactor);
    const int bottom = static_cast<int>((r.y + r.height) * scalingFactor);
    r.x = left;
    r.y = top;
    r.width = right - left;
    r.height = bottom - top;
}
// Dissimilarity between two rectangles: the mean of the Euclidean distances
// between their four matching corners (despite the name, nothing is squared).
// Returns 0 for identical rectangles.
double MSELossRect(cv::Rect r1, cv::Rect r2) {
    double total = 0.0;
    total += distanceBetweenPoints(topLeft(r1), topLeft(r2));
    total += distanceBetweenPoints(bottomLeft(r1), bottomLeft(r2));
    total += distanceBetweenPoints(topRight(r1), topRight(r2));
    total += distanceBetweenPoints(bottomRight(r1), bottomRight(r2));
    return total / 4.0;
}
// Runs OpenCV's selective-search segmentation on `src` and returns the
// candidate region rectangles it proposes.
//
// src         - base image handed to the segmenter.
// fast        - true selects the "fast" preset, false the "quality" preset.
// imageHeight - currently unused by this function.
//
// Note: this also enables OpenCV's optimised code paths and sets the OpenCV
// thread count to 4 on every call.
std::vector<cv::Rect> selectiveSearchSegmentationActor(cv::InputArray src, bool fast = true, int imageHeight = 800) {
    cv::setUseOptimized(true);
    cv::setNumThreads(4);

    cv::Mat baseImage = src.getMat();
    cv::Ptr<cv::ximgproc::segmentation::SelectiveSearchSegmentation> segmenter =
        createSelectiveSearchSegmentation();
    segmenter->setBaseImage(baseImage);

    if (!fast) {
        segmenter->switchToSelectiveSearchQuality();
    } else {
        segmenter->switchToSelectiveSearchFast();
    }

    std::vector<cv::Rect> proposals;
    segmenter->process(proposals);
    return proposals;
}
// Limits `n` to the interval [lower, upper]: values above `upper` become
// `upper`, values below `lower` become `lower`.
inline double clip(double n, double lower, double upper) {
    const double capped = (upper < n) ? upper : n;
    return (lower < capped) ? capped : lower;
}
// Linearly stretches `n` so that `min` maps to 0 and `max` maps to 255, then
// clips the result to [0, 255]. The span is taken as |max - min|.
inline double colourscaler(double n, double min, double max) {
    const double offset = n - min;
    const double span = std::abs(max - min);
    const double stretched = (offset / span) * 255;
    return clip(stretched, 0, 255);
}
// Returns the bounding rectangle of the longest edge contour found in `src`.
//
// Pipeline: convert to greyscale, stretch intensities so that [lower, upper]
// maps onto [0, 255] (everything outside is clipped), blur with a 15x15
// Gaussian, run Canny, extract contours and box the one with the greatest
// (open) arc length.
//
// src         - image accepted by cv::COLOR_BGR2GRAY.
// lower       - low end of the intensity window; raised to at least 0.
// upper       - high end of the intensity window; capped at 255.
// threshold1/
// threshold2  - hysteresis thresholds forwarded to cv::Canny.
//
// Returns an empty cv::Rect() when no contour is found, instead of indexing
// into an empty contour list.
cv::Rect cannyEdgeRectangle(cv::InputArray src, int lower = 100, int upper = 255, double threshold1 = 50, double threshold2 = 350) {
    cv::Mat gray, scaled_gray, blurred, edged;
    lower = std::max(lower, 0);
    upper = std::min(upper, 255);

    cv::cvtColor(src, gray, cv::COLOR_BGR2GRAY);

    // Contrast-stretch every pixel into the [lower, upper] window.
    scaled_gray = cv::Mat::zeros(gray.size(), gray.type());
    for (int y = 0; y < gray.rows; y++) {
        for (int x = 0; x < gray.cols; x++) {
            scaled_gray.at<uchar>(y, x) =
                cv::saturate_cast<uchar>(colourscaler(gray.at<uchar>(y, x), lower, upper));
        }
    }

    cv::GaussianBlur(scaled_gray, blurred, cv::Size(15, 15), 0);
    cv::Canny(blurred, edged, threshold1, threshold2);

    std::vector<std::vector<cv::Point>> contours;
    std::vector<cv::Vec4i> hierarchy;
    cv::findContours(edged, contours, hierarchy, cv::RETR_TREE, cv::CHAIN_APPROX_SIMPLE);

    // A featureless image produces no edges at all.
    if (contours.empty()) {
        return cv::Rect();
    }

    // Only the longest contour is needed, so a single pass replaces the full
    // sort; taking the contours by reference avoids copying each one.
    const auto longest = std::max_element(
        contours.begin(), contours.end(),
        [](const std::vector<cv::Point>& a, const std::vector<cv::Point>& b) {
            return cv::arcLength(a, false) < cv::arcLength(b, false);
        });
    return cv::boundingRect(*longest);
}
// Crops `src` to the region judged to be the main content and writes the
// result to `dst`.
//
// A copy of the image is resized to `imageHeight` rows (aspect ratio kept).
// On that copy the Canny-based rectangle is computed, and the selective-search
// proposal whose corners lie closest to it (MSELossRect) is picked. The larger
// of the two rectangles is scaled back to the original resolution and cut out
// of the full-size image.
//
// src         - input image.
// dst         - receives the cropped full-resolution image.
// fastsearch  - forwarded to selectiveSearchSegmentationActor (fast vs quality).
// imageHeight - working height used for the analysis pass.
//
// Returns true when `dst` was written. Returns false, leaving `dst` untouched,
// when the input is empty, `imageHeight` is not positive, or no non-empty
// region could be determined.
bool crop(cv::InputArray src, cv::OutputArray dst, bool fastsearch, int imageHeight) { //add other params or maybe overload or something
    cv::Mat original = src.getMat();
    if (original.empty() || imageHeight <= 0) {
        return false;
    }

    const int newWidth = original.cols * imageHeight / original.rows;
    if (newWidth <= 0) {
        return false;
    }
    cv::Mat temp;
    cv::resize(original, temp, cv::Size(newWidth, imageHeight));

    cv::Rect cannyRect = cannyEdgeRectangle(temp, 100, 255, 255 / 4, 255);
    std::vector<cv::Rect> rects = selectiveSearchSegmentationActor(temp, fastsearch);

    // Find the proposal closest to the Canny rectangle.
    int indexOfMin = -1;
    double currentMin = std::numeric_limits<double>::max();
    const int lengthOfRects = static_cast<int>(rects.size());
    for (int i = 0; i < lengthOfRects; i++) {
        const double tempMin = MSELossRect(rects[i], cannyRect);
        if (tempMin < currentMin) {
            indexOfMin = i;
            currentMin = tempMin;
        }
    }

    // Prefer the proposal only when it is strictly larger; with no proposals
    // at all (indexOfMin still -1) fall back to the Canny rectangle.
    cv::Rect finalRect = cannyRect;
    if (indexOfMin >= 0 && rects[indexOfMin].area() > cannyRect.area()) {
        finalRect = rects[indexOfMin];
    }

    scaleRect(finalRect, original.rows, temp.rows);

    // Keep the region inside the image so the ROI extraction cannot throw.
    finalRect &= cv::Rect(0, 0, original.cols, original.rows);
    if (finalRect.width <= 0 || finalRect.height <= 0) {
        return false;
    }

    original(finalRect).copyTo(dst);
    return true;
}

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.3 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.3 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 139 KiB

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,62 +0,0 @@
# Build script for the image-preprocessing helper libraries.
# Defines two shared libraries, `rect` and `line`, each built from a single
# source file, compiled as C++20 and linked against OpenCV.
cmake_minimum_required(VERSION 3.22)
project(imagemanipulation_libraries
VERSION 0.1
DESCRIPTION "Libraries for image preprocessing"
LANGUAGES CXX)
include(GNUInstallDirs)
# OpenCV is mandatory; configuration fails if it cannot be found.
find_package(OpenCV REQUIRED)
# RECTANGLE: shared library built from src/rectangle.cpp
add_library(rect SHARED src/rectangle.cpp)
target_compile_features(rect PRIVATE cxx_std_20)
# set_target_properties(rect PROPERTIES VERSION ${PROJECT_VERSION}) # git can't deal with the symlinks for some reason
# set_target_properties(rect PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/rect_lib.h)
# The built library is written into <source dir>/lib rather than the build tree.
set_target_properties(rect PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/lib)
target_link_libraries(rect ${OpenCV_LIBS})
target_include_directories(rect
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
PRIVATE ${OpenCV_INCLUDE_DIRS})
# LINE: shared library built from src/line.cpp, configured the same way as `rect`
add_library(line SHARED src/line.cpp)
target_compile_features(line PRIVATE cxx_std_20)
# set_target_properties(line PROPERTIES VERSION ${PROJECT_VERSION}) # git can't deal with the symlinks for some reason
# set_target_properties(line PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/line_lib.h)
set_target_properties(line PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/lib)
target_link_libraries(line ${OpenCV_LIBS})
target_include_directories(line
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
PRIVATE ${OpenCV_INCLUDE_DIRS})
# Disabled: install rules for `rect`, and include paths for a `CropperEx`
# target that is not defined in this script.
# install(TARGETS rect
# LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
# PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
# find_package(OpenCV REQUIRED)
# target_include_directories(CropperEx
# PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include
# PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../externallibraries/stbimagehelpers
# PRIVATE ${OpenCV_INCLUDE_DIRS})

View File

@ -1,20 +0,0 @@
#ifndef LINE_H
#define LINE_H
// Placeholder for a line type: the class currently declares no members.
// The alternating access sections are empty slots; presumably they are meant
// to hold data members, constructors, private helpers and the public API in
// that order - TODO confirm once the class is filled in.
class Line {
private:
public:
private:
public:
};
#endif //LINE_H

View File

@ -1,79 +0,0 @@
#ifndef RECTANGLE_H
#define RECTANGLE_H
#include <opencv2/core.hpp>
#include <vector>
// MAYBE MAKE TWOPTOBJECT A PARENT CLASS FOR BOTH LINE AND RECTANGLE?
// Axis-aligned rectangle stored as two points with double coordinates.
// Only declarations live here; the member functions are defined elsewhere.
// NOTE(review): std::enable_if / std::is_arithmetic are used below but
// <type_traits> is not included by this header directly - it presumably
// arrives through <opencv2/core.hpp>; consider including it explicitly.
class Rectangle {
private:
// Corner coordinates. Presumably (pt1x, pt1y) is the top-left corner and
// (pt2x, pt2y) the bottom-right one, matching the accessors below - TODO confirm.
double pt1x;
double pt1y;
double pt2x;
double pt2y;
public:
// Constructors and destructor.
Rectangle(); // open question from the author: should a default constructor exist?
Rectangle(cv::Rect rect);
Rectangle(cv::Point pt1, cv::Point pt2);
template <typename T, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr> // enabled only for arithmetic T (int/float/double, ...)
Rectangle(T pt1x, T pt1y, T pt2x, T pt2y);
~Rectangle();
private:
// Private helper functions (none declared yet).
public:
// Corner setters: one overload takes a cv::Point, the other raw coordinates.
void overwriteTopLeft(cv::Point pt);
template <typename T, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr> // enabled only for arithmetic T (int/float/double, ...)
void overwriteTopLeft(T ptx, T pty);
void overwriteBottomRight(cv::Point pt);
template <typename T, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr> // enabled only for arithmetic T (int/float/double, ...)
void overwriteBottomRight(T ptx, T pty);
// Corner getters. T appears only in the return type, so it cannot be deduced
// and the caller has to name it explicitly, e.g. topLeft<int>().
template <typename T, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr> // T is the coordinate type of the returned point
cv::Point_<T> topLeft();
template <typename T, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr> // T is the coordinate type of the returned point
cv::Point_<T> bottomRight();
template <typename T, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr>
T area(); // T is the arithmetic type of the returned area; must be named explicitly
bool containsRect(Rectangle rect); // true if this rectangle contains the other rectangle
// Design notes from the author:
// - Size or width/height getters may still be needed.
// - OpenCV treats the bottom-right boundary of a rectangle as exclusive.
// - rectscontaining(rect, outerrects) is part of a brute-force implementation
//   for overlapping rectangles and is built mainly on containsRect. It could
//   become a private helper here, or stay separate and not be ported at all.
// - Undecided what to do about
//   rotateRect(img, rect, angle, returnint=True, asRect=False).
// General rectangle functions operating on collections of rectangles.
static std::vector<Rectangle> biggestNRects(std::vector<Rectangle> rects, int n);
static Rectangle overlapRect(std::vector<Rectangle> rects);
static Rectangle mergeRects(std::vector<Rectangle> rects);
};
#endif //RECTANGLE_H
// MAYBE IMPLEMENT STUFF FOR LINES AS WELL?

Binary file not shown.

Binary file not shown.

View File

@ -1,12 +0,0 @@
def relabel(datasetpath):
    """Print the contents of the dataset's filename mapping file.

    The mapping file is expected at
    ``<datasetpath>/baseimages/unaugmentednames/mapping.txt``; the sub-path is
    appended to ``datasetpath`` by plain string concatenation.

    Args:
        datasetpath: Root directory of the dataset (without a trailing slash,
            since the appended sub-path already starts with one).

    Raises:
        OSError: If the mapping file cannot be opened or read.
    """
    mappingpathwithindataset = "/baseimages/unaugmentednames/mapping.txt"
    mappingfilepath = datasetpath + mappingpathwithindataset
    # The context manager closes the file even if read() raises.
    with open(mappingfilepath, 'r') as mappingfile:
        maptext = mappingfile.read()
    print(maptext)

View File

@ -1 +0,0 @@
#include "line.h"

View File

@ -1,8 +0,0 @@
#include "rectangle.h"
// #include "line.h" //ONLY FOR POSSIBLY rotateRect(img, rect, angle, returnint=True, asRect=False)
#include <queue>
// use a priority queue with a custom comparator to make a maxheap for biggestNRects https://stackoverflow.com/questions/57271271/is-there-a-maxheap-in-the-c-standard-library

View File

@ -1,287 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pathlib\n",
"import shutil\n",
"import cv2\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, '/mnt/code/autocropper')\n",
"import myfunctions as mf"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# os.getcwd()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# filenames = next(os.walk(\"/mnt/dataset/baseimages/unaugmentednames/\"), (None, None, []))[2]\n",
"# filenames.remove(\"mapping.txt\")\n",
"# print(filenames)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"imagefileextensions = [\".jpg\", \".png\"]\n",
"\n",
"def parsemaptext(text):\n",
" lineseperated = text.split('\\n')\n",
" # if (lineseperated[-1] == ''):\n",
" # lineseperated = lineseperated[:-1]\n",
" # print(lineseperated)\n",
" mappingdict = {}\n",
" for line in lineseperated:\n",
" if line == '':\n",
" continue\n",
" splitline = line.split(\" | \")\n",
" if splitline[0] not in mappingdict:\n",
" mappingdict[splitline[0]] = splitline[1]\n",
" # print(splitline)\n",
" \n",
" \n",
" return mappingdict\n",
"\n",
"\n",
"def readmapfiletodict(mapfilepath):\n",
" if (not os.path.isfile(mapfilepath)):\n",
" # f = open(mapfilepath, \"x\")\n",
" # f.close()\n",
" return {}\n",
" mappingfile = open(mapfilepath, 'r')\n",
" maptext = mappingfile.read()\n",
" mappingfile.close()\n",
" \n",
" mappingdict = parsemaptext(maptext)\n",
" return mappingdict\n",
" \n",
"\n",
"def writemapdicttofile(mapfilepath, mappingdict):\n",
" starting = False\n",
" if (not os.path.isfile(mapfilepath) or os.stat(mapfilepath).st_size == 0):\n",
" file = open(mapfilepath, \"w\")\n",
" starting = True\n",
" # f.close()\n",
" # return {}\n",
" else:\n",
" file = open(mapfilepath, 'a')\n",
" for key in mappingdict:\n",
" if starting:\n",
" file.write(key+\" | \"+mappingdict[key])\n",
" starting = False\n",
" else:\n",
" file.write(\"\\n\"+key+\" | \"+mappingdict[key])\n",
" file.close()\n",
"\n",
"def renameoriginals(datasetpath):\n",
" pathtooriginals = \"baseimages/unaugmentednames/\"\n",
" mappingfilename = \"mapping.txt\"\n",
" mappingpathwithindataset = pathtooriginals+mappingfilename\n",
" mappingfilepath = datasetpath+mappingpathwithindataset\n",
"\n",
" \n",
" mappingdict = readmapfiletodict(mappingfilepath)\n",
" print(mappingdict)\n",
" blacklistednumbers = []\n",
" for key in mappingdict:\n",
" value = mappingdict[key]\n",
" suffix = pathlib.Path(value).suffix\n",
" # print(pathlib.Path(value).name)\n",
" valnum = value[:-len(suffix)]\n",
" blacklistednumbers.append(int(valnum))\n",
" print(blacklistednumbers)\n",
" \n",
" \n",
" \n",
" filenames = next(os.walk(datasetpath+pathtooriginals), (None, None, []))[2]\n",
" if (mappingfilename in filenames):\n",
" filenames.remove(mappingfilename)\n",
" # print(filenames)\n",
" \n",
" mappeddict = {}\n",
" filenamecounter = 0\n",
" for filename in filenames:\n",
" suffix = pathlib.Path(filename).suffix\n",
" if (suffix not in imagefileextensions):\n",
" print(\"Not a valid image \"+filename)\n",
" continue\n",
" if filename in mappingdict:\n",
" continue\n",
" while filenamecounter in blacklistednumbers:\n",
" filenamecounter += 1\n",
" shutil.copyfile(datasetpath+pathtooriginals+filename, datasetpath+\"baseimages/\"+str(filenamecounter)+suffix)\n",
" mappeddict[filename] = str(filenamecounter)+suffix\n",
" filenamecounter += 1\n",
" print(mappeddict)\n",
" writemapdicttofile(mappingfilepath, mappeddict)\n",
" \n",
" # print(maptext)\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def autocrop(datasetpath):\n",
" subpathtobasefiles = \"baseimages/\"\n",
" subpathtoaugmentedfiles = \"autocropped/\"\n",
" imagespath = datasetpath + subpathtobasefiles\n",
" \n",
" filenames = next(os.walk(imagespath), (None, None, []))[2]\n",
" \n",
" for filename in filenames:\n",
" suffix = pathlib.Path(filename).suffix\n",
" if (suffix not in imagefileextensions):\n",
" print(\"Not a valid image \"+filename)\n",
" continue\n",
" print(imagespath+filename)\n",
" if (not os.path.isfile(imagespath+filename)):\n",
" print(\"hi\")\n",
" continue\n",
" img = cv2.imread(imagespath+filename)\n",
" # print(img)\n",
" autocropped = mf.houghlineprocessing(img)\n",
" cv2.imwrite(datasetpath+subpathtoaugmentedfiles+filename, autocropped)\n",
" \n",
" "
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"def showimgs(imgs):\n",
" if (isinstance(imgs, np.ndarray)):\n",
" if (imgs.shape[0] > imgs.shape[1]):\n",
" cv2.imshow(\"test\", mf.ResizeWithAspectRatio(imgs, height=1350))\n",
" else:\n",
" cv2.imshow(\"test\", mf.ResizeWithAspectRatio(imgs, width=1000))\n",
" else:\n",
" for i, out in enumerate(imgs):\n",
" if (out.shape[0] > out.shape[1]):\n",
" cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, height=1350))\n",
" else:\n",
" cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, width=1000))\n",
" cv2.waitKey(0)\n",
" cv2.destroyAllWindows()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'IMG_7736.jpg': '0.jpg', 'IMG_7737.jpg': '1.jpg', 'IMG_7738.jpg': '2.jpg', 'IMG_7739.jpg': '3.jpg', 'IMG_7740.jpg': '4.jpg', 'IMG_7741.jpg': '5.jpg', 'IMG_7742.jpg': '6.jpg', 'IMG_7743.jpg': '7.jpg', 'IMG_7744.jpg': '8.jpg', 'IMG_7745.jpg': '9.jpg', 'IMG_7747.jpg': '10.jpg', 'IMG_7748.jpg': '11.jpg'}\n",
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n",
"{}\n"
]
}
],
"source": [
"renameoriginals(\"/mnt/dataset/\")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"ename": "error",
"evalue": "OpenCV(4.5.4) ./modules/imgproc/src/resize.cpp:4051: error: (-215:Assertion failed) !ssize.empty() in function 'resize'\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31merror\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/mnt/code/libraries/testprocessing.ipynb Cell 9\u001b[0m line \u001b[0;36m2\n\u001b[1;32m <a href='vscode-notebook-cell://attached-container%2B7b22636f6e7461696e65724e616d65223a222f72696c6962726172696573646576656e76227d/mnt/code/libraries/testprocessing.ipynb#X11sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m img \u001b[39m=\u001b[39m cv2\u001b[39m.\u001b[39mimread(\u001b[39m'\u001b[39m\u001b[39m/mnt/dataset/baseimages/1.jpg\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m----> <a href='vscode-notebook-cell://attached-container%2B7b22636f6e7461696e65724e616d65223a222f72696c6962726172696573646576656e76227d/mnt/code/libraries/testprocessing.ipynb#X11sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m out \u001b[39m=\u001b[39m mf\u001b[39m.\u001b[39;49mhoughlineprocessing(img)\n",
"File \u001b[0;32m/mnt/code/autocropper/myfunctions.py:1042\u001b[0m, in \u001b[0;36mhoughlineprocessing\u001b[0;34m(image)\u001b[0m\n\u001b[1;32m 1041\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mhoughlineprocessing\u001b[39m(image):\n\u001b[0;32m-> 1042\u001b[0m croppedanddeskewed, _ \u001b[39m=\u001b[39m houghlinedeskewandcrop(image)\n\u001b[1;32m 1043\u001b[0m \u001b[39m##IF IT DOESN'T CHANGE THE IMAGE (CHANGE THE _ TO SOMETHING USEFUL), THEN CROPCLARIFYING SHOULD JUST DO THE TEXT ISOLATION SECTION AND NOT TRY AND WHITE OUT ANY BACKGROUND.\u001b[39;00m\n\u001b[1;32m 1044\u001b[0m \u001b[39m## IF THERE'S NO CROPPING, MAYBE EVEN JUMP RIGHT TO USING THE EXTERNAL DESKEW FIRST BEFORE TOSSING IT INTO CROPCLARIFYING\u001b[39;00m\n\u001b[1;32m 1046\u001b[0m postprocessed \u001b[39m=\u001b[39m cropclarifying(croppedanddeskewed)\n",
"File \u001b[0;32m/mnt/code/autocropper/myfunctions.py:452\u001b[0m, in \u001b[0;36mhoughlinedeskewandcrop\u001b[0;34m(image)\u001b[0m\n\u001b[1;32m 446\u001b[0m rotationangle \u001b[39m=\u001b[39m houghlinedeskewangle(dst1)\n\u001b[1;32m 448\u001b[0m \u001b[39m# -----------------end of finding angle to deskew-----------------\u001b[39;00m\n\u001b[1;32m 449\u001b[0m \n\u001b[1;32m 450\u001b[0m \u001b[39m## -----------------deskewing and then cropping-----------------\u001b[39;00m\n\u001b[0;32m--> 452\u001b[0m \u001b[39mreturn\u001b[39;00m houghlinedeskewthencrop(croppedogimage, dst1, rotationangle)\n",
"File \u001b[0;32m/mnt/code/autocropper/myfunctions.py:420\u001b[0m, in \u001b[0;36mhoughlinedeskewthencrop\u001b[0;34m(baseimage, preppedimage, rotationangle)\u001b[0m\n\u001b[1;32m 414\u001b[0m scaledrect \u001b[39m=\u001b[39m (\u001b[39mint\u001b[39m(rect[\u001b[39m0\u001b[39m]\u001b[39m*\u001b[39msizemultiplier), \u001b[39mint\u001b[39m(rect[\u001b[39m1\u001b[39m]\u001b[39m*\u001b[39msizemultiplier), \u001b[39mint\u001b[39m(rect[\u001b[39m2\u001b[39m]\u001b[39m*\u001b[39msizemultiplier), \u001b[39mint\u001b[39m(rect[\u001b[39m3\u001b[39m]\u001b[39m*\u001b[39msizemultiplier))\n\u001b[1;32m 416\u001b[0m croppedbaseimage \u001b[39m=\u001b[39m rotatedbaseimage[scaledrect[\u001b[39m1\u001b[39m]:scaledrect[\u001b[39m3\u001b[39m], scaledrect[\u001b[39m0\u001b[39m]:scaledrect[\u001b[39m2\u001b[39m], :]\n\u001b[0;32m--> 420\u001b[0m shrunkencbi, sizemultiplier \u001b[39m=\u001b[39m ResizeWithAspectRatio(croppedbaseimage, width\u001b[39m=\u001b[39;49m\u001b[39m1000\u001b[39;49m, retscale\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 421\u001b[0m gray \u001b[39m=\u001b[39m cv2\u001b[39m.\u001b[39mcvtColor(shrunkencbi, cv2\u001b[39m.\u001b[39mCOLOR_BGR2GRAY)\n\u001b[1;32m 422\u001b[0m thresh \u001b[39m=\u001b[39m cv2\u001b[39m.\u001b[39mthreshold(gray, \u001b[39m200\u001b[39m, \u001b[39m255\u001b[39m, cv2\u001b[39m.\u001b[39mTHRESH_BINARY)[\u001b[39m1\u001b[39m]\n",
"File \u001b[0;32m/mnt/code/autocropper/myfunctions.py:27\u001b[0m, in \u001b[0;36mResizeWithAspectRatio\u001b[0;34m(image, width, height, inter, retscale)\u001b[0m\n\u001b[1;32m 23\u001b[0m dim \u001b[39m=\u001b[39m (width, \u001b[39mint\u001b[39m(h \u001b[39m*\u001b[39m r))\n\u001b[1;32m 25\u001b[0m \u001b[39mif\u001b[39;00m (retscale \u001b[39m==\u001b[39m \u001b[39mTrue\u001b[39;00m):\n\u001b[1;32m 26\u001b[0m \u001b[39m# print(\"hi\")\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m \u001b[39mreturn\u001b[39;00m (cv2\u001b[39m.\u001b[39;49mresize(image, dim, interpolation\u001b[39m=\u001b[39;49minter), \u001b[39m1\u001b[39m\u001b[39m/\u001b[39mr)\n\u001b[1;32m 28\u001b[0m \u001b[39mreturn\u001b[39;00m cv2\u001b[39m.\u001b[39mresize(image, dim, interpolation\u001b[39m=\u001b[39minter)\n",
"\u001b[0;31merror\u001b[0m: OpenCV(4.5.4) ./modules/imgproc/src/resize.cpp:4051: error: (-215:Assertion failed) !ssize.empty() in function 'resize'\n"
]
}
],
"source": [
"img = cv2.imread('/mnt/dataset/baseimages/1.jpg')\n",
"out = mf.houghlineprocessing(img)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"showimgs(out)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# autocrop(\"/mnt/dataset/\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 448 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 433 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 634 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 194 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.0 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.1 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.3 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.3 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 139 KiB

File diff suppressed because one or more lines are too long

View File

@ -1,585 +0,0 @@
import cv2
import numpy as np
import sys
sys.path.insert(0, '../../autocropper')
import myfunctions as mf
## helper functions
def rectcenterpt(rect, xywhrect=True, retint=False):
    """Return the centre point ``(x, y)`` of a rectangle.

    Args:
        rect: Four-element sequence describing the rectangle.
        xywhrect: If truthy, ``rect`` is read as ``(x, y, width, height)``;
            otherwise as two corner points ``(x1, y1, x2, y2)``.
        retint: If truthy, both coordinates are truncated with ``int()``.

    Returns:
        A 2-tuple with the centre coordinates.
    """
    if xywhrect:
        cx = rect[0] + rect[2]/2
        cy = rect[1] + rect[3]/2
    else:
        cx = (rect[0]+rect[2])/2
        cy = (rect[1]+rect[3])/2
    return (int(cx), int(cy)) if retint else (cx, cy)
def containsamount(outerrect, innerrect, percentage=1):
    """Tell whether enough of ``innerrect`` lies inside ``outerrect``.

    The overlap of the two rectangles is obtained from ``mf.overlapRect``; a
    first element of ``-1`` in its result is treated as "no overlap". The
    overlap area is divided by the area of ``innerrect`` (element 2 times
    element 3) and compared with ``percentage``.

    Args:
        outerrect: The enclosing rectangle.
        innerrect: The rectangle whose covered fraction is measured.
        percentage: Minimum covered fraction required (1 means fully covered).

    Returns:
        True if the covered fraction is at least ``percentage``, else False.
    """
    overlap = mf.overlapRect([outerrect, innerrect])
    overlaparea = overlap[2]*overlap[3]
    if (overlap[0] == -1):
        overlaparea = 0
    innerarea = innerrect[2]*innerrect[3]
    covered = overlaparea/innerarea
    return True if (covered >= percentage) else False
def aboveandbelow(outerrect, innerrect):
    """Tell whether ``outerrect`` strictly spans ``innerrect`` vertically.

    Both rectangles are read as ``(x, y, width, height)``: element 1 is the
    top edge and element 1 plus element 3 the bottom edge.

    Returns:
        True if the outer top is above the inner top and the outer bottom is
        below the inner bottom (both strictly), else False.
    """
    startsabove = outerrect[1] < innerrect[1]
    if not startsabove:
        return False
    endsbelow = outerrect[1]+outerrect[3] > innerrect[1]+innerrect[3]
    return True if endsbelow else False
## Below code is an almost direct copy from https://github.com/scrunts23/CS-Data-Science-Build-Week-1/blob/master/model/dbscan.py
def dbscan(D, eps, MinPts):
    '''
    Cluster the dataset `D` with the DBSCAN algorithm.

    Parameters:
      `D`      - The dataset (a list of items understood by `region_query`)
      `eps`    - Neighbourhood threshold passed on to `region_query`
      `MinPts` - Minimum number of neighbours a point needs to seed a cluster

    Returns a list with one label per item of `D`: -1 marks noise, and the
    clusters are numbered from 1 upwards.
    '''
    # Label bookkeeping: 0 = not visited yet, -1 = noise, >0 = cluster id.
    labels = [0]*len(D)
    # Id of the most recently created cluster.
    clusterid = 0

    # Walk over point indices looking for seeds; the actual growth of each
    # cluster is delegated to grow_cluster.
    for idx in range(len(D)):
        # Points already claimed (or already marked noise) cannot be seeds.
        if labels[idx] != 0:
            continue

        neighbours = region_query(D, idx, eps)

        if len(neighbours) >= MinPts:
            # Dense enough: start a new cluster from this point.
            clusterid += 1
            grow_cluster(D, labels, idx, neighbours, clusterid, eps, MinPts)
        else:
            # Too sparse to be a seed. A noise point may still be absorbed
            # later by a cluster as a border point.
            labels[idx] = -1

    return labels
def grow_cluster(D, labels, P, NeighborPts, C, eps, MinPts):
    '''
    Expand cluster `C` outwards from the seed point `P`.

    `labels` is updated in place; nothing is returned. On return every point
    reachable from the seed has been given the label `C`.

    Parameters:
      `D`           - The dataset
      `labels`      - Cluster labels for all points (0 = unvisited, -1 = noise)
      `P`           - Index of the seed point
      `NeighborPts` - Indices of the neighbours of `P`
      `C`           - Label of the cluster being grown
      `eps`         - Neighbourhood threshold passed on to `region_query`
      `MinPts`      - Minimum neighbour count for a point to keep expanding
    '''
    labels[P] = C

    # NeighborPts doubles as a FIFO work queue of point indices: it gets longer
    # whenever a branch point is found, hence the while-loop over a cursor.
    cursor = 0
    while cursor < len(NeighborPts):
        candidate = NeighborPts[cursor]
        cursor += 1
        state = labels[candidate]

        if state == -1:
            # Previously judged noise, so it is not a branch point: it joins
            # the cluster as a leaf and is not expanded.
            labels[candidate] = C
        elif state == 0:
            # Unclaimed point: claim it, then see whether it is dense enough
            # to contribute its own neighbours to the queue.
            labels[candidate] = C
            reachable = region_query(D, candidate, eps)
            if len(reachable) >= MinPts:
                # Concatenation builds a new list, so the list object passed
                # in by the caller is left unmodified.
                NeighborPts = NeighborPts + reachable
        # Points already owned by a cluster are skipped.
def region_query(D, P, eps):
    '''
    Find all rectangles in `D` whose centre lies within `eps` of rectangle `P`
    along the vertical axis.

    The distance used is the absolute difference of the y-coordinates of the
    rectangle centres (as returned by `rectcenterpt`), so the neighbourhood is
    symmetric: rectangles above and below `P` are treated alike. The previous
    signed comparison accepted every rectangle lying lower in the image, no
    matter how far away it was.

    Parameters:
      `D`   - The dataset (a list of rectangles accepted by `rectcenterpt`)
      `P`   - Index of the query rectangle in `D`
      `eps` - Threshold distance (strict: a distance equal to `eps` is excluded)

    Returns the list of indices into `D` of all neighbours; it includes `P`
    itself whenever `eps` is positive.
    '''
    # The query centre does not change inside the loop, so compute it once.
    query_y = rectcenterpt(D[P])[1]
    return [Pn for Pn in range(len(D))
            if abs(query_y - rectcenterpt(D[Pn])[1]) < eps]
def padWithColour(img, hpadding=0, vpadding=0, fill=(0,0,0)):
    """Surround ``img`` with a constant-colour border.

    Args:
        img: Image to pad.
        hpadding: Border width added on both the left and the right.
        vpadding: Border height added on both the top and the bottom.
        fill: Border colour passed to ``cv2.copyMakeBorder``.

    Returns:
        The padded image produced by ``cv2.copyMakeBorder``.
    """
    return cv2.copyMakeBorder(img, vpadding, vpadding, hpadding, hpadding, cv2.BORDER_CONSTANT, None, fill)
def mergecontours(contours):
    """Merge several contours into one.

    All contour points are stacked into a single array and the convex hull of
    that point set is returned.
    """
    return cv2.convexHull(np.vstack(contours))
def getSkewAngle(cvImage) -> float:
    """Estimate the skew angle of the text in an image.

    The image is padded with a 50 px white border, converted to greyscale,
    blurred and binarised (inverted Otsu threshold). The result is dilated
    with a wide, flat kernel so that characters merge into line/paragraph
    blobs. All resulting contours are merged into one convex hull and the
    angle of its minimum-area bounding box is used.

    Args:
        cvImage: Image accepted by ``cv2.COLOR_BGR2GRAY``.

    Returns:
        The box angle as reported by ``cv2.minAreaRect``, folded towards zero:
        values above 45 are reduced by 90 and values below -45 are increased
        by 90. Returns 0.0 when no contours are found (for example a blank
        page), instead of failing on an empty contour list.
    """
    # Prep image: pad, convert to gray scale, blur, and threshold
    newImage = padWithColour(cvImage, hpadding=50, vpadding=50, fill=(255,255,255))
    gray = cv2.cvtColor(newImage, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Apply dilate to merge text into meaningful lines/paragraphs.
    # Use larger kernel on X axis to merge characters into single line, cancelling out any spaces.
    # But use smaller kernel on Y axis to separate between different blocks of text
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (10, 5))
    dilate = cv2.dilate(thresh, kernel, iterations=5)

    contours, _ = cv2.findContours(dilate, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    # Nothing detected: there is no skew to measure.
    if len(contours) == 0:
        return 0.0

    # The hull is built from every contour point, so the contours do not need
    # to be sorted and the largest one does not need to be singled out.
    minAreaRect = cv2.minAreaRect(mergecontours(contours))

    # Determine the angle. Convert it to the value that was originally used to obtain skewed image
    angle = minAreaRect[-1]
    if angle > 45:
        angle = angle - 90
    if angle < -45:
        angle = 90 + angle
    return angle
def minboxdeskew(img, fill=(0,0,0)):
    """Deskew a greyscale image using the minimum-area-box skew estimate.

    The skew angle is measured by ``getSkewAngle`` on a BGR conversion of
    ``img``. The original image is then padded by 50 px on every side with
    ``fill`` and rotated by that angle via ``mf.rotate``.

    Args:
        img: Image accepted by ``cv2.COLOR_GRAY2BGR``.
        fill: Colour used both for the padding and for the rotation fill.

    Returns:
        The padded, rotated image returned by ``mf.rotate``.
    """
    skew = getSkewAngle(cv2.cvtColor(img, cv2.COLOR_GRAY2BGR))
    padded = padWithColour(img, hpadding=50, vpadding=50, fill=fill)
    return mf.rotate(padded, skew, fill=fill)
def l1linerectretriever(image, divider=2):
    """Group letter boxes of a grayscale image into first-level text lines.

    Letter boxes come from the external contours of a Canny edge map of the
    3x3-dilated image. Line boxes come from the contours of the image after
    its inverse has been dilated with a wide, 1-px-high kernel (this
    presumably smears dark glyphs horizontally into line blobs; it assumes
    dark text on a light background, TODO confirm). The line boxes are
    clustered with ``dbscan`` and each letter box is assigned to the cluster
    of every line box that contains at least 90% of it.

    Args:
        image: single-channel image (it is subtracted from 255, so 8-bit
            data is assumed).
        divider: the median letter-box height is divided by this value to get
            the ``dbscan`` epsilon.

    Returns:
        ``(mergedboxes, letterboxesbyline)``: an ``(n, 4)`` int array holding
        the ``mf.mergerects`` result for each cluster, and a list of ``n``
        lists of ``[x, y, w, h]`` letter boxes. Both are empty when no letter
        or line contour is found.
    """
    width = image.shape[1]

    charkernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    # Keep the kernel at least 1 px wide: images narrower than 40 px would
    # otherwise ask for a zero-width structuring element.
    linekernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, width//40), 1))

    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, charkernel, iterations=1)
    charcanny = cv2.Canny(reducedimage, 0, 500, None, 3)
    lettercontours, _ = cv2.findContours(charcanny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    linemade = 255-cv2.morphologyEx(255-image, cv2.MORPH_DILATE, linekernel)
    linecanny = cv2.Canny(linemade, 0, 500, None, 3)
    linecontours, _ = cv2.findContours(linecanny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(lettercontours) == 0 or len(linecontours) == 0:
        # No usable content: the median height would be NaN and there is
        # nothing to cluster, so report "no lines".
        return np.empty((0,4), dtype=int), []

    letterboxes = np.array([cv2.boundingRect(c) for c in lettercontours], dtype=int)
    lineboxes = np.array([cv2.boundingRect(c) for c in linecontours], dtype=int)

    # Clustering distance scales with the typical letter height (column 3 = h).
    epsilonvalue = np.median(letterboxes, axis=0)[3]/divider

    # Labels are used below as 1-based cluster indices.
    linelabels = dbscan(lineboxes, epsilonvalue, 1)
    numclusters = max(linelabels)

    letterboxesbyline = [[] for _ in range(numclusters)]
    for linelabel, linebox in zip(linelabels, lineboxes):
        for letterbox in letterboxes:
            if containsamount(linebox, letterbox, 0.9):
                letterboxesbyline[linelabel-1].append(letterbox.tolist())

    # A cluster may end up with no letters; mf.mergerects is given the empty
    # list in that case (callers appear to look for x == -1 — TODO confirm).
    mergedboxes = np.empty((numclusters,4), dtype=int)
    for i, boxes in enumerate(letterboxesbyline):
        mergedboxes[i] = mf.mergerects(boxes)

    return mergedboxes, letterboxesbyline
def sublinerectretriever(image, divider=2):
    """Split an image into sub-line rectangles by clustering letter boxes.

    Letter boxes are the bounding rectangles of the external contours of a
    Canny edge map of the 3x3-dilated image, each grown by the kernel size.
    The boxes are clustered with ``dbscan`` and every cluster is merged with
    ``mf.mergerects``. Afterwards a merged box is discarded when another box
    fully contains it, when ``aboveandbelow`` holds for the pair, or when its
    height is below the clustering epsilon.

    Args:
        image: single-channel image.
        divider: the median box height is divided by this value to get the
            ``dbscan`` epsilon.

    Returns:
        ``(mergedboxes, lineboxes)``: an ``(n, 4)`` int array of merged
        ``[x, y, w, h]`` boxes and the matching list of letter-box lists.
        Both are empty when no contour is found.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel, iterations=1)
    canny = cv2.Canny(reducedimage, 0, 500, None, 3)
    contours, _ = cv2.findContours(canny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(contours) == 0:
        # Nothing to cluster; the median below would be NaN and max() of the
        # labels would fail.
        return np.empty((0,4), dtype=int), []

    boundingboxes = np.empty((len(contours), 4), dtype=int)
    for i, contour in enumerate(contours):
        b = list(cv2.boundingRect(contour))
        # Grow each box by the kernel extent; x/y may become negative near
        # the image border.
        b[0] -= (kernel.shape[0]-1)
        b[1] -= (kernel.shape[1]-1)
        b[2] += (2*kernel.shape[0]-1)
        b[3] += (2*kernel.shape[1]-1)
        boundingboxes[i] = b

    # Clustering distance scales with the typical box height (column 3 = h).
    epsilonvalue = np.median(boundingboxes, axis=0)[3]/divider

    # Labels are used below as 1-based cluster indices.
    labels = dbscan(boundingboxes, epsilonvalue, 1)
    numclusters = max(labels)

    lineboxes = [[] for _ in range(numclusters)]
    for label, box in zip(labels, boundingboxes):
        lineboxes[label-1].append(box.tolist())

    mergedboxes = np.empty((numclusters,4), dtype=int)
    for i, boxes in enumerate(lineboxes):
        mergedboxes[i] = mf.mergerects(boxes)

    # Compare every ordered pair (j = outer, i = inner) and drop the inner
    # box when it is redundant. Indices are adjusted after each deletion so
    # that no element is skipped.
    j = 0
    while j < len(mergedboxes):
        i = 0
        while i < len(mergedboxes):
            if i == j:
                i += 1
                continue
            outerbox = mergedboxes[j]
            innerbox = mergedboxes[i]
            if containsamount(outerbox, innerbox, 1) or aboveandbelow(outerbox, innerbox) or innerbox[3] < epsilonvalue:
                mergedboxes = np.delete(mergedboxes, i, axis=0)
                lineboxes.pop(i)
                if i < j:
                    # The outer box moved one slot down.
                    j -= 1
                # Re-examine the element that slid into position i.
                i -= 1
            i += 1
        j += 1

    return mergedboxes, lineboxes
def linerectretriever(image, divider=2, sublines=False):
    """Dispatch to the sub-line or first-level line rectangle retriever.

    When ``sublines`` is truthy ``sublinerectretriever`` is used, otherwise
    ``l1linerectretriever``; ``image`` and ``divider`` are passed through and
    the chosen function's result is returned unchanged.
    """
    retriever = sublinerectretriever if sublines else l1linerectretriever
    return retriever(image, divider=divider)
def lineimagemaker(thresholded, divider=2, sublines=False):
    """Cut one image per detected text line out of a grayscale image.

    Line rectangles and their letter boxes come from ``linerectretriever``.
    For each line, everything outside that line's letter boxes is painted
    white, the result is cropped to the merged line box, and a 3x3
    morphological close is applied.

    Args:
        thresholded: single-channel 8-bit image.
        divider: forwarded to ``linerectretriever``.
        sublines: forwarded to ``linerectretriever``.

    Returns:
        A list of line images ordered top to bottom. Lines whose crop is
        empty are skipped; the list is empty when no valid line exists.
    """
    mergedboxes, originalboxes = linerectretriever(thresholded, divider=divider, sublines=sublines)

    # Order lines from top to bottom by the y coordinate of each merged box.
    ordering = (mergedboxes[:,1]).argsort()

    # Drop the leading run of boxes whose x is -1 (treated here as "no box";
    # presumably what mf.mergerects yields for an empty cluster — TODO
    # confirm). Starting from len(ordering) means that when every box is
    # invalid all of them are dropped rather than all of them kept.
    goodpoint = len(ordering)
    for position, boxindex in enumerate(ordering):
        if mergedboxes[boxindex][0] != -1:
            goodpoint = position
            break
    ordering = ordering[goodpoint:]

    mergedboxes = mergedboxes[ordering]
    originalboxes = [originalboxes[i] for i in ordering]

    # Loop-invariant helpers.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    whitebackground = np.full(thresholded.shape, fill_value=255, dtype=np.uint8)

    lineimages = []
    for box, letterboxes in zip(mergedboxes, originalboxes):
        # Mask is 255 inside this line's letter boxes, 0 elsewhere.
        mask = np.zeros(thresholded.shape, dtype=np.uint8)
        for lb in letterboxes:
            mask = cv2.rectangle(mask, (lb[0],lb[1]), (lb[0]+lb[2], lb[1]+lb[3]), (255,255,255), thickness=cv2.FILLED)

        invertedmask = cv2.bitwise_not(mask)
        whitedscreen = cv2.bitwise_and(whitebackground, whitebackground, mask=invertedmask)
        lineimage = cv2.bitwise_and(thresholded, thresholded, mask=mask)

        # Clamp the crop to the image: a negative start would be read by
        # NumPy as an index counted from the end and select the wrong region.
        top = max(box[1], 0)
        left = max(box[0], 0)
        bottom = max(box[1]+box[3], 0)
        right = max(box[0]+box[2], 0)
        lineimage = cv2.bitwise_or(whitedscreen, lineimage)[top:bottom, left:right]
        if lineimage.size == 0:
            continue

        lineimage = cv2.morphologyEx(lineimage, cv2.MORPH_CLOSE, kernel, iterations=1)
        lineimages.append(lineimage)
    return lineimages
def ismultiline(img):
    """Guess whether an image holds more than one line of text.

    The tallest bounding box among the external contours of a Canny edge map
    of the 3x3-dilated image is taken as the glyph height. The image counts as
    multi-line when its height exceeds 1.5 times that value plus 100 px (the
    100 presumably accounts for 50 px of padding above and below — TODO
    confirm against the caller).

    Returns:
        ``True`` for multi-line, ``False`` otherwise, including when no
        contour is found (the previous code raised ``ValueError`` there).
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
    reducedimage = cv2.morphologyEx(img, cv2.MORPH_DILATE, kernel)
    canny = cv2.Canny(reducedimage, 0, 500, None, 3)
    contours, _ = cv2.findContours(canny, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(contours) == 0:
        return False

    # boundingRect returns (x, y, w, h); index 3 is the height.
    heightdetermination = max(cv2.boundingRect(contour)[3] for contour in contours)
    return bool(img.shape[0] > (heightdetermination*1.5) + (2*50))
### actual function
def lineisolator(image):
    """Split a BGR image into cleaned, deskewed single-line images.

    Steps:
      1. Convert to gray and extract first-level line images.
      2. Deskew each line image and split it again into sub-lines, using a
         larger divider (2.5 instead of 1.5) when ``ismultiline`` reports
         several lines.
      3. Deskew each resulting image once more, then dilate and open it with
         an elliptical kernel sized from the image height.

    Returns:
        A list of single-channel line images.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    lineimages = lineimagemaker(gray, 1.5, False)

    finallineimages = []
    for lineimage in lineimages:
        deskewedlineimage = minboxdeskew(lineimage, fill=255)
        # A larger divider gives a smaller clustering epsilon, so stacked
        # lines are less likely to be merged into one cluster.
        divider = 2.5 if ismultiline(deskewedlineimage) else 1.5
        finallineimages += lineimagemaker(deskewedlineimage, divider, True)

    for i, lineimage in enumerate(finallineimages):
        deskewedli = minboxdeskew(lineimage, fill=255)
        # Kernel size is derived from the image height minus 100 px
        # (presumably the 50 px padding minboxdeskew adds on each side —
        # TODO confirm). Clamp to 1 so short lines do not request a zero or
        # negative kernel size.
        dim = max(1, int((deskewedli.shape[0]-100)//20))
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dim, dim))
        deskewedli = cv2.morphologyEx(deskewedli, cv2.MORPH_DILATE, kernel, iterations=1)
        finallineimages[i] = cv2.morphologyEx(deskewedli, cv2.MORPH_OPEN, kernel)

    return finallineimages

View File

@ -1,511 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import numpy as np\n",
"\n",
"\n",
"import scipy.stats as st\n",
"import math\n",
"\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.10/dist-packages/torchvision/datapoints/__init__.py:12: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n",
"/usr/local/lib/python3.10/dist-packages/torchvision/transforms/v2/__init__.py:54: UserWarning: The torchvision.datapoints and torchvision.transforms.v2 namespaces are still Beta. While we do not expect major breaking changes, some APIs may still change according to user feedback. Please submit any feedback you may have in this issue: https://github.com/pytorch/vision/issues/6753, and you can also check out https://github.com/pytorch/vision/issues/7319 to learn more about the APIs that we suspect might involve future changes. You can silence this warning by calling torchvision.disable_beta_transforms_warning().\n",
" warnings.warn(_BETA_TRANSFORMS_WARNING)\n"
]
}
],
"source": [
"import sys\n",
"sys.path.insert(0, '../../autocropper')\n",
"import myfunctions as mf\n",
"\n",
"import extractorfunctions as ef\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"pathname = \"../test_images/\"\n",
"filename = \"IMG_7640.jpg\"\n",
"# pathname = \"../temp/\"\n",
"# filename = \"test.jpg\"\n",
"# pathname = \"../result_images/\"\n",
"# filename = \"13.jpg\"\n",
"\n",
"# print(pathname+filename)\n",
"img = cv2.imread(pathname+filename)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# import easyocr\n",
"# reader = easyocr.Reader(['en'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def l1linerectretriever(image, divider=2):\n",
" shape = image.shape\n",
"\n",
" imgcopy = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)\n",
" # return imgcopy\n",
" \n",
" kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))\n",
" linekernel = cv2.getStructuringElement(cv2.MORPH_RECT, (shape[1]//40, 1))\n",
" # reducedimage = image\n",
" reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel, iterations=1)\n",
" # reducedimage = cv2.morphologyEx(reducedimage, cv2.MORPH_ERODE, kernel)\n",
" # return reducedimage\n",
" \n",
" charcanny = cv2.Canny(reducedimage, 0, 500, None, 3)\n",
" # return canny\n",
" \n",
" \n",
" lettercontours, heirarchy = cv2.findContours(charcanny,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
" # contours, heirarchy = cv2.findContours(255-reducedimage,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
"\n",
" # imgcopy = cv2.drawContours(imgcopy, lettercontours, -1, color=(0,255,0), thickness=1)\n",
" # return imgcopy\n",
"\n",
" letterboxes = np.empty((len(lettercontours), 4), dtype=int)\n",
" \n",
" for i, contour in enumerate(lettercontours):\n",
" b = list(cv2.boundingRect(contour))\n",
" # b[0] -= (kernel.shape[0]-1)\n",
" # b[1] -= (kernel.shape[1]-1)\n",
" # b[2] += (2*kernel.shape[0]-1)\n",
" # b[3] += (2*kernel.shape[1]-1)\n",
" letterboxes[i] = b\n",
" # imgcopy = cv2.rectangle(imgcopy, (b[0],b[1]), (b[0]+b[2], b[1]+b[3]), 128, thickness=3)\n",
" # return imgcopy\n",
" \n",
" epsilonvalue = np.median(letterboxes, axis=0)[3]/divider\n",
" # print(epsilonvalue)\n",
"\n",
"\n",
"\n",
" linemade = 255-cv2.morphologyEx(255-image, cv2.MORPH_DILATE, linekernel)\n",
" # return linemade\n",
"\n",
" linecanny = cv2.Canny(linemade, 0, 500, None, 3)\n",
" linecontours, heirarchy = cv2.findContours(linecanny,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
"\n",
" # imgcopy = cv2.drawContours(imgcopy, linecontours, -1, color=(0,255,0), thickness=1)\n",
" # return imgcopy\n",
" # for i, contour in enumerate(linecontours):\n",
" # k = i+1\n",
" # colour = ((k*23123)%255, (k*8654)%255, (k*45242)%255)\n",
" # imgcopy = cv2.drawContours(imgcopy, [contour], -1, colour, thickness=1)\n",
" # return imgcopy\n",
"\n",
"\n",
"\n",
" lineboxes = np.empty((len(linecontours), 4), dtype=int)\n",
" \n",
" for i, contour in enumerate(linecontours):\n",
" b = list(cv2.boundingRect(contour))\n",
" # b[0] -= (kernel.shape[0]-1)\n",
" # b[1] -= (kernel.shape[1]-1)\n",
" # b[2] += (2*kernel.shape[0]-1)\n",
" # b[3] += (2*kernel.shape[1]-1)\n",
" lineboxes[i] = b\n",
" # imgcopy = cv2.rectangle(imgcopy, (b[0],b[1]), (b[0]+b[2], b[1]+b[3]), (0,255,0), thickness=3)\n",
" # return imgcopy\n",
"\n",
" linelabels = ef.dbscan(lineboxes, epsilonvalue, 1)\n",
" # print(linelabels)\n",
" numclusters = max(linelabels)\n",
"\n",
" letterboxesbyline = [[] for _ in range(numclusters)]\n",
"\n",
" for i, linebox in enumerate(lineboxes):\n",
" for j, letterbox in enumerate(letterboxes):\n",
" if ef.containsamount(linebox, letterbox, 0.9):\n",
" letterboxesbyline[linelabels[i]-1].append(letterbox.tolist())\n",
"\n",
" # print(len(letterboxesbyline))\n",
"\n",
"\n",
" # # COLOUR THE RECTANGLES GROUPED\n",
" # for i, setofboxes in enumerate(letterboxesbyline):\n",
" # k = i+1\n",
" # colour = ((k*23123)%255, (k*8654)%255, (k*45242)%255)\n",
" # # print(colour)\n",
" # # b = lineboxes[i]\n",
" # # imgcopy = cv2.rectangle(imgcopy, (b[0],b[1]), (b[0]+b[2], b[1]+b[3]), colour, thickness=3)\n",
" # print(i)\n",
" # for b in setofboxes:\n",
" # print(i)\n",
" # imgcopy = cv2.rectangle(imgcopy, (b[0],b[1]), (b[0]+b[2], b[1]+b[3]), colour, thickness=3)\n",
" # return imgcopy\n",
"\n",
" mergedboxes = np.empty((numclusters,4), dtype=int)\n",
"\n",
" tobedeleted = []\n",
"\n",
" for i in range(numclusters):\n",
" b = mf.mergerects(letterboxesbyline[i])\n",
" # if (b[0] == -1):\n",
" # tobedeleted.append(i)\n",
" mergedboxes[i] = b\n",
"\n",
" # if (tobedeleted != []):\n",
" # # print(\"hi\")\n",
" # mergedboxes = np.delete(mergedboxes, tobedeleted, axis=0)\n",
" # letterboxesbyline = [ele for idx, ele in enumerate(letterboxesbyline) if idx not in tobedeleted]\n",
"\n",
" return mergedboxes, letterboxesbyline\n",
"\n",
"\n",
"\n",
"def sublinerectretriever(image, divider=2):\n",
" shape = image.shape\n",
" \n",
" imgcopy = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)\n",
" # return imgcopy\n",
" \n",
" kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))\n",
" # reducedimage = image\n",
" reducedimage = cv2.morphologyEx(image, cv2.MORPH_DILATE, kernel, iterations=1)\n",
" # reducedimage = cv2.morphologyEx(reducedimage, cv2.MORPH_ERODE, kernel)\n",
" # return reducedimage\n",
" \n",
" canny = cv2.Canny(reducedimage, 0, 500, None, 3)\n",
" # return canny\n",
" \n",
" \n",
" contours, heirarchy = cv2.findContours(canny,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
" # contours, heirarchy = cv2.findContours(255-reducedimage,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)\n",
"\n",
" # imgcopy = cv2.drawContours(imgcopy, contours, -1, color=(0,255,0), thickness=1)\n",
" # return imgcopy\n",
"\n",
" boundingboxes = np.empty((len(contours), 4), dtype=int)\n",
" \n",
" for i, contour in enumerate(contours):\n",
" b = list(cv2.boundingRect(contour))\n",
" b[0] -= (kernel.shape[0]-1)\n",
" b[1] -= (kernel.shape[1]-1)\n",
" b[2] += (2*kernel.shape[0]-1)\n",
" b[3] += (2*kernel.shape[1]-1)\n",
" boundingboxes[i] = b\n",
" # imgcopy = cv2.rectangle(imgcopy, (b[0],b[1]), (b[0]+b[2], b[1]+b[3]), 128, thickness=3)\n",
" # return imgcopy\n",
" \n",
" epsilonvalue = np.median(boundingboxes, axis=0)[3]/divider\n",
" # print(epsilonvalue)\n",
" \n",
" labels = ef.dbscan(boundingboxes, epsilonvalue, 1)\n",
" # print(labels)\n",
" numclusters = max(labels)\n",
" lineboxes = [[] for _ in range(numclusters)]\n",
"\n",
" for i, item in enumerate(labels):\n",
" lineboxes[item-1].append(boundingboxes[i].tolist())\n",
" \n",
" \n",
" # # COLOUR THE RECTANGLES GROUPED\n",
" # for i, setofboxes in enumerate(lineboxes):\n",
" # k = i+1\n",
" # colour = ((k*23123)%255, (k*8654)%255, (k*45242)%255)\n",
" # # print(colour)\n",
" # for b in setofboxes:\n",
" # imgcopy = cv2.rectangle(imgcopy, (b[0],b[1]), (b[0]+b[2], b[1]+b[3]), colour, thickness=3)\n",
" # return imgcopy\n",
" \n",
" \n",
" mergedboxes = np.empty((numclusters,4), dtype=int)\n",
" \n",
" \n",
" for i in range(numclusters):\n",
" b = mf.mergerects(lineboxes[i])\n",
" mergedboxes[i] = b\n",
" \n",
" j = 0\n",
" while (j < len(mergedboxes)):\n",
" i = 0\n",
" while (i < len(mergedboxes)):\n",
" if (i == j):\n",
" i += 1\n",
" continue\n",
" outerbox = mergedboxes[j]\n",
" innerbox = mergedboxes[i]\n",
" if ef.containsamount(outerbox, innerbox, 1) or ef.aboveandbelow(outerbox, innerbox) or innerbox[3] < epsilonvalue:\n",
" mergedboxes = np.delete(mergedboxes, i, axis=0)\n",
" lineboxes.pop(i)\n",
" if (i < j):\n",
" j -= 1\n",
" i -= 1\n",
" i += 1\n",
" j += 1\n",
" \n",
" return mergedboxes, lineboxes\n",
"\n",
"def linerectretriever(image, divider=2, sublines=False):\n",
"\n",
" if (sublines):\n",
" return sublinerectretriever(image, divider=divider)\n",
" else:\n",
" return l1linerectretriever(image, divider=divider)\n",
"\n",
"\n",
"def lineimagemaker(thresholded, divider=2, sublines=False):\n",
" lineimages = []\n",
" mergedboxes, originalboxes = linerectretriever(thresholded, divider=divider, sublines=sublines)\n",
" # print(mergedboxes)\n",
" # print(originalboxes)\n",
" # return thresholded\n",
" \n",
" mergedboxesordering = (mergedboxes[:,1]).argsort() # sorted by y value (aka lines from top to bottom)\n",
" # print(mergedboxesordering)\n",
" \n",
" goodpoint = 0\n",
" for i, item in enumerate(mergedboxesordering):\n",
" if (mergedboxes[item][0] != -1):\n",
" goodpoint = i\n",
" break\n",
" mergedboxesordering = mergedboxesordering[goodpoint:]\n",
"\n",
" mergedboxes = mergedboxes[mergedboxesordering]\n",
" originalboxes = [originalboxes[i] for i in mergedboxesordering]\n",
" out = cv2.cvtColor(thresholded.copy(), cv2.COLOR_GRAY2BGR)\n",
" # lineimages.append(out)\n",
" for i, box in enumerate(mergedboxes):\n",
" # print(box)\n",
" mask = np.zeros(thresholded.shape, dtype=np.uint8)\n",
" whitebackground = np.full(thresholded.shape, fill_value=255, dtype=np.uint8)\n",
" # print(originalboxes[i])\n",
" for lb in originalboxes[i]:\n",
" mask = cv2.rectangle(mask, (lb[0],lb[1]), (lb[0]+lb[2], lb[1]+lb[3]), (255,255,255), thickness=cv2.FILLED)\n",
"\n",
" # lineimages[0] = cv2.rectangle(lineimages[0], (box[0],box[1]), (box[0]+box[2], box[1]+box[3]), (0,255,0), thickness=1)\n",
"\n",
" invertedmask = cv2.bitwise_not(mask)\n",
" whitedscreen = cv2.bitwise_and(whitebackground, whitebackground, mask=invertedmask)\n",
" lineimage = cv2.bitwise_and(thresholded, thresholded, mask=mask)\n",
" lineimage = cv2.bitwise_or(whitedscreen, lineimage)[box[1]:box[1]+box[3], box[0]:box[0]+box[2]]\n",
" # lineimage = mf.externaldeskew(lineimage, fill=(255,255,255), alreadygray=True)\n",
" # lineimage = thresholded[box[1]:box[1]+box[3], box[0]:box[0]+box[2]]\n",
" kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))\n",
" lineimage = cv2.morphologyEx(lineimage, cv2.MORPH_CLOSE, kernel, iterations=1)\n",
" lineimages.append(lineimage)\n",
" # lineimages.append(mask)\n",
" return lineimages\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def lineisolator(image):\n",
" # imgcopy = image.copy()\n",
" gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n",
" # thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]\n",
" # return gray\n",
" # return thresholded\n",
" thresholded = gray\n",
" \n",
" \n",
" # kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))\n",
" \n",
" \n",
" \n",
" lineimages = lineimagemaker(thresholded, 1.5, False)\n",
" \n",
" # for i, lineimage in enumerate(lineimages):\n",
" # lineimages[i] = cv2.morphologyEx(lineimage, cv2.MORPH_ERODE, kernel)\n",
"\n",
" \n",
" finallineimages = []\n",
" \n",
" for i, lineimage in enumerate(lineimages):\n",
" # if (i == 0):\n",
" # finallineimages.append(lineimages[0])\n",
" # continue\n",
" deskewedlineimage = ef.minboxdeskew(lineimage, fill=255)\n",
"\n",
" # finallineimages.append(deskewedlineimage)\n",
" # print(deskewedlineimage.shape)\n",
"\n",
" if (ef.ismultiline(deskewedlineimage)):\n",
" # print(\"hi\" + str(i))\n",
" templineimages = lineimagemaker(deskewedlineimage, 2.5, True)\n",
" else:\n",
" templineimages = lineimagemaker(deskewedlineimage, 1.5, True)\n",
"\n",
" # templineimages = lineimagemaker(deskewedlineimage, 2)\n",
"\n",
" finallineimages += templineimages\n",
" # finallineimages += templineimages[1:]\n",
"\n",
" for i, lineimage in enumerate(finallineimages):\n",
" deskewedli = ef.minboxdeskew(lineimage, fill=255)\n",
" dim = int((deskewedli.shape[0]-100)//20)\n",
" # print(dim)\n",
" kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (dim, dim))\n",
" deskewedli = cv2.morphologyEx(deskewedli, cv2.MORPH_DILATE, kernel,iterations=1)\n",
" finallineimages[i] = cv2.morphologyEx(deskewedli, cv2.MORPH_OPEN, kernel)\n",
" \n",
" \n",
" # mergedboxes, originalboxes = linerectretriever(thresholded) \n",
" # mask = np.zeros(thresholded.shape, dtype=np.uint8)\n",
" # for i, box in enumerate(mergedboxes):\n",
" # for lb in originalboxes[i]:\n",
" # mask = cv2.rectangle(mask, (lb[0],lb[1]), (lb[0]+lb[2], lb[1]+lb[3]), (255,255,255), thickness=cv2.FILLED)\n",
"\n",
" # return mask\n",
" \n",
" \n",
" # out = tempfunc(thresholded)\n",
" # return out\n",
" \n",
" return finallineimages"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# result = reader.readtext(pathname+filename)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# print(result)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"bing = mf.houghlineprocessing(img)\n",
"# outs = bing\n",
"outs = ef.lineisolator(bing)\n",
"# # gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"# # outs = linerectretriever(gray)\n",
"# outs = getSkewAngle(img)\n",
"# outs = minboxdeskew(img, fill=(255,255,255))\n",
"# bing = cv2.cvtColor(bing, cv2.COLOR_BGR2GRAY)\n",
"# bing = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
"# outs = bing\n",
"# outs = linerectretriever(bing, 1.5, False)\n",
"# outs = lineimagemaker(bing, 1.5, False)\n",
"# for i, _ in enumerate(outs):\n",
"# outs[i] = ef.minboxdeskew(outs[i], fill=255)\n",
"\n",
"# outs = img"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# print(outs)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# for out in outs:\n",
"# if (out.shape[0] > out.shape[1]):\n",
"# cv2.imshow(\"test1\", mf.ResizeWithAspectRatio(out, height=1000))\n",
"# else:\n",
"# cv2.imshow(\"test1\", mf.ResizeWithAspectRatio(out, width=1000))\n",
"# key = cv2.waitKey(0)\n",
"# cv2.destroyAllWindows()\n",
"# if (key == 107):\n",
"# break\n",
"if (isinstance(outs, np.ndarray)):\n",
" if (outs.shape[0] > outs.shape[1]):\n",
" cv2.imshow(\"test\", mf.ResizeWithAspectRatio(outs, height=1350))\n",
" else:\n",
" cv2.imshow(\"test\", mf.ResizeWithAspectRatio(outs, width=1000))\n",
"else:\n",
" for i, out in enumerate(outs):\n",
" # cv2.imwrite(\"../result_images/\"+str(i)+\".jpg\", out)\n",
" if (out.shape[0] > out.shape[1]):\n",
" cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, height=1350))\n",
" else:\n",
" cv2.imshow(\"test\"+str(i), mf.ResizeWithAspectRatio(out, width=1000))\n",
"cv2.waitKey(0)\n",
"cv2.destroyAllWindows()\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# cv2.imwrite(\"../temp/test.jpg\", outs[2])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,260 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TrOCR/Inference_with_TrOCR_%2B_Gradio_demo.ipynb\n",
"# https://github.com/NielsRogge/Transformers-Tutorials/tree/master/TrOCR\n",
"# https://huggingface.co/docs/transformers/model_doc/trocr"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"from transformers import TrOCRProcessor\n",
"from transformers import VisionEncoderDecoderModel\n",
"\n",
"from PIL import Image\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, '../../autocropper')\n",
"import myfunctions as mf\n",
"\n",
"import extractorfunctions as ef\n",
"import cv2"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-printed')\n",
"model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-printed')"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"device = torch.device(\"cpu\")\n",
"if torch.cuda.is_available():\n",
" device = torch.device(\"cuda:0\")\n",
" \n",
"model = model.to(device)\n"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"filename = \"IMG_7640.jpg\"\n",
"pathname = \"../test_images/\""
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"img = cv2.imread(pathname+filename)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"clarified = mf.houghlineprocessing(img)\n",
"lineimages = ef.lineisolator(clarified)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"# print(len(lineimages))"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"PILversions = []\n",
"for line in lineimages:\n",
" rgbline = cv2.cvtColor(line, cv2.COLOR_GRAY2RGB)\n",
" PILversions.append(Image.fromarray(rgbline))"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAogAAAB8CAIAAABlt9bLAAAI4klEQVR4nO3d23ajuBYF0FCj//+XfR7cx52yMVddtrTnfMqopGwQQksSIJbH4/EDAMTwz+q/Lsvy/EFsA0BLy1v0viJ5lZwGgKr+nPrrZVm2kxsAuONcMD+JZwCo5EowAwCVCGYACEQwA0AgghkAAhHMABCIYAaAQNZX/tplpZEaLLgGwMURs+eYAaCGiyNm7tjt1izL+1KpACQhmJs6PtPw+ZeiGiADN3+1Y/4fgF2CuRGpDMARghkAAhHMwzDmBshAMLcgUwE4SDBXJ5UBOE4wA0AgghkAAhHMABCIYAaAQCzJCfDu9z2bq6vhehcc9QjmYTj/oY23Jyk8WEFjgjmix+OhPw6Qk2AO55nE8hggJzd/xSKPAZITzIFIZQAEMwAE8lcwu/kQAPq6OGI26QoANfwXzIbLNShVAE65MmI2XAaAStz8BQCBzLPAyOqkscE9AGOZJ5hXWdgSgLFMHswvEhqAIbjGXJFbsgE4SzBXsSzLhVQW5ABkmcqu6m2e/E6+mnIHSE4wl2TIC5BZkcGVYL5LGAPw83ccfIuGI4EtmOMyrQ2QkGAOquVA/O279AMAKlmWZbeNFcy8e+b0q+qIbYCCdrPZ41IA0NT2nKgRczoHJ8nd1AbQhWAewJFrEgc/p/iHmNkG+Ck6mDGVnUWlEbCBNZnpmFKDYOYu2cw0ri2mC2WZyqaAtxu5U/ndjn+WgNeET8xxpBLBDNdtj66+/dbSMcAGwXzXatsaZzas8UIlkuYn0tEHRiSYq3g8Hjlb5+TZfPagu8V9m6kFchLMUUzT9GS+3nxT8m7NhreSGeXKvY4F1wjmECqdtzlH7VUp0l6e2bxR/tF6hG+beuddQ2QjmGtJO5vNHZnHWLvny5ETarirA3H6E12KbuOYRiiT4y609hszZIJ5Wne6BcHvaOvFnXT11FsAp30xfu7L2S5F1W1eDeCAZ/fBLkuE6xqXS+9VP9965IKZv2xU6ORzAO33PUk21y7YBkPS+7vwOe9dcIN3H+r71hGPX/12n0j8rdTuvHWhbh791UMvmOd0ra7sVtzjtXCIs5o86l0jCN5bvTD/v/rvZcsteKEdV2NHlmX5p96n00ulVH79WcLaknCXZ5Wqyzhfve2+R202IN1a2d2Pa0w1mipFfVOe/ODT/VW7y56A3U/nVMuYnw7mmI1FngNWw9ljGrMOMKKhz9wGURGqfLpE4/Mbg5RDs834c+rLYrbIQY4ZnxwaNqge9dQr2/bxXOrrBqpvbv4i11W3xg6+b2r3f1GKsi0iYaPRMtfTXWPOVpkOOrK2Q6prPPc9Ho/VyqYGdjR94bc8T0dsDUbZ5lwj5ulPyzsOdoFHqdl9bZfk9rLPamklgxasEzOhXMHMNud2d4OGxwVlH+rb/bQuBdvm2cKOp22c9UQnI5hnI1zHpYG7bDsCe6Xy6leXXVQ5guCbNyLBPBWpHMTxpkqLdtCRZemeP6yuBd1Ysy+tsZqQZqS7RDd/aQFpSevWS8cz/fF/239T6usq1bGxmsrdAr/wl90ZMQOzad/+nvrGvuvaHtnUsK+cevPalwZH/PL9DRe+RTBXEb9CNzBK57Qet8YUFPlC5oUN65XNA3UgNoSqCddKaXsXhp/KjllvmMydhkAV5aZQOfSp8eYFL40jdndh+GCewAT1LIMi2Wydljtilpvzt2WtTlLaprI7S1LPoIh6bwi+oPsGnNJga6teuxmrtG8yYoYWDJQpywh1YoIZjtJIFXS/MPN0dPLs6aqBHnMq5Vww6/VDEc6jHx2dQThM7V25xvz53EKEpXaggYIPkER+/mcUr2OhJLcpn7FcnMpe/vb529sbxmlni925ek38lZsGUqowlSQzqXVX9rfzRBgE8W2F/W2av6ewCy8AE2j9uJTAHprD9FIqm0M9/zM066wxjSh3ZRt/MJyEN4vWULYM3a
BKe8XbgSjBDKSlfxOZjk57gjkpJ1spBcfNmQ9KhHHzQOV/alNbrvSuj1VElGB2OBnakRfxHjFQNhRXqgxfThXmrAk0ynbyW5Rgnqk9GmVfrm3nxmNyv397e+tG9fjl2idkLr2n+A+kSbtViqUUL7FI7ewaF9/CuNwWTeXyndvWHvm9+yrYZBzQXVFGzNxXuylfXUnGObbNuPm+mxV7vpIceo/OXh3P2UMVzNkNfZKTxHxXBwJuEpcVP5qCuafincF6vUvtyGU5u/yVzJHNs95ollONqvXnp8LNkIylRsUK1Q525+QqaI5sPk7lmcyRA/rXzV/Ha8CgVZxvHNBvSr05zfLaBSlMgqg0+XHxruwjn+7MYVzfaq/3DAYhm5lYxceldlsubVwNQRqsoY/pkQIs8qRZWkXO/ef/VbDMp+dzzEO33ZF1z+YkR9bTxtf8rpxerlWbWjqiLHdlq5rUcLADZFRHR3fW+Cu+MRwx/Mpfb4n7WZNEMqecbYx2Z2W1bi/Fi0LZMqXhg/mNGH7qPpvNk6MAnDVbMPPSJZszd4xk8IaD99O9fvbcxwbd7ullucYMY8ncxfk5ELrJk+ls9Tj7BszkxdudEXN5uy/GadbmNu5ZJ8+SgmYqyZs18HMYLTOuqX179kyVtjvBXFf3yvragNrNWfc9ZXoi+aZ62ez0L8tUNgU4LYlPLf3RuRmEYM6iRqs05etPuu/RZOV5mSud2yq9zEOZR2AqO5HPae07E93ygyOC3EI8ZXW9XLZlj8iUZXtQpeotmDP6PJEyn1qret1n5EBwSvd+jxpbg6ls+KpxozNrG9f96sDclO18BDOEMH3zOv0OdtSrbB3TSgQzbJnyBrde5Ec97fcxQ6n24hozHFLpqrPWrapUxdvyenOqgm3PiBn6yDkQb7nLinfob8nMiBlOKPVux8xNW5uBXdoSrl28aQu2JcEMBWitTtldT77Uh+dUL5uVbRumsoF5SI6nSiv9Ff9MVhkxAz0Fae5bLinTeDL/zncFOTrZCGaAf0V7JWuR7bncD5DKvQhmgA7C3qAeYW3z5AQzAP+p2mNYvWRgaP6m1nuzASCD3TmGszn7Px23ddef5EmFAAAAAElFTkSuQmCC",
"text/plain": [
"<PIL.Image.Image image mode=RGB size=648x124>"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# PILversions[9]"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"# image = Image.open(\"../result_images/6.jpg\").convert(\"RGB\")\n",
"# image"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"# pixel_values = processor(image, return_tensors=\"pt\").pixel_values\n",
"# # print(pixel_values.shape)\n",
"# # print(image)\n",
"# # print(pixel_values)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"# pixel_values = processor(image, return_tensors=\"pt\").pixel_values\n",
"# # print(pixel_values.shape)\n",
"# generated_ids = model.generate(pixel_values)\n",
"# generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
"# print(generated_text)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"finalstring = \"\""
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"for image in PILversions:\n",
" pixel_values = processor(image, return_tensors=\"pt\").pixel_values\n",
" pixel_values = pixel_values.to(device)\n",
" generated_ids = model.generate(pixel_values)\n",
" generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
" finalstring = finalstring + generated_text + \"\\n\"\n",
" # print(generated_text)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WALKER'S\n",
"CHOCOLATES\n",
"NO RETURNS OR EXCHANGES\n",
"ON FOOD ITEMS.\n",
"REG 09-22-2023 12:08\n",
"000021\n",
"1 BAKING NT $14.40\n",
"TL $14.40\n",
"CREDIT : $14.40\n",
"LIFE S SHORT\n",
"EAT CHOCOLATE\n",
"\n"
]
}
],
"source": [
"print(finalstring)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Some files were not shown because too many files have changed in this diff Show More