python-3.x · scikit-learn · scikit-image · mahotas

HOG handwritten digit recognition not working


I was going through a chapter in an OpenCV book about handwritten digit recognition, and although I think everything was processed correctly, I get an error saying "Expected 2D array, got 1D array instead". I've searched for an answer, and it seems a lot of other people have run into a very similar issue, but with no real answer provided.

Would anyone be able to clarify why this feature.hog() method is not returning a 2D array? According to the documentation, it returns a flat 1D array by default, so I don't know why model.predict() complains that it expects a 2D array. Then again, the book I am following was released in 2015, I think, so maybe something has changed since?

This is the file I am trying to run:

classify.py

# -*- coding: utf-8 -*-
"""
Created on Tue Nov  3 13:01:39 2020

@author: User
"""


from __future__ import print_function
from sklearn.externals import joblib
from pyimagesearch.hog import HOG
from pyimagesearch import dataset
import argparse
import mahotas
import cv2

# Command-line interface: path to the serialized model and to the image that
# contains the digits to classify.
ap = argparse.ArgumentParser()
ap.add_argument('-m', '--model', required=True, help='Path to model')
ap.add_argument('-i', '--image', required=True, help='Path to image')
args = vars(ap.parse_args())

# Load the classifier that was previously dumped with joblib.
model = joblib.load(args['model'])

# NOTE(review): train.py builds its descriptor with orientations=9 and
# pixelsPerCell=(14,14) on 20x20 images, whereas this script uses
# orientations=18 and pixelsPerCell=(10,10) on 72x72 images. The feature
# vector produced here must have the same length as the vectors the model was
# fit on -- confirm which settings are intended.
hog = HOG(orientations=18, pixelsPerCell=(10,10),
          cellsPerBlock=(1,1), normalize=True)

image = cv2.imread(args["image"])
# cv2.imread signals an unreadable/missing file by returning None instead of
# raising, which would otherwise surface as an obscure cvtColor error.
if image is None:
    ap.error("Could not read image: {}".format(args["image"]))
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Blur, detect edges, then find the outer contours of the edge map.
blurred = cv2.GaussianBlur(gray, (5, 5), 0)
edged = cv2.Canny(blurred, 30, 150)
found = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,
                         cv2.CHAIN_APPROX_SIMPLE)
# OpenCV 3 returns (image, contours, hierarchy); OpenCV 2.4 and 4+ return
# (contours, hierarchy). Pick the contour list in either case.
cnts = found[0] if len(found) == 2 else found[1]
# Pair every contour with the x coordinate of its bounding box and sort on it,
# so the contours are visited from left to right.
cnts = sorted([(c, cv2.boundingRect(c)[0]) for c in cnts], key=lambda x: x[1])

for (c, _) in cnts:
    (x, y, w, h) = cv2.boundingRect(c)

    # Skip regions smaller than 7x20 pixels.
    if w >= 7 and h >= 20:
        roi = gray[y:y+h, x:x+w]
        thresh = roi.copy()
        # Otsu threshold: pixels brighter than T are set to white, then the
        # whole region is inverted.
        T = mahotas.thresholding.otsu(roi)
        thresh[thresh > T] = 255
        thresh = cv2.bitwise_not(thresh)

        # Same preprocessing helpers that train.py applies to its images.
        thresh = dataset.deskew(thresh, 72)
        thresh = dataset.center_extent(thresh, (72, 72))

        cv2.imshow("thresh", thresh)

        hist = hog.describe(thresh)
        # scikit-learn estimators expect X of shape (n_samples, n_features).
        # The descriptor is a flat 1-D vector, so present it as a single row;
        # passing it as-is fails with "Expected 2D array, got 1D array".
        digit = model.predict(hist.reshape(1, -1))[0]
        print("I think that number is: {}".format(digit))

        # Draw the bounding box and the predicted label, then wait for a key
        # press before moving on to the next region.
        cv2.rectangle(image, (x, y), (x+w, y+h), (0, 255, 0), 1)
        cv2.putText(image, str(digit), (x-10, y-10),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2)
        cv2.imshow("image", image)
        cv2.waitKey(0)

And here is the custom hog module that was written for this:

hog.py

# -*- coding: utf-8 -*-
"""
Created on Tue Nov  3 11:22:38 2020

@author: User
"""

from skimage import feature

class HOG:
    """Holds one set of HOG settings and applies them via ``feature.hog``."""

    def __init__(self, orientations=9, pixelsPerCell=(14,14), cellsPerBlock=(1,1),
                 normalize=False):
        # The settings are stored untouched; describe() forwards them to
        # feature.hog on every call.
        self.normalize = normalize
        self.cellsPerBlock = cellsPerBlock
        self.pixelsPerCell = pixelsPerCell
        self.orientations = orientations

    def describe(self, image):
        """Compute the HOG descriptor of ``image`` with the stored settings.

        Returns whatever ``feature.hog`` returns for these arguments.

        Version notes (2017-11-28):

        * Since scikit-image 0.12 the former ``normalise`` argument is called
          ``transform_sqrt``; it performs the same operation under a new
          name. On releases older than 0.12, pass ``normalise`` instead.
        * In scikit-image 0.15 the default ``block_norm`` changed from
          ``"L1"`` to ``"L2-Hys"``. ``block_norm="L1"`` is therefore passed
          explicitly so that a library upgrade cannot silently change the
          descriptor. Background on the L1 and L2 norms:
          https://gurus.pyimagesearch.com/lesson-sample-histogram-of-oriented-gradients-and-car-logo-recognition/#tour_modal
        """
        settings = dict(
            orientations=self.orientations,
            pixels_per_cell=self.pixelsPerCell,
            cells_per_block=self.cellsPerBlock,
            transform_sqrt=self.normalize,
            block_norm="L1",
        )
        return feature.hog(image, **settings)

Here is the script that produced the trained model for this:

train.py

# -*- coding: utf-8 -*-
"""
Created on Tue Nov  3 11:57:26 2020

@author: User
"""

from sklearn.externals import joblib
from sklearn.svm import LinearSVC
from pyimagesearch.hog import HOG
from pyimagesearch import dataset
import argparse

# Command-line interface: where to read the digit dataset from and where to
# write the fitted model to.
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--dataset', required=True, help='Path to dataset')
parser.add_argument('-m', '--model', required=True, help='path to where model will be stored')
args = vars(parser.parse_args())

(digits, target) = dataset.load_digits(args['dataset'])

hog = HOG(orientations=9, pixelsPerCell=(14,14),
          cellsPerBlock=(1,1), normalize=True)

# One HOG descriptor per digit image: deskew to width 20, centre inside a
# 20x20 canvas, then describe.
data = [
    hog.describe(dataset.center_extent(dataset.deskew(digit, 20), (20, 20)))
    for digit in digits
]

# Fit a linear SVM on the descriptors and persist it to the requested path.
model = LinearSVC(random_state=42)
model.fit(data, target)

joblib.dump(model, args['model'])

dataset.py

# -*- coding: utf-8 -*-
"""
Created on Tue Nov  3 11:35:04 2020

@author: User
"""

from . import imutils
import numpy as np
import mahotas
import cv2

def load_digits(datasetPath):
    """Read a comma-separated digit file into images and labels.

    Every row is parsed as ``uint8``. The first column of a row is taken as
    the label and the remaining columns are reshaped into a 28x28 image.

    Returns a ``(data, target)`` tuple: ``data`` has shape
    ``(rows, 28, 28)`` and ``target`` holds the first column.
    """
    table = np.genfromtxt(datasetPath, delimiter=',', dtype='uint8')
    labels = table[:, 0]
    pixels = table[:, 1:]
    images = pixels.reshape(table.shape[0], 28, 28)
    return (images, labels)

def deskew(image, width):
    """Shear ``image`` to remove its skew, then resize it to ``width``.

    The skew is estimated from the image moments as ``mu11 / mu02`` and
    undone with an affine shear; the result is resized to the requested
    width via ``imutils.resize``.

    :param image: image array to deskew
    :param width: target width passed to ``imutils.resize``
    :return: the deskewed, resized image
    """
    (h, w) = image.shape[:2]
    moments = cv2.moments(image)

    # mu02 is (near) zero for a blank image or one whose mass sits on a single
    # row. The skew is undefined there and the division below would raise
    # ZeroDivisionError, so skip the shear and only resize.
    if abs(moments['mu02']) < 1e-2:
        return imutils.resize(image, width=width)

    skew = moments['mu11'] / moments['mu02']
    M = np.float32([
            [1, skew, -0.5 * w * skew],
            [0, 1, 0]])
    # WARP_INVERSE_MAP: M is used as the destination -> source mapping.
    image = cv2.warpAffine(image, M, (w, h),
                   flags = cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)

    image = imutils.resize(image, width=width)

    return image

def center_extent(image, size):
    """Fit ``image`` into a ``size`` canvas and centre it on its mass.

    The image is resized along its longer side (width if it is wider than
    tall, otherwise height), pasted into the middle of a zero-filled
    ``uint8`` canvas of ``size = (width, height)``, and the canvas is then
    translated so that its centre of mass lands on the canvas centre.

    :param image: image array to place on the canvas
    :param size: ``(width, height)`` of the output canvas
    :return: the canvas after translation
    """
    (eW, eH) = size

    wider_than_tall = image.shape[1] > image.shape[0]
    if wider_than_tall:
        scaled = imutils.resize(image, width=eW)
    else:
        scaled = imutils.resize(image, height=eH)

    # Paste the resized image into the middle of an empty canvas.
    (sH, sW) = scaled.shape[:2]
    canvas = np.zeros((eH, eW), dtype = 'uint8')
    left = (eW - sW) // 2
    top = (eH - sH) // 2
    canvas[top:top + sH, left:left + sW] = scaled

    # Centre of mass comes back as (row, column); shift it onto the canvas
    # centre.
    (cY, cX) = np.round(mahotas.center_of_mass(canvas)).astype('int32')
    shiftX = (size[0] // 2) - cX
    shiftY = (size[1] // 2) - cY
    shift = np.float32([[1, 0, shiftX], [0, 1, shiftY]])

    return cv2.warpAffine(canvas, shift, size)
           

And in case it's needed, here is the custom imutils module:

imutils.py

# -*- coding: utf-8 -*-
"""
Created on Tue Sep 29 16:27:16 2020

@author: User
"""

import numpy as np
import cv2

def translate(image, x, y):
    """Translate ``image`` using the affine matrix ``[[1, 0, x], [0, 1, y]]``.

    The output has the same width and height as the input.
    """
    shift = np.float32([[1, 0, x], [0, 1, y]])
    (rows, cols) = image.shape[:2]
    return cv2.warpAffine(image, shift, (cols, rows))

def rotate(image, angle, center=None, scale=1.0):
    """Rotate ``image`` by ``angle`` about ``center``.

    :param image: image array to rotate
    :param angle: rotation angle passed to ``cv2.getRotationMatrix2D``
    :param center: ``(x, y)`` rotation centre; defaults to the image centre
        ``(w // 2, h // 2)`` when ``None``
    :param scale: scale factor passed to ``cv2.getRotationMatrix2D``
    :return: the rotated image, same width and height as the input
    """
    (h, w) = image.shape[:2]
    # Compare against None explicitly: a truthiness test raises on array-like
    # centres and would silently replace other falsy values.
    if center is None:
        center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, scale)
    rotated = cv2.warpAffine(image, M, (w, h))
    return rotated

def resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    """Resize ``image`` to a given width or height, keeping its aspect ratio.

    If neither ``width`` nor ``height`` is given the image is returned
    unchanged. If ``width`` is given it determines the scale (``height`` is
    then not used); otherwise ``height`` does.

    :param image: image array to resize
    :param width: target width in pixels, or ``None``
    :param height: target height in pixels, or ``None``
    :param inter: interpolation flag passed to ``cv2.resize``
    :return: the resized image
    """
    (src_h, src_w) = image.shape[:2]

    if width is None and height is None:
        return image

    if width is not None:
        ratio = width / float(src_w)
        target = (width, int(src_h * ratio))
    else:
        ratio = height / float(src_h)
        target = (int(src_w * ratio), height)

    return cv2.resize(image, target, interpolation = inter)

I am using the data found here (the train.csv file), which I reduced to 5000 rows via this script:

import pandas as pd

# Load the full training CSV, keep its first 5000 rows and write them out
# (without the index column) as the smaller dataset file.
full_table = pd.read_csv('C:/Users/User/Downloads/digit-recognizer/train.csv', low_memory=False)
full_table.head(5000).to_csv(path_or_buf='data/digits.csv', index=False)
print('successfully wrote a smaller file!')

Solution

  • I figured this out a while ago but only now got a chance to post, so I wanted to share my findings.

    I think that when I was trying to "reshape" the array, I was actually reshaping the WRONG array, and that's why it kept giving me an error.

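    So, to convert the 1D array I had into a 2D array, I took this line: digit = model.predict(hist)[0]

    and changed it to this: digit = model.predict(hist.reshape(1,-1))[0] (scikit-learn expects input of shape (n_samples, n_features), so a single sample has to be passed as one row).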