RCNN

Posted by neverset on September 13, 2020

RCNN stands for Region-based Convolutional Neural Network

types of RCNN

Fast RCNN

In R-CNN, feature extraction is performed separately for each region proposal, whereas in Fast R-CNN the features are extracted only once for the whole image and each region proposal is then pooled from the shared feature map.
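
As a rough illustration of this idea (a sketch, not part of the Mask_RCNN project used below), torchvision's RoI pooling can pool a fixed-size feature for every proposal from a feature map that is computed only once per image; the backbone, image size and proposals here are placeholder assumptions:

# minimal sketch: features are computed once, then pooled per region proposal
import torch
import torchvision

# truncated ResNet-18 as a backbone (untrained weights are fine for this shape illustration)
backbone = torchvision.models.resnet18()
feature_extractor = torch.nn.Sequential(*list(backbone.children())[:-2])

image = torch.randn(1, 3, 512, 512)               # dummy input image
features = feature_extractor(image)               # computed once: [1, 512, 16, 16]

# two hypothetical region proposals in image coordinates (x1, y1, x2, y2)
proposals = [torch.tensor([[ 32.,  32., 256., 256.],
                           [128.,  64., 480., 400.]])]

# pool a fixed-size feature for each proposal from the shared feature map
pooled = torchvision.ops.roi_pool(features, proposals,
                                  output_size=(7, 7),
                                  spatial_scale=1 / 32)  # image coords -> feature map stride
print(pooled.shape)                               # torch.Size([2, 512, 7, 7])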

Faster RCNN

Faster R-CNN replaces the external region proposal algorithm (selective search) with a Region Proposal Network (RPN). It is a two-stage object detector: the RPN first identifies regions of interest, which are then passed to the detection head for classification and bounding-box refinement.
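
A minimal end-to-end sketch of this two-stage pipeline, using torchvision's pretrained Faster R-CNN rather than the mrcnn package used later (the random input image is only a placeholder):

# the RPN proposes regions internally; the second stage classifies and refines them
import torch
import torchvision

model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

image = torch.rand(3, 480, 640)                   # dummy RGB image with values in [0, 1]
with torch.no_grad():
    prediction = model([image])[0]                # one dict per input image

print(prediction['boxes'].shape)                  # [N, 4] refined boxes (x1, y1, x2, y2)
print(prediction['labels'][:5])                   # COCO class ids
print(prediction['scores'][:5])                   # confidence scores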

Mask-RCNN

Mask R-CNN is an instance segmentation model that extends Faster R-CNN with a branch predicting a segmentation mask for each detected object. The matterport implementation can be cloned from GitHub:

git clone https://github.com/matterport/Mask_RCNN.git

installation and usage

prediction

model.detect() returns a list with one result dictionary per input image; each dictionary contains:

rois: The bounding boxes around each detected object.
class_ids: The class IDs of the detected objects.
scores: The class scores for each object.
masks: The segmentation masks for the objects.
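
For example, assuming r holds the first element of the list returned by model.detect() (as in the code below), these fields can be inspected and filtered by score:

# sketch: r = model.detect([image], verbose=0)[0]
import numpy as np

print(r['rois'].shape)        # (N, 4) boxes as (y1, x1, y2, x2)
print(r['class_ids'].shape)   # (N,)
print(r['scores'].shape)      # (N,)
print(r['masks'].shape)       # (height, width, N) boolean masks

# keep only confident detections, e.g. score >= 0.9
keep = np.where(r['scores'] >= 0.9)[0]
confident_rois = r['rois'][keep]
confident_ids = r['class_ids'][keep]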

! python setup.py install
#config model
import mrcnn.config
class SimpleConfig(mrcnn.config.Config):
    #a name is required by mrcnn for its log directory
    NAME = "coco_inference"
    #number of classes: the 80 COCO classes plus one for the background
    NUM_CLASSES = 81
    #batch size is calculated by BATCH_SIZE = IMAGES_PER_GPU * GPU_COUNT
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
#create instance
import os
import mrcnn.model
model = mrcnn.model.MaskRCNN(mode="inference", 
                             config=SimpleConfig(),
                             model_dir=os.getcwd())
#check model architecture
model.keras_model.summary()
#load COCO pretrained weights (mask_rcnn_coco.h5 is available from the Mask_RCNN releases page)
model.load_weights(filepath="mask_rcnn_coco.h5", 
                   by_name=True)
#read images
import cv2
image = cv2.imread("3627527276_6fe8cd9bfe_z.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#Detect Objects
r = model.detect(images=[image], 
             verbose=0)
#Visualize the Results
import mrcnn.visualize
CLASS_NAMES = ['BG', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
r = r[0]
mrcnn.visualize.display_instances(image=image, 
                                boxes=r['rois'], 
                                masks=r['masks'], 
                                class_ids=r['class_ids'], 
                                class_names=CLASS_NAMES, 
                                scores=r['scores'])
training

The mrcnn.utils.Dataset class has a number of useful methods which include:

add_class(): Adds a new class.
add_image(): Adds a new image to the dataset.
image_reference(): The reference (e.g. path or link) by which the image is retrieved.
prepare(): After adding all the classes and images to the dataset, this method prepares the dataset for use.
source_image_link(): Returns the path or link of the image.
load_image(): Reads and return an image.
load_mask(): Loads the masks for the objects in an image; this must be overridden for a custom dataset.

To train a Mask R-CNN model on a custom dataset, the Dataset class must be subclassed and its loading methods overridden, e.g. for the kangaroo dataset:

import os
import xml.etree.ElementTree
from numpy import zeros, asarray

import mrcnn.utils
class KangarooDataset(mrcnn.utils.Dataset):
    
    def load_dataset(self, dataset_dir, is_train=True):
        self.add_class("dataset", 1, "kangaroo")

        images_dir = dataset_dir + '/images/'
        annotations_dir = dataset_dir + '/annots/'

        #iterate over all images in the dataset directory
        for filename in os.listdir(images_dir):
            #the image id is the filename without its extension
            image_id = filename[:-4]

            #skip a problematic image
            if image_id in ['00090']:
                continue

            #images with id below 150 are used for training, the rest for validation
            if is_train and int(image_id) >= 150:
                continue

            if not is_train and int(image_id) < 150:
                continue

            img_path = images_dir + filename
            ann_path = annotations_dir + image_id + '.xml'

            self.add_image('dataset', image_id=image_id, path=img_path, annotation=ann_path)

    #extract bounding boxes and image size from a PASCAL VOC style annotation file
    def extract_boxes(self, filename):
        tree = xml.etree.ElementTree.parse(filename)
        root = tree.getroot()

        boxes = list()
        for box in root.findall('.//bndbox'):
            xmin = int(box.find('xmin').text)
            ymin = int(box.find('ymin').text)
            xmax = int(box.find('xmax').text)
            ymax = int(box.find('ymax').text)
            coors = [xmin, ymin, xmax, ymax]
            boxes.append(coors)

        width = int(root.find('.//size/width').text)
        height = int(root.find('.//size/height').text)
        return boxes, width, height

    #build the masks for an image: one binary mask per bounding box
    def load_mask(self, image_id):
        info = self.image_info[image_id]
        path = info['annotation']
        boxes, w, h = self.extract_boxes(path)
        masks = zeros([h, w, len(boxes)], dtype='uint8')

        class_ids = list()
        for i in range(len(boxes)):
            box = boxes[i]
            row_s, row_e = box[1], box[3]
            col_s, col_e = box[0], box[2]
            masks[row_s:row_e, col_s:col_e, i] = 1
            class_ids.append(self.class_names.index('kangaroo'))
        return masks, asarray(class_ids, dtype='int32')

#load data
train_set = KangarooDataset()
train_set.load_dataset(dataset_dir=r'D:\kangaroo', is_train=True)
train_set.prepare()
valid_dataset = KangarooDataset()
valid_dataset.load_dataset(dataset_dir=r'D:\kangaroo', is_train=False)
valid_dataset.prepare()
#config model
import mrcnn.config
class KangarooConfig(mrcnn.config.Config):
    NAME = "kangaroo_cfg"
    GPU_COUNT = 1
    IMAGES_PER_GPU = 1
    #kangaroo class plus background
    NUM_CLASSES = 2
    #number of training steps per epoch
    STEPS_PER_EPOCH = 131
#train model
import mrcnn.model
model = mrcnn.model.MaskRCNN(mode='training', 
                             model_dir='./', 
                             config=KangarooConfig())
#start from COCO weights, excluding the output layers that depend on the number of classes
model.load_weights(filepath='mask_rcnn_coco.h5', 
                   by_name=True, 
                   exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",  "mrcnn_bbox", "mrcnn_mask"])
#fine-tune only the head layers on the kangaroo dataset
model.train(train_dataset=train_set, 
            val_dataset=valid_dataset, 
            learning_rate=KangarooConfig().LEARNING_RATE, 
            epochs=10, 
            layers='heads')
model_path = 'Kangaroo_mask_rcnn.h5'
model.keras_model.save_weights(model_path)
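
To verify the trained model, a hedged sketch of evaluation using the utilities shipped with the matterport repo (mrcnn.model.load_image_gt and mrcnn.utils.compute_ap), reloading the saved weights in inference mode:

#evaluate mean average precision (mAP) on the validation set
from numpy import mean
import mrcnn.model
import mrcnn.utils

infer_model = mrcnn.model.MaskRCNN(mode='inference',
                                   model_dir='./',
                                   config=KangarooConfig())
infer_model.load_weights(filepath='Kangaroo_mask_rcnn.h5', by_name=True)

APs = []
for image_id in valid_dataset.image_ids:
    #load image and ground truth boxes/masks
    image, image_meta, gt_class_id, gt_bbox, gt_mask = mrcnn.model.load_image_gt(
        valid_dataset, KangarooConfig(), image_id, use_mini_mask=False)
    #run detection and compare against ground truth
    r = infer_model.detect([image], verbose=0)[0]
    AP, precisions, recalls, overlaps = mrcnn.utils.compute_ap(
        gt_bbox, gt_class_id, gt_mask,
        r['rois'], r['class_ids'], r['scores'], r['masks'])
    APs.append(AP)
print("mAP:", mean(APs))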

library

Detectron2

Detectron2 is an open-source library for object detection and segmentation created by Facebook AI Research. It implements common detection algorithms (Faster R-CNN, Mask R-CNN, and RetinaNet) for the following tasks:

Object Detection
Instance Segmentation
Keypoint Detection
Panoptic Segmentation

installation

# install dependencies: (use cu101 because colab has CUDA 10.1)
!pip install cython pyyaml==5.1

# install detectron2:
!pip install detectron2==0.1.3 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.5/index.html
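
A quick sanity check (optional sketch) that the installed torch and CUDA versions match the cu101/torch1.5 wheel chosen above:

# verify the detectron2 / torch / CUDA combination
import torch, torchvision
import detectron2

print("torch:", torch.__version__, "| CUDA:", torch.version.cuda)
print("torchvision:", torchvision.__version__)
print("detectron2:", detectron2.__version__)
print("GPU available:", torch.cuda.is_available())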

usage

1) import dependencies

# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import numpy as np
import cv2
import random
from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

2) read video

%%time
!rm -r frames/*
!mkdir frames/

#specify path to video
video = "sample.mp4"

#capture video
cap = cv2.VideoCapture(video)
cnt=0

# Check if video file is opened successfully
if (cap.isOpened() == False): 
    print("Error opening video stream or file")

ret, first_frame = cap.read()

#Read until video is completed
while(cap.isOpened()):

    # Capture frame-by-frame
    ret, frame = cap.read()

    if ret == True:

        #save each frame to folder        
        cv2.imwrite('frames/'+str(cnt)+'.png', frame)
        cnt = cnt + 1
        #stop after 750 frames
        if(cnt == 750):
            break

    # Break the loop
    else: 
        break

#check frame rate of a video
FPS=cap.get(cv2.CAP_PROP_FPS)
print(FPS)

3) download pretrained model

cfg = get_cfg()

# add project-specific config (e.g., TensorMask) here if you're not running a model in detectron2's core library
cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_C4_3x.yaml"))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.9  # set threshold for this model

# Find a model from detectron2's model zoo. You can use the https://dl.fbaipublicfiles... url as well
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-Detection/faster_rcnn_R_50_C4_3x.yaml")
predictor = DefaultPredictor(cfg)

4) read and predict image

#read an image
img = cv2.imread("frames/30.png")

#pass to the model
outputs = predictor(img)
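
The returned outputs is a dictionary whose "instances" entry is a detectron2 Instances object; a short inspection sketch of the fields used in the following steps:

# sketch: inspect the detected boxes, class ids and scores
instances = outputs["instances"].to("cpu")
print(len(instances))                          # number of detections
print(instances.pred_boxes.tensor.numpy())     # (N, 4) boxes as (x1, y1, x2, y2)
print(instances.pred_classes.numpy())          # (N,) COCO class ids, 0 == person
print(instances.scores.numpy())                # (N,) confidence scores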

5) visualize detected objects

# Use `Visualizer` to draw the predictions on the image.
v = Visualizer(img[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), scale=1.2)
v = v.draw_instance_predictions(outputs["instances"].to("cpu"))
cv2_imshow(v.get_image()[:, :, ::-1])

6) get bounding box of people

#extract predicted classes and bounding boxes from the model outputs
classes = outputs['instances'].pred_classes.cpu().numpy()
bbox = outputs['instances'].pred_boxes.tensor.cpu().numpy()

#identify only persons (class id 0 in COCO)
ind = np.where(classes==0)[0]

#bounding boxes of persons only
person = bbox[ind]

#total no. of persons
num = len(person)

7) compute bottom center of bounding box and distance

#define a function which return the bottom center of every bbox
def mid_point(img,person,idx):
    #get the coordinates (cast to int for OpenCV drawing)
    x1, y1, x2, y2 = [int(v) for v in person[idx]]
    _ = cv2.rectangle(img, (x1, y1), (x2, y2), (0,0,255), 2)
    
    #compute bottom center of bbox
    x_mid = int((x1+x2)/2)
    y_mid = int(y2)
    mid   = (x_mid,y_mid)
    
    _ = cv2.circle(img, mid, 5, (0, 0, 255), -1)
    cv2.putText(img, str(idx), mid, cv2.FONT_HERSHEY_SIMPLEX,1, (255, 255, 255), 2, cv2.LINE_AA)
    return mid

%%time
from scipy.spatial import distance

#compute pairwise euclidean distances between all bottom-center points
def compute_distance(midpoints, num):
    dist = np.zeros((num, num))
    for i in range(num):
        for j in range(i+1, num):
            if i != j:
                dst = distance.euclidean(midpoints[i], midpoints[j])
                dist[i][j] = dst
    return dist

%%time
#find pairs of people whose distance is below the threshold
def find_closest(dist, num, thresh):
    p1 = []
    p2 = []
    d = []
    for i in range(num):
        for j in range(i, num):
            if (i != j) & (dist[i][j] <= thresh):
                p1.append(i)
                p2.append(j)
                d.append(dist[i][j])
    return p1, p2, d

#mark everyone involved in a too-close pair
def change_2_red(img, person, p1, p2):
    risky = np.unique(p1+p2)
    for i in risky:
        x1, y1, x2, y2 = [int(v) for v in person[i]]
        _ = cv2.rectangle(img, (x1, y1), (x2, y2), (255,0,0), 2)  
    return img

8) repeat for all frames

import os
import re

names=os.listdir('frames/')
names.sort(key=lambda f: int(re.sub(r'\D', '', f)))
def find_closest_people(name,thresh):
    img = cv2.imread('frames/'+name)
    outputs = predictor(img)
    classes=outputs['instances'].pred_classes.cpu().numpy()
    bbox=outputs['instances'].pred_boxes.tensor.cpu().numpy()
    ind = np.where(classes==0)[0]
    person=bbox[ind]
    midpoints = [mid_point(img,person,i) for i in range(len(person))]
    num = len(midpoints)
    dist= compute_distance(midpoints,num)
    p1,p2,d=find_closest(dist,num,thresh)
    img = change_2_red(img,person,p1,p2)
    cv2.imwrite('frames/'+name,img)
    return 0

from tqdm import tqdm
thresh=100
_ = [find_closest_people(names[i],thresh) for i in tqdm(range(len(names))) ]

%%time
frames = os.listdir('frames/')
frames.sort(key=lambda f: int(re.sub(r'\D', '', f)))

frame_array=[]
for i in range(len(frames)):
    #read each frame file
    img = cv2.imread('frames/'+frames[i])
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)

    height, width, layers = img.shape
    size = (width,height)
    
    #inserting the frames into an image array
    frame_array.append(img)

out = cv2.VideoWriter('sample_output.mp4',cv2.VideoWriter_fourcc(*'DIVX'), 25, size)

for i in range(len(frame_array)):
    # write each frame to the output video
    out.write(frame_array[i])
out.release()