Person detection in video streams using Python, OpenCV and deep learning
Introduction
This tutorial is on detecting persons in videos using Python and deep learning.
After following the steps and executing the Python code below, you should get a video in which persons are tagged once recognized.
Neural networks trained for object recognition allow one to identify persons in pictures. Hence, we can decompose videos or live streams into frames and analyze each frame by turning it into a matrix of pixel values. This tutorial is part of a larger section on person recognition that covers 3 topics:
- Detecting persons in videos (this page)
- Tracking persons in videos
- Detecting and tracking persons in real-time (e.g. live streams, or a game)
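To make the idea of frames as matrices of pixel values concrete, here is a minimal sketch (not part of the tutorial's main script; run.mp4 is just the sample video used on this page) that reads a single frame with OpenCV and inspects it as a NumPy array:
import cv2

# open the sample video and grab its first frame
cap = cv2.VideoCapture("run.mp4")
ok, frame = cap.read()
cap.release()

if ok:
    # each frame is a NumPy array of shape (height, width, 3) holding BGR pixel values 0-255
    print(frame.shape, frame.dtype)
    print(frame[0, 0])  # the blue, green and red values of the top-left pixel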
Note: this is still work in progress - this guide and the code behind it are updated frequently as the code base develops.
If you have any questions or suggestions, please post them below the article in the comments section.
Executing the code on a video
The code below, when saved as a Python file (or in a Jupyter notebook), can be run as follows with a video argument that specifies the location of the video:
python file.py -v C:\run.mp4
The video can be downloaded from here: run.mp4 (right click and ‘save as’).
If no video is specified, the video stream from the webcam will be analyzed (still work in progress).
Necessary libraries
To get started, we first import the necessary Python libraries. VideoStream and FPS (from imutils) are used to capture and stream the video output and keep track of the number of frames processed per second. OpenCV (cv2) and NumPy turn each frame into a matrix of pixel values, which is in turn fed to the object recognition model.
from imutils.video import VideoStream
from imutils.video import FPS
from datetime import datetime
import argparse
import imutils
import time as time2
import cv2
import numpy as np
Arguments
Now we specify the arguments. The -v argument specifies the location of the video to analyze. In the arguments we can also specify a separate tracker parameter with -t, and a min-area parameter with -a, the minimum contour area that counts as motion (the smaller this area, the more contours are analyzed per frame, which lowers the frames per second - i.e. FPS - the machine can process).
ap = argparse.ArgumentParser()
ap.add_argument("-v", "--video", help="path to the video file")
ap.add_argument("-a", "--min-area", type=int, default=500, help="minimum area size")
ap.add_argument("-t", "--tracker", type=str, default="csrt", help="OpenCV object tracker type")
args = vars(ap.parse_args())
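For example, to analyze the sample video with the kcf tracker and a larger minimum contour area, the script could be called as follows (the values are only illustrative):
python file.py -v C:\run.mp4 -t kcf -a 1000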
We then determine which version of OpenCV is used, and we select the tracker. The csrt tracker performs quite well in most applications. We will use the tracker in section 2; for now, in section 1, we just focus on person recognition.
# extract the OpenCV version info
(major, minor) = cv2.__version__.split(".")[:2]

# if we are using OpenCV 3.2 or an earlier version, we can use a special factory
# function to create the entity that tracks objects
if int(major) == 3 and int(minor) < 3:
    tracker = cv2.Tracker_create(args["tracker"].upper())
    #tracker = cv2.TrackerGOTURN_create()

# otherwise, for OpenCV 3.3 or newer,
# we need to explicitly call the respective constructor that contains the tracker object:
else:
    # initialize a dictionary that maps strings to their corresponding
    # OpenCV object tracker implementations
    OPENCV_OBJECT_TRACKERS = {
        "csrt": cv2.TrackerCSRT_create,
        "kcf": cv2.TrackerKCF_create,
        "boosting": cv2.TrackerBoosting_create,
        "mil": cv2.TrackerMIL_create,
        "tld": cv2.TrackerTLD_create,
        "medianflow": cv2.TrackerMedianFlow_create,
        "mosse": cv2.TrackerMOSSE_create
    }

    # grab the appropriate object tracker using our dictionary of
    # OpenCV object tracker objects
    tracker = OPENCV_OBJECT_TRACKERS[args["tracker"]]()
    #tracker = cv2.TrackerGOTURN_create()

# if the video argument is None, then the code will read from webcam (work in progress)
if args.get("video", None) is None:
    vs = VideoStream(src=0).start()
    time2.sleep(2.0)
# otherwise, we are reading from a video file
else:
    vs = cv2.VideoCapture(args["video"])
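Note that newer OpenCV releases have reorganised these tracker constructors; this is an assumption you should verify against your own installation, but in OpenCV 4.5+ with opencv-contrib-python several of the older trackers are only available under cv2.legacy. A hedged, version-tolerant variant of the lookup could look like this:
def create_tracker(name):
    # CSRT and KCF are usually still in the main cv2 namespace;
    # MOSSE, MedianFlow and similar trackers may only exist under cv2.legacy in OpenCV 4.5+
    legacy = getattr(cv2, "legacy", cv2)
    factories = {
        "csrt": getattr(cv2, "TrackerCSRT_create", None),
        "kcf": getattr(cv2, "TrackerKCF_create", None),
        "mosse": getattr(legacy, "TrackerMOSSE_create", None),
        "medianflow": getattr(legacy, "TrackerMedianFlow_create", None),
    }
    factory = factories.get(name)
    if factory is None:
        raise ValueError("tracker '{}' is not available in this OpenCV build".format(name))
    return factory()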
Looping over and analyzing video frames
Now that we have selected the video and an appropriate tracker, we initialize the first frame of the video and loop over the remaining frames using a while loop. The program ends once the final frame of the video has been processed.
Each frame is resized to the width specified below (500 pixels in this case) and then converted to grayscale. Both of these steps reduce the burden on the CPU and GPU and increase the number of frames processed per second. The thresholded difference image is also dilated, which fills in gaps and makes (person) contours stand out more clearly.
# loop over the frames of the video, and store corresponding information from each frame
firstFrame = None
initBB2 = None
fps = None
differ = None
now = ''
framecounter = 0
trackeron = 0

while True:
    frame = vs.read()
    frame = frame if args.get("video", None) is None else frame[1]
    # if the frame can not be grabbed, then we have reached the end of the video
    if frame is None:
        break

    # resize the frame to 500
    frame = imutils.resize(frame, width=500)
    framecounter = framecounter + 1
    if framecounter > 1:
        (H, W) = frame.shape[:2]
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        gray = cv2.GaussianBlur(gray, (21, 21), 0)

        # if the first frame is None, initialize it
        if firstFrame is None:
            firstFrame = gray
            continue

        # compute the absolute difference between the current frame and first frame
        frameDelta = cv2.absdiff(firstFrame, gray)
        thresh = cv2.threshold(frameDelta, 25, 255, cv2.THRESH_BINARY)[1]

        # dilate the thresholded image to fill in holes, then find contours on thresholded image
        thresh = cv2.dilate(thresh, None, iterations=2)
        cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cnts = cnts[0] if imutils.is_cv2() else cnts[1]

        # loop over the contours identified
        contourcount = 0
        for c in cnts:
            contourcount = contourcount + 1

            # if the contour is too small, ignore it
            if cv2.contourArea(c) < args["min_area"]:
                continue

            # compute the bounding box for the contour, draw it on the frame
            (x, y, w, h) = cv2.boundingRect(c)
            initBB2 = (x, y, w, h)
Pre-trained neural network models to identify persons
- Note: the code below integrates a neural network to identify persons, but this section can also be commented out (the code from here until the end of the neural network integration). In that case the program will still mark moving objects, but it will not check whether they are persons.
The code so far identifies moving objects, captured in the contours above. These can be all sorts of objects, from trucks to persons to airplanes. All of these can be identified with a certain confidence level by including the Python code on neural networks below.
We now want to make sure the objects identified are actually persons. To do so, we can use machine learning and integrate pre-trained models - neural networks trained to recognize persons, which are key to object recognition.
The fastest models for this at the time of writing are MobileNet (MobileNetSSD Caffe) models, which can handle more than 30 frames per second. They can be used to analyze persons in live video streams, for example live feeds from another program (e.g. a live stream from a webcam, or a video running in the background). If you're interested in building a system to utilize such models, you might find Building your own deep learning machine in 2023: some reflections particularly useful.
They need to be downloaded from:
https://github.com/chuanqi305/MobileNet-SSD/blob/master/voc/MobileNetSSD_deploy.prototxt
https://github.com/chuanqi305/MobileNet-SSD/blob/master/mobilenet_iter_73000.caffemodel
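If you prefer to fetch the files from a script rather than via the browser, a minimal sketch along the following lines should work. It assumes the raw.githubusercontent.com mirrors of the two files linked above, and saves the weights under the file name MobileNetSSD_deploy.caffemodel that the loading code below expects:
import urllib.request

# raw download URLs corresponding to the GitHub pages linked above (assumption)
PROTOTXT_URL = "https://raw.githubusercontent.com/chuanqi305/MobileNet-SSD/master/voc/MobileNetSSD_deploy.prototxt"
WEIGHTS_URL = "https://raw.githubusercontent.com/chuanqi305/MobileNet-SSD/master/mobilenet_iter_73000.caffemodel"

# save the weights under the file name used when loading the model below
urllib.request.urlretrieve(PROTOTXT_URL, "MobileNetSSD_deploy.prototxt")
urllib.request.urlretrieve(WEIGHTS_URL, "MobileNetSSD_deploy.caffemodel")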
They then need to be loaded as follows, using the Caffe model support in OpenCV's dnn module:
prott1 = r'C:\Downloads\MobileNetSSD_deploy.prototxt'
prott2 = r'C:\Downloads\MobileNetSSD_deploy.caffemodel'
net = cv2.dnn.readNetFromCaffe(prott1, prott2)
The next step is to select the classes to identify objects. These can be aeroplanes, sheep, sofas, trains, and so on. As we are interested in persons, we set this list to person, and we specify colors to identify the class. Note here that these models have been pre-trained on all the classes mentioned above, and more. Hence they have to be downloaded as one and the same model (there's no 'person-only' pre-trained model), which explains the large file size.
= ["person"]
CLASSES = np.random.uniform(0, 255, size=(len(CLASSES), 3)) COLORS
Now we feed the captured contour (the moving part of the frame) to the neural network, which returns a confidence score indicating how likely it is that this is a person:
trackbox = frame[y:y+h, x:x+w]
trackbox = cv2.resize(trackbox, (224, 224))
cv2.imshow('image', trackbox)
blob = cv2.dnn.blobFromImage(cv2.resize(trackbox, (300, 300)), 0.007843, (300, 300), 127.5)
net.setInput(blob)
detections = net.forward()
Now we loop over the detections - i.e. the predictions for each contour as to what object it represents. If we are confident enough that the contour is a person, we proceed and display the prediction on screen in the frame, as follows:
for i in np.arange(0, detections.shape[2]):
    confidence = detections[0, 0, i, 2]
    confidence_level = 0.7

    if confidence > confidence_level:
        # extract the index of the class label from the `detections`,
        # then compute the (x, y)-coordinates of the bounding box for the object
        idx = int(detections[0, 0, i, 1])
        box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
        (startX, startY, endX, endY) = box.astype("int")

        # draw the prediction on the frame
        label = "{}: {:.2f}%".format(CLASSES[idx], confidence * 100)
        cv2.rectangle(frame, (startX, startY), (endX, endY), COLORS[idx], 2)
        y = startY - 15 if startY - 15 > 15 else startY + 15
        cv2.putText(frame, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2)
This is the end of the neural network integration. For those interested in further expanding their knowledge on similar topics, you might find this introduction to object recognition useful as it provides an overview of how objects can be identified within images, potentially complementing the neural network’s functions. The rest of the code deals with drawing the object in the frame and finishing the calculations for the frame.
# the code below continues inside the contour loop: draw the contour's bounding box and start the tracker
cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 255, 0), 2)

# Start tracker
now = datetime.now()
if differ == None or differ > 9:
    tracker.init(frame, initBB2)
    fps = FPS().start()

# check to see if we are currently tracking an object, if so, ignore other boxes
# this code is relevant if we want to identify particular persons (section 2 of this tutorial)
if initBB2 is not None:
    # grab the new bounding box coordinates of the object
    (success, box) = tracker.update(frame)

    # check to see if the tracking was a success
    differ = 10
    if success:
        (x, y, w, h) = [int(v) for v in box]
        cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        differ = abs(initBB2[0] - box[0]) + abs(initBB2[1] - box[1])
        i = tracker.update(lastframe)
        if i[0] != True:
            time2.sleep(4000)
    else:
        trackeron = 1

    # update the FPS counter
    fps.update()
    fps.stop()

    # initialize the set of information we'll be displaying on the frame
    info = [
        ("Success", "Yes" if success else "No"),
        ("FPS", "{:.2f}".format(fps.fps())),
    ]

    # loop over the info tuples and draw them on our frame
    for (i, (k, v)) in enumerate(info):
        text = "{}: {}".format(k, v)
        cv2.putText(frame, text, (10, H - ((i * 20) + 20)), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

    # draw the text and timestamp on the frame
    now2 = datetime.now()
    time_passed_seconds = str((now2 - now).seconds)
    cv2.putText(frame, 'Detecting persons', (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

# show the frame and record if the user presses a key (still inside the while loop)
cv2.imshow("Video stream", frame)
key = cv2.waitKey(1) & 0xFF

# if the `q` key is pressed, break from the loop
if key == ord("q"):
    break
# if the `d` key is pressed, reset the reference frame
if key == ord("d"):
    firstFrame = None
# remember the current frame for the next tracker update
lastframe = frame

# finally, once the loop has finished, stop the camera/stream and close any open windows
vs.stop() if args.get("video", None) is None else vs.release()
cv2.destroyAllWindows()
Conclusion
If you’ve gone through the code and saved it, you can run it as follows on a video:
python file.py -v C:\run.mp4
The code will start tagging persons that it identifies in the video. This is a first step in object recognition in Python. You can now use the information on the entities tagged for further analysis. For instance, you can store their properties in a database.
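As a minimal sketch of that idea (the table layout and the store_detection helper are hypothetical, not part of the tutorial's code), you could store the label, confidence and bounding box of each tagged person in a SQLite database:
import sqlite3
from datetime import datetime

conn = sqlite3.connect("detections.db")
conn.execute("""CREATE TABLE IF NOT EXISTS detections
                (timestamp TEXT, label TEXT, confidence REAL,
                 x INTEGER, y INTEGER, w INTEGER, h INTEGER)""")

def store_detection(label, confidence, x, y, w, h):
    # call this inside the detection loop whenever a person is tagged
    conn.execute("INSERT INTO detections VALUES (?, ?, ?, ?, ?, ?, ?)",
                 (datetime.now().isoformat(), label, float(confidence),
                  int(x), int(y), int(w), int(h)))
    conn.commit()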
The next section on person tracking in videos using Python will elaborate on how you can track persons that you’ve tagged in a video, using neural networks and deep learning techniques similar to the ones used in this tutorial.