Visual Servoing Platform version 3.6.0
Loading...
Searching...
No Matches
tutorial-dnn-tensorrt-live.cpp
1// This tutorial uses NVIDIA TensorRT inference framework to perform object detection.
2// The object detection model is provided as a `.onnx` file. The model will be parsed
3// and a GPU Inference Engine (GIE) will be created if it doesn't exist. This GIE is
4// specific to the platform you're using.
5//
6// This tutorial was tested on NVIDIA Jetson TX2.
7//
8// The object detection model used is `SSD_Mobilenet V1` (Single Shot MultiBox Detector)
9// pre-trained on PASCAL VOC dataset. It can detect 20 classes.
10// For more information about the model, see this link:
11//
12// https://github.com/qfgaohao/pytorch-ssd
13//
14
15#include <iostream>
16#include <visp3/core/vpConfig.h>
17
18#if defined(VISP_HAVE_TENSORRT) && defined(VISP_HAVE_OPENCV)
19
20#if defined(HAVE_OPENCV_CUDEV) && defined(HAVE_OPENCV_CUDAWARPING) && defined(HAVE_OPENCV_CUDAARITHM) && \
21 defined(HAVE_OPENCV_DNN) && defined(HAVE_OPENCV_VIDEOIO)
22#include <visp3/core/vpImageConvert.h>
23#include <visp3/core/vpIoTools.h>
24#include <visp3/gui/vpDisplayX.h>
25
26#include <opencv2/videoio.hpp>
27
29#include <opencv2/core/cuda.hpp>
30#include <opencv2/cudaarithm.hpp>
31#include <opencv2/cudawarping.hpp>
32#include <opencv2/dnn.hpp>
34
36#include <cuda_runtime_api.h>
38
40#include <NvInfer.h>
41#include <NvOnnxParser.h>
43
44#include <sys/stat.h>
45
47void preprocessImage(cv::Mat &img, float *gpu_input, const nvinfer1::Dims &dims, float meanR, float meanG, float meanB)
48{
49 if (img.empty()) {
50 std::cerr << "Image is empty." << std::endl;
51 return;
52 }
53
54 cv::cuda::GpuMat gpu_frame;
55 // Upload image to GPU
56 gpu_frame.upload(img);
57
58 // input_dims is in NxCxHxW format.
59 auto input_width = dims.d[3];
60 auto input_height = dims.d[2];
61 auto channels = dims.d[1];
62 auto input_size = cv::Size(input_width, input_height);
63
64 // Resize
65 cv::cuda::GpuMat resized;
66 cv::cuda::resize(gpu_frame, resized, input_size, 0, 0, cv::INTER_NEAREST);
67
68 // Normalize
69 cv::cuda::GpuMat flt_image;
70 resized.convertTo(flt_image, CV_32FC3);
71 cv::cuda::subtract(flt_image, cv::Scalar(meanR, meanG, meanB), flt_image, cv::noArray(), -1);
72 cv::cuda::divide(flt_image, cv::Scalar(127.5f, 127.5f, 127.5f), flt_image, 1, -1);
73
74 // To tensor
75 std::vector<cv::cuda::GpuMat> chw;
76 for (int i = 0; i < channels; ++i)
77 chw.emplace_back(cv::cuda::GpuMat(input_size, CV_32FC1, gpu_input + i * input_width * input_height));
78 cv::cuda::split(flt_image, chw);
79}
81
83size_t getSizeByDim(const nvinfer1::Dims &dims)
84{
85 size_t size = 1;
86 for (int i = 0; i < dims.nbDims; ++i)
87 size *= dims.d[i];
88 return size;
89}
91
93std::vector<cv::Rect> postprocessResults(std::vector<void *> buffers, const std::vector<nvinfer1::Dims> &output_dims,
94 int batch_size, int image_width, int image_height, float confThresh,
95 float nmsThresh, std::vector<int> &classIds)
96{
97 // private variables of vpDetectorDNN
98 std::vector<cv::Rect> m_boxes, m_boxesNMS;
99 std::vector<int> m_classIds;
100 std::vector<float> m_confidences;
101 std::vector<int> m_indices;
102
103 // copy results from GPU to CPU
104 std::vector<std::vector<float> > cpu_outputs;
105 for (size_t i = 0; i < output_dims.size(); i++) {
106 cpu_outputs.push_back(std::vector<float>(getSizeByDim(output_dims[i]) * batch_size));
107 cudaMemcpy(cpu_outputs[i].data(), (float *)buffers[1 + i], cpu_outputs[i].size() * sizeof(float),
108 cudaMemcpyDeviceToHost);
109 }
110
111 // post process
112 int N = output_dims[0].d[1], C = output_dims[0].d[2]; // (1 x N x C format); N: Number of output detection boxes
113 // (fixed in the model), C: Number of classes.
114 for (int i = 0; i < N; i++) // for all N (boxes)
115 {
116 uint32_t maxClass = 0;
117 float maxScore = -1000.0f;
118
119 for (int j = 1; j < C; j++) // ignore background (classId = 0).
120 {
121 const float score = cpu_outputs[0][i * C + j];
122
123 if (score < confThresh)
124 continue;
125
126 if (score > maxScore) {
127 maxScore = score;
128 maxClass = j;
129 }
130 }
131
132 if (maxScore > confThresh) {
133 int left = (int)(cpu_outputs[1][4 * i] * image_width);
134 int top = (int)(cpu_outputs[1][4 * i + 1] * image_height);
135 int right = (int)(cpu_outputs[1][4 * i + 2] * image_width);
136 int bottom = (int)(cpu_outputs[1][4 * i + 3] * image_height);
137 int width = right - left + 1;
138 int height = bottom - top + 1;
139
140 m_boxes.push_back(cv::Rect(left, top, width, height));
141 m_classIds.push_back(maxClass);
142 m_confidences.push_back(maxScore);
143 }
144 }
145
146 cv::dnn::NMSBoxes(m_boxes, m_confidences, confThresh, nmsThresh, m_indices);
147 m_boxesNMS.resize(m_indices.size());
148 for (size_t i = 0; i < m_indices.size(); ++i) {
149 int idx = m_indices[i];
150 m_boxesNMS[i] = m_boxes[idx];
151 }
152
153 classIds = m_classIds; // Returning detected objects class Ids.
154 return m_boxesNMS;
155}
157
158class Logger : public nvinfer1::ILogger
159{
160public:
161 void log(Severity severity, const char *msg) noexcept // override
162 {
163 if ((severity == Severity::kERROR) || (severity == Severity::kINTERNAL_ERROR) || (severity == Severity::kVERBOSE))
164 std::cout << msg << std::endl;
165 }
166} gLogger;
167
168// destroy TensoRT objects if something goes wrong
169struct TRTDestroy
170{
171 template <class T> void operator()(T *obj) const
172 {
173 if (obj)
174 obj->destroy();
175 }
176};
177
178template <class T> using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;
179
181bool parseOnnxModel(const std::string &model_path, TRTUniquePtr<nvinfer1::ICudaEngine> &engine,
182 TRTUniquePtr<nvinfer1::IExecutionContext> &context)
184{
185 // this section of code is from jetson-inference's `tensorNet`, to test if the GIE already exists.
186 char cache_prefix[FILENAME_MAX];
187 char cache_path[FILENAME_MAX];
188
189 snprintf(cache_prefix, FILENAME_MAX, "%s", model_path.c_str());
190 snprintf(cache_path, FILENAME_MAX, "%s.engine", cache_prefix);
191
192 std::cout << "attempting to open engine cache file " << cache_path << std::endl;
193
195 if (vpIoTools::checkFilename(cache_path)) {
196 char *engineStream = NULL;
197 size_t engineSize = 0;
198
199 // determine the file size of the engine
200 struct stat filestat;
201 stat(cache_path, &filestat);
202 engineSize = filestat.st_size;
203
204 // allocate memory to hold the engine
205 engineStream = (char *)malloc(engineSize);
206
207 // open the engine cache file from disk
208 FILE *cacheFile = NULL;
209 cacheFile = fopen(cache_path, "rb");
210
211 // read the serialized engine into memory
212 const size_t bytesRead = fread(engineStream, 1, engineSize, cacheFile);
213
214 if (bytesRead != engineSize) // Problem while deserializing.
215 {
216 std::cerr << "Error reading serialized engine into memory." << std::endl;
217 return false;
218 }
219
220 // close the plan cache
221 fclose(cacheFile);
222
223 // Recreate the inference runtime
224 TRTUniquePtr<nvinfer1::IRuntime> infer { nvinfer1::createInferRuntime(gLogger) };
225 engine.reset(infer->deserializeCudaEngine(engineStream, engineSize, NULL));
226 context.reset(engine->createExecutionContext());
227
228 return true;
229 }
231
233 else {
234 if (!vpIoTools::checkFilename(model_path)) {
235 std::cerr << "Could not parse ONNX model. File not found" << std::endl;
236 return false;
237 }
238
239 TRTUniquePtr<nvinfer1::IBuilder> builder { nvinfer1::createInferBuilder(gLogger) };
240 TRTUniquePtr<nvinfer1::INetworkDefinition> network {
241 builder->createNetworkV2(1U << (uint32_t)nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH) };
242 TRTUniquePtr<nvonnxparser::IParser> parser { nvonnxparser::createParser(*network, gLogger) };
243
244 // parse ONNX
245 if (!parser->parseFromFile(model_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO))) {
246 std::cerr << "ERROR: could not parse the model." << std::endl;
247 return false;
248 }
249
250 TRTUniquePtr<nvinfer1::IBuilderConfig> config { builder->createBuilderConfig() };
251 // allow TRT to use up to 1GB of GPU memory for tactic selection
252 config->setMaxWorkspaceSize(32 << 20);
253 // use FP16 mode if possible
254 if (builder->platformHasFastFp16()) {
255 config->setFlag(nvinfer1::BuilderFlag::kFP16);
256 }
257
258 builder->setMaxBatchSize(1);
259
260 engine.reset(builder->buildEngineWithConfig(*network, *config));
261 context.reset(engine->createExecutionContext());
262
263 TRTUniquePtr<nvinfer1::IHostMemory> serMem { engine->serialize() };
264
265 if (!serMem) {
266 std::cout << "Failed to serialize CUDA engine." << std::endl;
267 return false;
268 }
269
270 const char *serData = (char *)serMem->data();
271 const size_t serSize = serMem->size();
272
273 // allocate memory to store the bitstream
274 char *engineMemory = (char *)malloc(serSize);
275
276 if (!engineMemory) {
277 std::cout << "Failed to allocate memory to store CUDA engine." << std::endl;
278 return false;
279 }
280
281 memcpy(engineMemory, serData, serSize);
282
283 // write the cache file
284 FILE *cacheFile = NULL;
285 cacheFile = fopen(cache_path, "wb");
286
287 fwrite(engineMemory, 1, serSize, cacheFile);
288 fclose(cacheFile);
289
290 return true;
291 }
293}
294
295int main(int argc, char **argv)
296{
297 int opt_device = 0;
298 unsigned int opt_scale = 1;
299 std::string input = "";
300 std::string modelFile = vpIoTools::getViSPImagesDataPath() + "/dnn/object_detection/ssd-mobilenet.onnx";
301 std::string labelFile = vpIoTools::getViSPImagesDataPath() + "/dnn/object_detection/pascal-voc-labels.txt";
302 std::string config = "";
303 float meanR = 127.5f, meanG = 127.5f, meanB = 127.5f;
304 float confThresh = 0.5f;
305 float nmsThresh = 0.4f;
306
307 for (int i = 1; i < argc; i++) {
308 if (std::string(argv[i]) == "--device" && i + 1 < argc) {
309 opt_device = atoi(argv[i + 1]);
310 }
311 else if (std::string(argv[i]) == "--input" && i + 1 < argc) {
312 input = std::string(argv[i + 1]);
313 }
314 else if (std::string(argv[i]) == "--model" && i + 1 < argc) {
315 modelFile = std::string(argv[i + 1]);
316 }
317 else if (std::string(argv[i]) == "--config" && i + 1 < argc) {
318 config = std::string(argv[i + 1]);
319 }
320 else if (std::string(argv[i]) == "--input-scale" && i + 1 < argc) {
321 opt_scale = atoi(argv[i + 1]);
322 }
323 else if (std::string(argv[i]) == "--mean" && i + 3 < argc) {
324 meanR = atof(argv[i + 1]);
325 meanG = atof(argv[i + 2]);
326 meanB = atof(argv[i + 3]);
327 }
328 else if (std::string(argv[i]) == "--confThresh" && i + 1 < argc) {
329 confThresh = (float)atof(argv[i + 1]);
330 }
331 else if (std::string(argv[i]) == "--nmsThresh" && i + 1 < argc) {
332 nmsThresh = (float)atof(argv[i + 1]);
333 }
334 else if (std::string(argv[i]) == "--labels" && i + 1 < argc) {
335 labelFile = std::string(argv[i + 1]);
336 }
337 else if (std::string(argv[i]) == "--help" || std::string(argv[i]) == "-h") {
338 std::cout << argv[0]
339 << " [--device <camera device number>] [--input <path to image or video>"
340 " (camera is used if input is empty)] [--model <path to net trained weights>]"
341 " [--config <path to net config file>]"
342 " [--input-scale <input scale factor>] [--mean <meanR meanG meanB>]"
343 " [--confThresh <confidence threshold>]"
344 " [--nmsThresh <NMS threshold>] [--labels <path to label file>]"
345 << std::endl;
346 return EXIT_SUCCESS;
347 }
348 }
349
350 std::string model_path(modelFile);
351 int batch_size = 1;
352
353 std::vector<std::string> labels;
354 if (!labelFile.empty()) {
355 std::ifstream f_label(labelFile);
356 std::string line;
357 while (std::getline(f_label, line)) {
358 labels.push_back(line);
359 }
360 }
361
363 // Parse the model and initialize the engine and the context.
364 TRTUniquePtr<nvinfer1::ICudaEngine> engine { nullptr };
365 TRTUniquePtr<nvinfer1::IExecutionContext> context { nullptr };
366 if (!parseOnnxModel(model_path, engine, context)) // Problem parsing Onnx model
367 {
368 std::cout << "Make sure the model file exists. To see available models, plese visit: "
369 "\n\twww.github.com/lagadic/visp-images/dnn/object_detection/"
370 << std::endl;
371 return EXIT_FAILURE;
372 }
374
375 std::vector<nvinfer1::Dims> input_dims;
376 std::vector<nvinfer1::Dims> output_dims;
377 std::vector<void *> buffers(engine->getNbBindings()); // buffers for input and output data.
378
380 for (int i = 0; i < engine->getNbBindings(); ++i) {
381 auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
382 cudaMalloc(&buffers[i], binding_size);
383
384 if (engine->bindingIsInput(i)) {
385 input_dims.emplace_back(engine->getBindingDimensions(i));
386 }
387 else {
388 output_dims.emplace_back(engine->getBindingDimensions(i));
389 }
390 }
391
392 if (input_dims.empty() || output_dims.empty()) {
393 std::cerr << "Expect at least one input and one output for network" << std::endl;
394 return EXIT_FAILURE;
395 }
397
399 cv::VideoCapture capture;
400
401 if (input.empty()) {
402 capture.open(opt_device);
403 }
404 else {
405 capture.open(input);
406 }
407
408 if (!capture.isOpened()) { // check if we succeeded
409 std::cout << "Failed to open the camera" << std::endl;
410 return EXIT_FAILURE;
411 }
412
413 int cap_width = (int)capture.get(cv::CAP_PROP_FRAME_WIDTH);
414 int cap_height = (int)capture.get(cv::CAP_PROP_FRAME_HEIGHT);
415 capture.set(cv::CAP_PROP_FRAME_WIDTH, cap_width / opt_scale);
416 capture.set(cv::CAP_PROP_FRAME_HEIGHT, cap_height / opt_scale);
418
420 cv::Mat frame;
421 capture >> frame;
422
423 if (input.empty()) {
424 int i = 0;
425 while ((i++ < 20) && !capture.read(frame)) {
426 }; // warm up camera by skiping unread frames
427 }
428
429 vpImageConvert::convert(frame, I);
430 int height = I.getHeight(), width = I.getWidth();
431
432 std::cout << "Image size: " << width << " x " << height << std::endl;
433
434 std::vector<cv::Rect> boxesNMS;
435 std::vector<int> classIds;
436
437 vpDisplayX d(I);
438
439 double start, stop;
441 while (!vpDisplay::getClick(I, false)) {
442 // get frame.
443 capture >> frame;
444
445 vpImageConvert::convert(frame, I);
446
447 start = vpTime::measureTimeMs();
448 // preprocess
449 preprocessImage(frame, (float *)buffers[0], input_dims[0], meanR, meanG, meanB);
450
451 // inference.
452 context->enqueue(batch_size, buffers.data(), 0, nullptr);
453
454 // post-process
455 boxesNMS = postprocessResults(buffers, output_dims, batch_size, width, height, confThresh, nmsThresh, classIds);
456
457 stop = vpTime::measureTimeMs();
458 // display.
460 vpDisplay::displayText(I, 10, 10, std::to_string(stop - start), vpColor::red);
461
462 for (unsigned int i = 0; i < boxesNMS.size(); i++) {
463 vpDisplay::displayRectangle(I, vpRect(boxesNMS[i].x, boxesNMS[i].y, boxesNMS[i].width, boxesNMS[i].height),
464 vpColor::red, false, 2);
465 vpDisplay::displayText(I, boxesNMS[i].y - 10, boxesNMS[i].x, labels[classIds[i]], vpColor::red);
466 }
467
469 }
471
472 for (void *buf : buffers)
473 cudaFree(buf);
474
475 return EXIT_SUCCESS;
476}
477#else
478int main()
479{
480 std::cout << "OpenCV is not built with CUDA." << std::endl;
481
482 return EXIT_SUCCESS;
483}
484#endif
485
486#else
487int main()
488{
489 std::cout << "ViSP is not built with TensorRT." << std::endl;
490
491 return EXIT_SUCCESS;
492}
493#endif
static const vpColor red
Definition vpColor.h:211
Use the X11 console to display images on unix-like OS. Thus to enable this class X11 should be instal...
Definition vpDisplayX.h:132
static bool getClick(const vpImage< unsigned char > &I, bool blocking=true)
static void display(const vpImage< unsigned char > &I)
static void flush(const vpImage< unsigned char > &I)
static void displayRectangle(const vpImage< unsigned char > &I, const vpImagePoint &topLeft, unsigned int width, unsigned int height, const vpColor &color, bool fill=false, unsigned int thickness=1)
static void displayText(const vpImage< unsigned char > &I, const vpImagePoint &ip, const std::string &s, const vpColor &color)
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
Definition of the vpImage class member functions.
Definition vpImage.h:135
unsigned int getWidth() const
Definition vpImage.h:242
unsigned int getHeight() const
Definition vpImage.h:184
static std::string getViSPImagesDataPath()
static bool checkFilename(const std::string &filename)
Defines a rectangle in the plane.
Definition vpRect.h:76
VISP_EXPORT double measureTimeMs()