You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
230 lines
9.3 KiB
230 lines
9.3 KiB
3 months ago
|
// License: Apache 2.0. See LICENSE file in root directory.
|
||
|
// Copyright(c) 2019 Intel Corporation. All Rights Reserved.
|
||
|
|
||
|
|
||
|
#include <rs-vino/object-detection.h>
|
||
|
#include <rs-vino/openvino-helpers.h>
|
||
|
#include <rsutils/easylogging/easyloggingpp.h>
|
||
|
|
||
|
using namespace InferenceEngine;
|
||
|
|
||
|
|
||
|
// Number of float values describing one detection in the network's output blob:
// [image_id, label, conf, x_min, y_min, x_max, y_max]
const size_t DETECTED_OBJECT_SIZE = 7;   // the size of each detected object
|
||
|
|
||
|
|
||
|
namespace openvino_helpers
|
||
|
{
|
||
|
// Constructs an object detector on top of base_detection.
//
//   pathToModel          path to the network model; forwarded to base_detection
//   detectionThreshold   kept in _detection_threshold; fetch_results() only returns detections
//                        whose confidence is greater than this value
//   isAsync, maxBatch, isBatchDynamic, doRawOutputMessages
//                        forwarded to base_detection as-is
//
// The remaining members initialized here start at zero: _max_results is filled in by read_network(),
// while _width, _height and _n_enqued_frames are updated by enqueue().
object_detection::object_detection( const std::string & pathToModel,
                                    double detectionThreshold,
                                    bool isAsync,
                                    int maxBatch,
                                    bool isBatchDynamic,
                                    bool doRawOutputMessages )
    : base_detection( "object detection", pathToModel, maxBatch, isBatchDynamic, isAsync, doRawOutputMessages )
    , _detection_threshold( detectionThreshold )
    , _max_results( 0 )
    , _n_enqued_frames( 0 )
    , _width( 0 )
    , _height( 0 )
{
    // Everything is done in the initializer list
}
|
||
|
|
||
|
|
||
|
void object_detection::submit_request()
|
||
|
{
|
||
|
if( !_n_enqued_frames )
|
||
|
return;
|
||
|
_n_enqued_frames = 0;
|
||
|
base_detection::submit_request();
|
||
|
}
|
||
|
|
||
|
|
||
|
// Prepares `frame` for inference: copies it into the network's image input blob and, when the
// network has a second "im_info" input, fills that blob as well. Does nothing when the detector
// is not enabled().
//
// The frame's own dimensions are remembered in _width/_height; fetch_results() uses them to scale
// the detection coordinates.
void object_detection::enqueue( const cv::Mat & frame )
{
    if( ! enabled() )
        return;

    // The inference request is created lazily, on the first frame
    if( ! _request )
        _request = net.CreateInferRequestPtr();

    _width = static_cast< float >( frame.cols );
    _height = static_cast< float >( frame.rows );

    // Image data goes into the image input found by read_network()
    Blob::Ptr image_blob = _request->GetBlob( _input_layer_name );
    matU8ToBlob< uint8_t >( frame, image_blob );

    // The "im_info" input is optional: its name stays empty when the network does not have one
    if( ! _im_info_name.empty() )
    {
        Blob::Ptr im_info_blob = _request->GetBlob( _im_info_name );
        using fp32_type = PrecisionTrait< Precision::FP32 >::value_type;

        // (height, width, image_scale)
        // NOTE(review): read_network() fills _input_width from dims[2] and _input_height from dims[3]
        // of the image input, so slot 0 receives dims[2] - presumably the height for an NCHW layout,
        // despite the member name. Confirm before renaming either member.
        float * im_info = im_info_blob->buffer().as< fp32_type * >();
        im_info[0] = static_cast< float >( _input_width );
        im_info[1] = static_cast< float >( _input_height );

        // all scale factors are set to 1.0
        size_t idx = 2;
        while( idx < _im_info_size )
        {
            im_info[idx] = 1.f;
            ++idx;
        }
    }

    // Mark a frame as pending so that submit_request() will actually submit
    _n_enqued_frames = 1;
}
|
||
|
|
||
|
|
||
|
// Reads the model at pathToModel, validates that it looks like an object-detection network, and
// configures its inputs and output.
//
// On success the following members are set:
//   _input_layer_name, _input_width, _input_height   from the 4-dimensional (image) input
//   _im_info_name, _im_info_size                     from the optional 2-dimensional input
//   _output_layer_name, _max_results                 from the single output
//
// Returns the configured network.
//
// Throws std::logic_error when:
//   - the network does not have exactly one or two inputs
//   - the 2-dimensional input is not 1x3 or 1x6
//   - no 4-dimensional input is found
//   - the network does not have exactly one output
//   - the output layer fails the DetectionOutput checks below
//   - the output is not 4-dimensional, or its last dimension is not DETECTED_OBJECT_SIZE
CNNNetwork object_detection::read_network()
{
    LOG(INFO) << "Loading " << topoName << " model from: " << pathToModel;

    CNNNetwork network;

#ifdef OPENVINO2019
    CNNNetReader netReader;

    /** Read network model **/
    netReader.ReadNetwork( pathToModel );
    network = netReader.getNetwork();

    /** Extract model name and load its weights **/
    std::string binFileName = remove_ext( pathToModel ) + ".bin";
    netReader.ReadWeights( binFileName );
#else
    InferenceEngine::Core ie;
    /** Read network model **/
    network = ie.ReadNetwork( pathToModel );
#endif

    /** Set batch size **/
    //LOG(DEBUG) << "Batch size is set to " << maxBatch;
    network.setBatchSize( maxBatch );

    // We support networks with one or two inputs, though others may be possible...
    InputsDataMap inputInfo( network.getInputsInfo() );
    if( inputInfo.size() != 1 && inputInfo.size() != 2 )
        throw std::logic_error( "Object detection network should have only one or two inputs" );
    for( auto & item : inputInfo )
    {
        // Inputs are told apart by their rank: 4 for the image, 2 for "im_info"
        auto const & dims = item.second->getTensorDesc().getDims();
        if( dims.size() == 4 )
        {
            // Blob "data" (1x4) will contain the actual image data (e.g., 1,3,224,224 or 1,3,300,300)
            _input_layer_name = item.first;
            // NOTE(review): for an NCHW layout dims[2] would be the height and dims[3] the width, i.e.
            // the opposite of the member names. enqueue() writes _input_width into the "height" slot of
            // im_info, so the two appear to cancel out - confirm before changing either side.
            _input_width = dims[2];
            _input_height = dims[3];
            item.second->setPrecision( Precision::U8 );
        }
        else if( dims.size() == 2 )
        {
            // Blob "im_info" is optional: 1x3 (height, width, image_scale)
            _im_info_name = item.first;
            if( dims[0] != 1 )
                throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );
            _im_info_size = dims[1];
            item.second->setPrecision( Precision::FP32 );
            if( _im_info_size != 3 && _im_info_size != 6 )
                throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );
        }
    }
    if( _input_layer_name.empty() )
        throw std::logic_error( "Could not find input \"data\" layer in network" );

    // Only a single "DetectionOutput" layer is expected
    OutputsDataMap outputInfo( network.getOutputsInfo() );
    if( outputInfo.size() != 1 )
        throw std::logic_error(
            "Object detection network should have only one output" );
    _output_layer_name = outputInfo.begin()->first;
    DataPtr & outputDataPtr = outputInfo.begin()->second;

    // Checking if layer names are as expected
#ifdef OPENVINO2019
    const CNNLayerPtr outputLayer = network.getLayerByName( _output_layer_name.c_str() );
    if( outputLayer->type != "DetectionOutput" )
        throw std::logic_error(
            "Object detection network output layer(" + outputLayer->name +
            ") should be DetectionOutput, but was " + outputLayer->type );
    if( outputLayer->params.find( "num_classes" ) == outputLayer->params.end() )
        throw std::logic_error(
            "Object detection network output layer (" +
            _output_layer_name + ") should have num_classes integer attribute" );
#else
#ifdef OPENVINO_NGRAPH
    // Inference Engine integrates the nGraph Core in OpenVINO >= 2020.1
    if( auto ngraphFunction = network.getFunction() )
    {
        // Compare type names by content. If get_type_name() and type_info.name are both C strings,
        // a plain == between them would compare addresses rather than characters.
        const std::string detectionOutputType( ngraph::op::DetectionOutput::type_info.name );

        // NOTE(review): as written, this throws only when a DetectionOutput op exists whose friendly
        // name differs from the output's name; a network without any DetectionOutput op passes this
        // check. Confirm that this is the intended validation.
        for( const auto & out : outputInfo )
        {
            for( const auto & op : ngraphFunction->get_ops() )
            {
                if( detectionOutputType == op->get_type_name() )
                {
                    if( op->get_friendly_name() != out.second->getName() )
                    {
                        std::string output_name = out.first;
                        std::string output_type = op->get_type_name();
                        throw std::logic_error(
                            "Object detection network output layer (" + output_name +
                            ") should be DetectionOutput, but was " + output_type );
                    }
                }
            }
        }
    }
#endif
#endif

    /*
        Expect a blob of [1, 1, N, 7], where N is the number of detected bounding boxes.
        For each detection, the description has the format: [image_id, label, conf, x_min, y_min, x_max, y_max]
            image_id - ID of the image in the batch
            label - predicted class ID
            conf - confidence for the predicted class
            (x_min, y_min) - coordinates of the top left bounding box corner
            (x_max, y_max) - coordinates of the bottom right bounding box corner.
    */
    const SizeVector & outputDims = outputDataPtr->getTensorDesc().getDims();
    if( outputDims.size() != 4 )
        throw std::logic_error(
            "Object detection network output dimensions should be 4, but was " + std::to_string( outputDims.size() ) );
    size_t objectSize = outputDims[3];
    if( objectSize != DETECTED_OBJECT_SIZE )
        throw std::logic_error(
            "Object detection network output layer last dimension should be " +
            std::to_string( DETECTED_OBJECT_SIZE ) + "; got " + std::to_string( objectSize ) );
    // N, the upper bound on the number of detections fetch_results() will walk through
    _max_results = outputDims[2];
    outputDataPtr->setPrecision( Precision::FP32 );

    return network;
}
|
||
|
|
||
|
|
||
|
std::vector< object_detection::Result > object_detection::fetch_results()
|
||
|
{
|
||
|
std::vector< Result > results;
|
||
|
const float *detections = _request->GetBlob( _output_layer_name )->buffer().as<float *>();
|
||
|
|
||
|
for( size_t i = 0; i < _max_results; i++ )
|
||
|
{
|
||
|
float image_id = detections[i * DETECTED_OBJECT_SIZE + 0];
|
||
|
if( image_id < 0 )
|
||
|
break;
|
||
|
|
||
|
// [image_id, label, confidence, x_min, y_min, x_max, y_max]
|
||
|
Result r;
|
||
|
r.label = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 1]);
|
||
|
r.confidence = detections[i * DETECTED_OBJECT_SIZE + 2];
|
||
|
if( r.confidence <= _detection_threshold && !doRawOutputMessages )
|
||
|
continue;
|
||
|
r.location.x = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 3] * _width);
|
||
|
r.location.y = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 4] * _height);
|
||
|
r.location.width = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 5] * _width - r.location.x);
|
||
|
r.location.height = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 6] * _height - r.location.y);
|
||
|
|
||
|
if( doRawOutputMessages )
|
||
|
{
|
||
|
LOG(DEBUG)
|
||
|
<< "[" << i << "," << r.label << "] element, prob = " << r.confidence
|
||
|
<< " (" << r.location.x << "," << r.location.y << ")-(" << r.location.width << ","
|
||
|
<< r.location.height << ")"
|
||
|
<< ((r.confidence > _detection_threshold) ? " WILL BE RENDERED!" : "");
|
||
|
}
|
||
|
|
||
|
if( r.confidence > _detection_threshold )
|
||
|
results.push_back( r );
|
||
|
}
|
||
|
|
||
|
return results;
|
||
|
}
|
||
|
}
|