// License: Apache 2.0. See LICENSE file in root directory.
// Copyright(c) 2019 Intel Corporation. All Rights Reserved.
#include <rs-vino/object-detection.h>
#include <rs-vino/openvino-helpers.h>
#include <rsutils/easylogging/easyloggingpp.h>
using namespace InferenceEngine;
const size_t DETECTED_OBJECT_SIZE = 7; // the size of each detected object
namespace openvino_helpers
{
// Construct a detector around an IR model file.
// detectionThreshold: minimum confidence for a detection to be reported.
// Remaining arguments are forwarded verbatim to base_detection; the
// image dimensions and result count start at zero until the network is
// read and the first frame is enqueued.
object_detection::object_detection(
    const std::string &pathToModel,
    double detectionThreshold,
    bool isAsync,
    int maxBatch, bool isBatchDynamic,
    bool doRawOutputMessages
)
    : base_detection( "object detection", pathToModel, maxBatch, isBatchDynamic, isAsync, doRawOutputMessages )
    , _detection_threshold( detectionThreshold )
    , _max_results( 0 )
    , _n_enqued_frames( 0 )
    , _width( 0 )
    , _height( 0 )
{
}
// Kick off inference for the frame staged by enqueue(), if any.
// A no-op when nothing has been enqueued since the last submission.
void object_detection::submit_request()
{
    if( _n_enqued_frames )
    {
        _n_enqued_frames = 0;           // consume the staged frame
        base_detection::submit_request();
    }
}
// Stage one BGR frame as the network input, ready for submit_request().
// Does nothing when the detector is disabled.
void object_detection::enqueue( const cv::Mat & frame )
{
    if( !enabled() )
        return;

    // The inference request is created lazily, on first use
    if( !_request )
        _request = net.CreateInferRequestPtr();

    // Remember the source frame size so fetch_results() can scale the
    // network's normalized box coordinates back into pixels
    _width = static_cast< float >( frame.cols );
    _height = static_cast< float >( frame.rows );

    Blob::Ptr imageBlob = _request->GetBlob( _input_layer_name );
    matU8ToBlob< uint8_t >( frame, imageBlob );

    // Optional second input: (height, width, image_scale)
    if( !_im_info_name.empty() )
    {
        Blob::Ptr infoBlob = _request->GetBlob( _im_info_name );
        float * data = infoBlob->buffer().as< PrecisionTrait< Precision::FP32 >::value_type * >();
        data[0] = static_cast< float >( _input_width );
        data[1] = static_cast< float >( _input_height );
        // all scale factors are set to 1.0
        for( size_t j = 2; j < _im_info_size; j++ )
            data[j] = 1.f;
    }

    _n_enqued_frames = 1;
}
// Read the detection model from IR (.xml + .bin), validate that it looks like
// an SSD-style object-detection topology, and cache the layer names and sizes
// that enqueue() and fetch_results() rely on (_input_layer_name, _im_info_name,
// _im_info_size, _output_layer_name, _max_results, input dimensions).
// Returns the configured CNNNetwork, ready to be loaded onto a device.
// Throws std::logic_error for any topology that does not match expectations.
CNNNetwork object_detection::read_network()
{
    LOG(INFO) << "Loading " << topoName << " model from: " << pathToModel;

    CNNNetwork network;
#ifdef OPENVINO2019
    // Pre-2020 API: CNNNetReader reads the topology and weights separately
    CNNNetReader netReader;
    /** Read network model **/
    netReader.ReadNetwork( pathToModel );
    network = netReader.getNetwork();
    /** Extract model name and load its weights **/
    std::string binFileName = remove_ext( pathToModel ) + ".bin";
    netReader.ReadWeights( binFileName );
#else
    // 2020.1+ API: Core::ReadNetwork finds the .bin next to the .xml itself
    InferenceEngine::Core ie;
    /** Read network model **/
    network = ie.ReadNetwork( pathToModel );
#endif
    /** Set batch size **/
    //LOG(DEBUG) << "Batch size is set to " << maxBatch;
    network.setBatchSize(maxBatch);

    // We support networks with one or two inputs, though others may be possible...
    InputsDataMap inputInfo(network.getInputsInfo() );
    if( inputInfo.size() != 1 && inputInfo.size() != 2 )
        throw std::logic_error( "Object detection network should have only one or two inputs" );
    for( auto & item : inputInfo )
    {
        if( item.second->getInputData()->getTensorDesc().getDims().size() == 4 )
        {
            // Blob "data" (1x4) will contain the actual image data (e.g., 1,3,224,224 or 1,3,300,300)
            _input_layer_name = item.first;
            // NOTE(review): for an NCHW tensor, dims[2] is height and dims[3]
            // is width, so these two members appear swapped -- but enqueue()
            // writes them into im_info in (height, width) order via p[0]/p[1],
            // so the two swaps cancel out.  Confirm before "fixing" either side.
            _input_width = item.second->getTensorDesc().getDims()[2];
            _input_height = item.second->getTensorDesc().getDims()[3];
            // Image bytes are fed as-is; precision conversion happens in the plugin
            item.second->setPrecision( Precision::U8 );
        }
        else if( item.second->getInputData()->getTensorDesc().getDims().size() == 2 )
        {
            // Blob "im_info" is optional: 1x3 (height, width, image_scale)
            _im_info_name = item.first;
            auto const & dims = item.second->getTensorDesc().getDims();
            if( dims[0] != 1 )
                throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );
            _im_info_size = dims[1];
            item.second->setPrecision( Precision::FP32 );
            if( _im_info_size != 3 && _im_info_size != 6 )
                throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );
        }
    }
    if( _input_layer_name.empty() )
        throw std::logic_error( "Could not find input \"data\" layer in network" );

    // Only a single "DetectionOutput" layer is expected
    OutputsDataMap outputInfo(network.getOutputsInfo() );
    if( outputInfo.size() != 1 )
        throw std::logic_error(
            "Object detection network should have only one output" );
    _output_layer_name = outputInfo.begin()->first;
    DataPtr & outputDataPtr = outputInfo.begin()->second;

    // Checking if layer names are as expected
#ifdef OPENVINO2019
    // Old API exposes layers directly; verify the output layer type and that
    // it carries the num_classes attribute DetectionOutput layers must have
    const CNNLayerPtr outputLayer = network.getLayerByName(_output_layer_name.c_str());
    if (outputLayer->type != "DetectionOutput")
        throw std::logic_error(
            "Object detection network output layer(" + outputLayer->name +
            ") should be DetectionOutput, but was " + outputLayer->type);
    if (outputLayer->params.find("num_classes") == outputLayer->params.end())
        throw std::logic_error(
            "Object detection network output layer (" +
            _output_layer_name + ") should have num_classes integer attribute");
#else
#ifdef OPENVINO_NGRAPH
    // Inference Engine integrates the nGraph Core in OpenVINO >= 2020.1
    // Walk the nGraph function and make sure the network's output corresponds
    // to a DetectionOutput operation
    if (auto ngraphFunction = network.getFunction()) {
        for (const auto& out : outputInfo) {
            for (const auto& op : ngraphFunction->get_ops()) {
                if (op->get_type_name() == ngraph::op::DetectionOutput::type_info.name) {
                    if (op->get_friendly_name() != out.second->getName()) {
                        std::string output_name = out.first;
                        std::string output_type = op->get_type_name();
                        throw std::logic_error(
                            "Object detection network output layer (" + output_name +
                            ") should be DetectionOutput, but was " + output_type);
                    }
                }
            }
        }
    }
#endif
#endif

    /*
        Expect a blob of [1, 1, N, 7], where N is the number of detected bounding boxes.
        For each detection, the description has the format: [image_id, label, conf, x_min, y_min, x_max, y_max]
            image_id - ID of the image in the batch
            label - predicted class ID
            conf - confidence for the predicted class
            (x_min, y_min) - coordinates of the top left bounding box corner
            (x_max, y_max) - coordinates of the bottom right bounding box corner.
    */
    const SizeVector & outputDims = outputDataPtr->getTensorDesc().getDims();
    if( outputDims.size() != 4 )
        throw std::logic_error(
            "Object detection network output dimensions should be 4, but was " + std::to_string( outputDims.size() ) );
    size_t objectSize = outputDims[3];
    if( objectSize != DETECTED_OBJECT_SIZE )
        throw std::logic_error(
            "Object detection network output layer last dimension should be " +
            std::to_string( DETECTED_OBJECT_SIZE ) + "; got " + std::to_string( objectSize ) );
    // N (outputDims[2]) bounds the number of detections fetch_results() reads
    _max_results = outputDims[2];
    outputDataPtr->setPrecision( Precision::FP32 );

    return network;
}
// Parse the [1,1,N,7] DetectionOutput blob of the last completed request into
// a list of results, keeping only detections whose confidence exceeds the
// threshold.  Normalized [0..1] box coordinates are scaled back to the pixel
// dimensions of the frame that was passed to enqueue().
std::vector< object_detection::Result > object_detection::fetch_results()
{
    std::vector< Result > results;
    const float * output = _request->GetBlob( _output_layer_name )->buffer().as< float * >();

    for( size_t i = 0; i < _max_results; i++ )
    {
        // Each detection: [image_id, label, confidence, x_min, y_min, x_max, y_max]
        const float * det = output + i * DETECTED_OBJECT_SIZE;

        // A negative image_id marks the end of the valid detections
        float image_id = det[0];
        if( image_id < 0 )
            break;

        Result r;
        r.label = static_cast< int >( det[1] );
        r.confidence = det[2];
        // Skip low-confidence hits early -- unless raw output was requested,
        // in which case we still want to log them below
        if( !doRawOutputMessages && r.confidence <= _detection_threshold )
            continue;

        r.location.x = static_cast< int >( det[3] * _width );
        r.location.y = static_cast< int >( det[4] * _height );
        r.location.width = static_cast< int >( det[5] * _width - r.location.x );
        r.location.height = static_cast< int >( det[6] * _height - r.location.y );

        if( doRawOutputMessages )
        {
            LOG(DEBUG)
                << "[" << i << "," << r.label << "] element, prob = " << r.confidence
                << " (" << r.location.x << "," << r.location.y << ")-(" << r.location.width << ","
                << r.location.height << ")"
                << ((r.confidence > _detection_threshold) ? " WILL BE RENDERED!" : "");
        }

        if( r.confidence > _detection_threshold )
            results.push_back( r );
    }
    return results;
}
}