// License: Apache 2.0. See LICENSE file in root directory.
// Copyright(c) 2019 Intel Corporation. All Rights Reserved.

#include "object-detection.h"
#include "openvino-helpers.h"
#include <easylogging++.h>

using namespace InferenceEngine;

const size_t DETECTED_OBJECT_SIZE = 7;  // the size of each detected object

namespace openvino_helpers
{
    object_detection::object_detection(
        const std::string & pathToModel,
        double detectionThreshold,
        bool isAsync,
        int maxBatch, bool isBatchDynamic,
        bool doRawOutputMessages
    )
        : base_detection( "object detection", pathToModel, maxBatch, isBatchDynamic, isAsync, doRawOutputMessages )
        , _detection_threshold( detectionThreshold )
        , _max_results( 0 )
        , _n_enqued_frames( 0 ), _width( 0 ), _height( 0 )
    {
    }


    void object_detection::submit_request()
    {
        if( ! _n_enqued_frames )
            return;
        _n_enqued_frames = 0;
        base_detection::submit_request();
    }


    void object_detection::enqueue( const cv::Mat & frame )
    {
        if( ! enabled() )
            return;

        if( ! _request )
            _request = net.CreateInferRequestPtr();

        _width = static_cast< float >( frame.cols );
        _height = static_cast< float >( frame.rows );

        Blob::Ptr inputBlob = _request->GetBlob( _input_layer_name );
        matU8ToBlob< uint8_t >( frame, inputBlob );

        if( ! _im_info_name.empty() )
        {
            Blob::Ptr infoBlob = _request->GetBlob( _im_info_name );

            // (height, width, image_scale)
            float * p = infoBlob->buffer().as< PrecisionTrait< Precision::FP32 >::value_type * >();
            p[0] = static_cast< float >( _input_height );
            p[1] = static_cast< float >( _input_width );
            for( size_t k = 2; k < _im_info_size; k++ )
                p[k] = 1.f;  // all scale factors are set to 1.0
        }

        _n_enqued_frames = 1;
    }


    CNNNetwork object_detection::read_network()
    {
        LOG(INFO) << "Loading " << topoName << " model from: " << pathToModel;

        CNNNetwork network;
#ifdef OPENVINO2019
        CNNNetReader netReader;
        /** Read network model **/
        netReader.ReadNetwork( pathToModel );
        network = netReader.getNetwork();
        /** Extract model name and load its weights **/
        std::string binFileName = remove_ext( pathToModel ) + ".bin";
        netReader.ReadWeights( binFileName );
#else
        InferenceEngine::Core ie;
        /** Read network model **/
        network = ie.ReadNetwork( pathToModel );
#endif

        /** Set batch size **/
        //LOG(DEBUG) << "Batch size is set to " << maxBatch;
        network.setBatchSize( maxBatch );

        // We support networks with one or two inputs, though others may be possible...
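        // For example (typical topologies; an assumption, not a guarantee): an
        // SSD-style model usually has a single 4-dimensional NCHW image input,
        // while Faster R-CNN variants add a second 2-dimensional "im_info" input
        // of shape 1x3 = (height, width, image_scale). The loop below tells the
        // two apart purely by the number of dimensions of each input.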
        InputsDataMap inputInfo( network.getInputsInfo() );
        if( inputInfo.size() != 1  &&  inputInfo.size() != 2 )
            throw std::logic_error( "Object detection network should have only one or two inputs" );
        for( auto & item : inputInfo )
        {
            if( item.second->getInputData()->getTensorDesc().getDims().size() == 4 )
            {
                // The 4-dimensional "data" blob holds the actual image data, in
                // NCHW layout (e.g., 1,3,224,224 or 1,3,300,300)
                _input_layer_name = item.first;
                _input_height = item.second->getTensorDesc().getDims()[2];
                _input_width = item.second->getTensorDesc().getDims()[3];
                item.second->setPrecision( Precision::U8 );
            }
            else if( item.second->getInputData()->getTensorDesc().getDims().size() == 2 )
            {
                // Blob "im_info" is optional: 1x3 = (height, width, image_scale); some models use 1x6
                _im_info_name = item.first;
                auto const & dims = item.second->getTensorDesc().getDims();
                if( dims[0] != 1 )
                    throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );
                _im_info_size = dims[1];
                if( _im_info_size != 3  &&  _im_info_size != 6 )
                    throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );
                item.second->setPrecision( Precision::FP32 );
            }
        }
        if( _input_layer_name.empty() )
            throw std::logic_error( "Could not find input \"data\" layer in network" );

        // Only a single "DetectionOutput" layer is expected
        OutputsDataMap outputInfo( network.getOutputsInfo() );
        if( outputInfo.size() != 1 )
            throw std::logic_error( "Object detection network should have only one output" );
        _output_layer_name = outputInfo.begin()->first;
        DataPtr & outputDataPtr = outputInfo.begin()->second;

        // Checking if layer names and types are as expected
#ifdef OPENVINO2019
        const CNNLayerPtr outputLayer = network.getLayerByName( _output_layer_name.c_str() );
        if( outputLayer->type != "DetectionOutput" )
            throw std::logic_error(
                "Object detection network output layer (" + outputLayer->name
                + ") should be DetectionOutput, but was " + outputLayer->type );
        if( outputLayer->params.find( "num_classes" ) == outputLayer->params.end() )
            throw std::logic_error(
                "Object detection network output layer (" + _output_layer_name
                + ") should have num_classes integer attribute" );
#else
#ifdef OPENVINO_NGRAPH
        // Inference Engine integrates the nGraph Core in OpenVINO >= 2020.1
        if( auto ngraphFunction = network.getFunction() )
        {
            for( const auto & out : outputInfo )
            {
                // The op whose friendly name matches the output must be a DetectionOutput
                for( const auto & op : ngraphFunction->get_ops() )
                {
                    if( op->get_friendly_name() == out.second->getName()
                        && op->get_type_info() != ngraph::op::DetectionOutput::type_info )
                    {
                        throw std::logic_error(
                            "Object detection network output layer (" + out.first
                            + ") should be DetectionOutput, but was " + std::string( op->get_type_name() ) );
                    }
                }
            }
        }
#endif
#endif

        /*
            Expect a blob of [1, 1, N, 7], where N is the number of detected bounding
            boxes. For each detection, the description has the format:
                [image_id, label, conf, x_min, y_min, x_max, y_max]
            image_id - ID of the image in the batch
            label - predicted class ID
            conf - confidence for the predicted class
            (x_min, y_min) - coordinates of the top left bounding box corner
            (x_max, y_max) - coordinates of the bottom right bounding box corner
        */
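        // To illustrate (values made up): a single detection row such as
        //     [ 0.f, 1.f, 0.92f, 0.10f, 0.20f, 0.35f, 0.60f ]
        // means: image 0 in the batch, class 1, 92% confidence, top-left corner at
        // (10%, 20%) and bottom-right corner at (35%, 60%) of the input size.
        // fetch_results() below scales these normalized coordinates to pixels.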
        const SizeVector & outputDims = outputDataPtr->getTensorDesc().getDims();
        if( outputDims.size() != 4 )
            throw std::logic_error(
                "Object detection network output dimensions should be 4, but was "
                + std::to_string( outputDims.size() ) );
        size_t objectSize = outputDims[3];
        if( objectSize != DETECTED_OBJECT_SIZE )
            throw std::logic_error(
                "Object detection network output layer last dimension should be "
                + std::to_string( DETECTED_OBJECT_SIZE ) + "; got " + std::to_string( objectSize ) );
        _max_results = outputDims[2];
        outputDataPtr->setPrecision( Precision::FP32 );

        return network;
    }


    std::vector< object_detection::Result > object_detection::fetch_results()
    {
        std::vector< Result > results;

        const float * detections = _request->GetBlob( _output_layer_name )->buffer().as< float * >();

        for( size_t i = 0; i < _max_results; i++ )
        {
            float image_id = detections[i * DETECTED_OBJECT_SIZE + 0];
            if( image_id < 0 )
                break;  // a negative image_id marks the end of valid detections

            // [image_id, label, confidence, x_min, y_min, x_max, y_max]
            Result r;
            r.label = static_cast< int >( detections[i * DETECTED_OBJECT_SIZE + 1] );
            r.confidence = detections[i * DETECTED_OBJECT_SIZE + 2];
            if( r.confidence <= _detection_threshold  &&  ! doRawOutputMessages )
                continue;
            // Scale the normalized [0..1] coordinates back to the enqueued frame size
            r.location.x = static_cast< int >( detections[i * DETECTED_OBJECT_SIZE + 3] * _width );
            r.location.y = static_cast< int >( detections[i * DETECTED_OBJECT_SIZE + 4] * _height );
            r.location.width = static_cast< int >( detections[i * DETECTED_OBJECT_SIZE + 5] * _width - r.location.x );
            r.location.height = static_cast< int >( detections[i * DETECTED_OBJECT_SIZE + 6] * _height - r.location.y );

            if( doRawOutputMessages )
            {
                LOG(DEBUG) << "[" << i << "," << r.label << "] element, prob = " << r.confidence
                    << "    (" << r.location.x << "," << r.location.y << ")-(" << r.location.width
                    << "," << r.location.height << ")"
                    << ( ( r.confidence > _detection_threshold ) ? " WILL BE RENDERED!" : "" );
            }

            if( r.confidence > _detection_threshold )
                results.push_back( r );
        }

        return results;
    }
}
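#if 0
// A minimal usage sketch, excluded from the build. It assumes a synchronous
// detector, an SSD IR at "ssd.xml" and an image at "demo.jpg" (both hypothetical
// names), and that base_detection exposes load_into() and wait() helpers --
// check base-detection.h for the actual loading/synchronization API.
#include "object-detection.h"
#include <opencv2/opencv.hpp>

int main()
{
    openvino_helpers::object_detection detector(
        "ssd.xml",  // model IR; the .bin weights are expected alongside it
        0.5,        // detection threshold
        false,      // isAsync: run synchronously for simplicity
        1, false,   // maxBatch, isBatchDynamic
        false );    // doRawOutputMessages
    InferenceEngine::Core ie;
    detector.load_into( ie, "CPU" );  // assumed helper from base_detection

    cv::Mat frame = cv::imread( "demo.jpg" );
    detector.enqueue( frame );        // fills the "data" (and optional "im_info") blobs
    detector.submit_request();        // starts inference
    detector.wait();                  // assumed helper: blocks until results are ready
    for( auto & result : detector.fetch_results() )
        cv::rectangle( frame, result.location, cv::Scalar( 0, 255, 0 ), 2 );
    cv::imwrite( "out.jpg", frame );
    return 0;
}
#endif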