hight-level-robotics-congni.../librealsense/wrappers/openvino/rs-vino/object-detection.cpp

// License: Apache 2.0. See LICENSE file in root directory.
// Copyright(c) 2019 Intel Corporation. All Rights Reserved.


#include <rs-vino/object-detection.h>
#include <rs-vino/openvino-helpers.h>
#include <rsutils/easylogging/easyloggingpp.h>

using namespace InferenceEngine;


const size_t DETECTED_OBJECT_SIZE = 7;   // the size of each detected object


namespace openvino_helpers
{
    object_detection::object_detection(
        const std::string &pathToModel,
        double detectionThreshold,
        bool isAsync,
        int maxBatch, bool isBatchDynamic,
        bool doRawOutputMessages
    )
        : base_detection( "object detection", pathToModel, maxBatch, isBatchDynamic, isAsync, doRawOutputMessages )
        , _detection_threshold( detectionThreshold )
        , _max_results( 0 )
        , _n_enqued_frames( 0 ), _width( 0 ), _height( 0 )
    {
    }


    void object_detection::submit_request()
    {
        if( !_n_enqued_frames )
            return;
        _n_enqued_frames = 0;
        base_detection::submit_request();
    }


    void object_detection::enqueue( const cv::Mat & frame )
    {
        if( !enabled() )
            return;

        if( !_request )
            _request = net.CreateInferRequestPtr();

        _width = static_cast<float>(frame.cols);
        _height = static_cast<float>(frame.rows);

        Blob::Ptr  inputBlob = _request->GetBlob( _input_layer_name );
        matU8ToBlob<uint8_t>( frame, inputBlob );

        if( ! _im_info_name.empty() )
        {
            Blob::Ptr infoBlob = _request->GetBlob( _im_info_name );

            // (height, width, image_scale)
            float * p = infoBlob->buffer().as< PrecisionTrait< Precision::FP32 >::value_type * >();
            p[0] = static_cast< float >( _input_width );
            p[1] = static_cast< float >( _input_height );
            for( size_t k = 2; k < _im_info_size; k++ )
                p[k] = 1.f;  // all scale factors are set to 1.0
        }

        _n_enqued_frames = 1;
    }


    CNNNetwork object_detection::read_network()
    {
        LOG(INFO) << "Loading " << topoName << " model from: " << pathToModel;

        CNNNetwork network;

#ifdef OPENVINO2019
        CNNNetReader netReader;

        /** Read network model **/
        netReader.ReadNetwork( pathToModel );
        network = netReader.getNetwork();

        /** Extract model name and load its weights **/
        std::string binFileName = remove_ext( pathToModel ) + ".bin";
        netReader.ReadWeights( binFileName );
#else
        InferenceEngine::Core ie;
        /** Read network model **/
        network = ie.ReadNetwork( pathToModel );
#endif

        /** Set batch size **/
        //LOG(DEBUG) << "Batch size is set to " << maxBatch;
        network.setBatchSize(maxBatch);

        // We support networks with one or two inputs, though others may be possible...
        InputsDataMap inputInfo(network.getInputsInfo() );
        if( inputInfo.size() != 1  &&  inputInfo.size() != 2 )
            throw std::logic_error( "Object detection network should have only one or two inputs" );
        for( auto & item : inputInfo )
        {
            if( item.second->getInputData()->getTensorDesc().getDims().size() == 4 )
            {
                // Blob "data" (1x4) will contain the actual image data (e.g., 1,3,224,224 or 1,3,300,300)
                _input_layer_name = item.first;
                _input_width = item.second->getTensorDesc().getDims()[2];
                _input_height = item.second->getTensorDesc().getDims()[3];
                item.second->setPrecision( Precision::U8 );
            }
            else if( item.second->getInputData()->getTensorDesc().getDims().size() == 2 )
            {
                // Blob "im_info" is optional: 1x3 (height, width, image_scale)
                _im_info_name = item.first;
                auto const & dims = item.second->getTensorDesc().getDims();
                if( dims[0] != 1 )
                    throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );
                _im_info_size = dims[1];
                item.second->setPrecision( Precision::FP32 );
                if( _im_info_size != 3  &&  _im_info_size != 6 )
                    throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );
            }
        }
        if( _input_layer_name.empty() )
            throw std::logic_error( "Could not find input \"data\" layer in network" );

        // Only a single "DetectionOuput" layer is expected
        OutputsDataMap outputInfo(network.getOutputsInfo() );
        if( outputInfo.size() != 1 )
            throw std::logic_error(
                "Object detection network should have only one output" );
        _output_layer_name = outputInfo.begin()->first;
        DataPtr & outputDataPtr = outputInfo.begin()->second;

        // Checking if layer names are as expected
#ifdef OPENVINO2019
        const CNNLayerPtr outputLayer = network.getLayerByName(_output_layer_name.c_str());
        if (outputLayer->type != "DetectionOutput")
            throw std::logic_error(
                "Object detection network output layer(" + outputLayer->name +
                ") should be DetectionOutput, but was " + outputLayer->type);
        if (outputLayer->params.find("num_classes") == outputLayer->params.end())
            throw std::logic_error(
                "Object detection network output layer (" +
                _output_layer_name + ") should have num_classes integer attribute");
#else
#ifdef OPENVINO_NGRAPH
        // Inference Engine integrates the nGraph Core in OpenVINO >= 2020.1
        if (auto ngraphFunction = network.getFunction()) {
            for (const auto& out : outputInfo) {
                for (const auto& op : ngraphFunction->get_ops()) {
                    if (op->get_type_name() == ngraph::op::DetectionOutput::type_info.name) {
                        if (op->get_friendly_name() != out.second->getName()) {
                            std::string output_name = out.first;
                            std::string output_type = op->get_type_name();
                            throw std::logic_error(
                                "Object detection network output layer (" + output_name +
                                ") should be DetectionOutput, but was " + output_type);
                        }
                    }
                }
            }
        }
#endif
#endif

        /*
            Expect a blob of [1, 1, N, 7], where N is the number of detected bounding boxes.
            For each detection, the description has the format: [image_id, label, conf, x_min, y_min, x_max, y_max]
                image_id - ID of the image in the batch
                label - predicted class ID
                conf - confidence for the predicted class
                (x_min, y_min) - coordinates of the top left bounding box corner
                (x_max, y_max) - coordinates of the bottom right bounding box corner.
        */
        const SizeVector & outputDims = outputDataPtr->getTensorDesc().getDims();
        if( outputDims.size() != 4 )
            throw std::logic_error(
                "Object detection network output dimensions should be 4, but was " + std::to_string( outputDims.size() ) );
        size_t objectSize = outputDims[3];
        if( objectSize != DETECTED_OBJECT_SIZE )
            throw std::logic_error(
                "Object detection network output layer last dimension should be " +
                std::to_string( DETECTED_OBJECT_SIZE ) + "; got " + std::to_string( objectSize ) );
        _max_results = outputDims[2];
        outputDataPtr->setPrecision( Precision::FP32 );

        return network;
    }


    std::vector< object_detection::Result > object_detection::fetch_results()
    {
        std::vector< Result > results;
        const float *detections = _request->GetBlob( _output_layer_name )->buffer().as<float *>();

        for( size_t i = 0; i < _max_results; i++ )
        {
            float image_id = detections[i * DETECTED_OBJECT_SIZE + 0];
            if( image_id < 0 )
                break;

            // [image_id, label, confidence, x_min, y_min, x_max, y_max]
            Result r;
            r.label = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 1]);
            r.confidence = detections[i * DETECTED_OBJECT_SIZE + 2];
            if( r.confidence <= _detection_threshold && !doRawOutputMessages )
                continue;
            r.location.x = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 3] * _width);
            r.location.y = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 4] * _height);
            r.location.width = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 5] * _width - r.location.x);
            r.location.height = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 6] * _height - r.location.y);

            if( doRawOutputMessages )
            {
                LOG(DEBUG)
                    << "[" << i << "," << r.label << "] element, prob = " << r.confidence
                    << "    (" << r.location.x << "," << r.location.y << ")-(" << r.location.width << ","
                    << r.location.height << ")"
                    << ((r.confidence > _detection_threshold) ? " WILL BE RENDERED!" : "");
            }

            if( r.confidence > _detection_threshold )
                results.push_back( r );
        }

        return results;
    }
}
feat: Определение местоположения объектов относительно RealSense и их разметка 2 months ago			`// License: Apache 2.0. See LICENSE file in root directory.`
			`// Copyright(c) 2019 Intel Corporation. All Rights Reserved.`


			`#include <rs-vino/object-detection.h>`
			`#include <rs-vino/openvino-helpers.h>`
			`#include <rsutils/easylogging/easyloggingpp.h>`

			`using namespace InferenceEngine;`


			`const size_t DETECTED_OBJECT_SIZE = 7; // the size of each detected object`


			`namespace openvino_helpers`
			`{`
			`object_detection::object_detection(`
			`const std::string &pathToModel,`
			`double detectionThreshold,`
			`bool isAsync,`
			`int maxBatch, bool isBatchDynamic,`
			`bool doRawOutputMessages`
			`)`
			`: base_detection( "object detection", pathToModel, maxBatch, isBatchDynamic, isAsync, doRawOutputMessages )`
			`, _detection_threshold( detectionThreshold )`
			`, _max_results( 0 )`
			`, _n_enqued_frames( 0 ), _width( 0 ), _height( 0 )`
			`{`
			`}`


			`void object_detection::submit_request()`
			`{`
			`if( !_n_enqued_frames )`
			`return;`
			`_n_enqued_frames = 0;`
			`base_detection::submit_request();`
			`}`


			`void object_detection::enqueue( const cv::Mat & frame )`
			`{`
			`if( !enabled() )`
			`return;`

			`if( !_request )`
			`_request = net.CreateInferRequestPtr();`

			`_width = static_cast<float>(frame.cols);`
			`_height = static_cast<float>(frame.rows);`

			`Blob::Ptr inputBlob = _request->GetBlob( _input_layer_name );`
			`matU8ToBlob<uint8_t>( frame, inputBlob );`

			`if( ! _im_info_name.empty() )`
			`{`
			`Blob::Ptr infoBlob = _request->GetBlob( _im_info_name );`

			`// (height, width, image_scale)`
			`float * p = infoBlob->buffer().as< PrecisionTrait< Precision::FP32 >::value_type * >();`
			`p[0] = static_cast< float >( _input_width );`
			`p[1] = static_cast< float >( _input_height );`
			`for( size_t k = 2; k < _im_info_size; k++ )`
			`p[k] = 1.f; // all scale factors are set to 1.0`
			`}`

			`_n_enqued_frames = 1;`
			`}`


			`CNNNetwork object_detection::read_network()`
			`{`
			`LOG(INFO) << "Loading " << topoName << " model from: " << pathToModel;`

			`CNNNetwork network;`

			`#ifdef OPENVINO2019`
			`CNNNetReader netReader;`

			`/ Read network model /`
			`netReader.ReadNetwork( pathToModel );`
			`network = netReader.getNetwork();`

			`/ Extract model name and load its weights /`
			`std::string binFileName = remove_ext( pathToModel ) + ".bin";`
			`netReader.ReadWeights( binFileName );`
			`#else`
			`InferenceEngine::Core ie;`
			`/ Read network model /`
			`network = ie.ReadNetwork( pathToModel );`
			`#endif`

			`/ Set batch size /`
			`//LOG(DEBUG) << "Batch size is set to " << maxBatch;`
			`network.setBatchSize(maxBatch);`

			`// We support networks with one or two inputs, though others may be possible...`
			`InputsDataMap inputInfo(network.getInputsInfo() );`
			`if( inputInfo.size() != 1 && inputInfo.size() != 2 )`
			`throw std::logic_error( "Object detection network should have only one or two inputs" );`
			`for( auto & item : inputInfo )`
			`{`
			`if( item.second->getInputData()->getTensorDesc().getDims().size() == 4 )`
			`{`
			`// Blob "data" (1x4) will contain the actual image data (e.g., 1,3,224,224 or 1,3,300,300)`
			`_input_layer_name = item.first;`
			`_input_width = item.second->getTensorDesc().getDims()[2];`
			`_input_height = item.second->getTensorDesc().getDims()[3];`
			`item.second->setPrecision( Precision::U8 );`
			`}`
			`else if( item.second->getInputData()->getTensorDesc().getDims().size() == 2 )`
			`{`
			`// Blob "im_info" is optional: 1x3 (height, width, image_scale)`
			`_im_info_name = item.first;`
			`auto const & dims = item.second->getTensorDesc().getDims();`
			`if( dims[0] != 1 )`
			`throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );`
			`_im_info_size = dims[1];`
			`item.second->setPrecision( Precision::FP32 );`
			`if( _im_info_size != 3 && _im_info_size != 6 )`
			`throw std::logic_error( "Invalid input info: layer \"" + _im_info_name + "\" should be 1x3 or 1x6" );`
			`}`
			`}`
			`if( _input_layer_name.empty() )`
			`throw std::logic_error( "Could not find input \"data\" layer in network" );`

			`// Only a single "DetectionOuput" layer is expected`
			`OutputsDataMap outputInfo(network.getOutputsInfo() );`
			`if( outputInfo.size() != 1 )`
			`throw std::logic_error(`
			`"Object detection network should have only one output" );`
			`_output_layer_name = outputInfo.begin()->first;`
			`DataPtr & outputDataPtr = outputInfo.begin()->second;`

			`// Checking if layer names are as expected`
			`#ifdef OPENVINO2019`
			`const CNNLayerPtr outputLayer = network.getLayerByName(_output_layer_name.c_str());`
			`if (outputLayer->type != "DetectionOutput")`
			`throw std::logic_error(`
			`"Object detection network output layer(" + outputLayer->name +`
			`") should be DetectionOutput, but was " + outputLayer->type);`
			`if (outputLayer->params.find("num_classes") == outputLayer->params.end())`
			`throw std::logic_error(`
			`"Object detection network output layer (" +`
			`_output_layer_name + ") should have num_classes integer attribute");`
			`#else`
			`#ifdef OPENVINO_NGRAPH`
			`// Inference Engine integrates the nGraph Core in OpenVINO >= 2020.1`
			`if (auto ngraphFunction = network.getFunction()) {`
			`for (const auto& out : outputInfo) {`
			`for (const auto& op : ngraphFunction->get_ops()) {`
			`if (op->get_type_name() == ngraph::op::DetectionOutput::type_info.name) {`
			`if (op->get_friendly_name() != out.second->getName()) {`
			`std::string output_name = out.first;`
			`std::string output_type = op->get_type_name();`
			`throw std::logic_error(`
			`"Object detection network output layer (" + output_name +`
			`") should be DetectionOutput, but was " + output_type);`
			`}`
			`}`
			`}`
			`}`
			`}`
			`#endif`
			`#endif`

			`/*`
			`Expect a blob of [1, 1, N, 7], where N is the number of detected bounding boxes.`
			`For each detection, the description has the format: [image_id, label, conf, x_min, y_min, x_max, y_max]`
			`image_id - ID of the image in the batch`
			`label - predicted class ID`
			`conf - confidence for the predicted class`
			`(x_min, y_min) - coordinates of the top left bounding box corner`
			`(x_max, y_max) - coordinates of the bottom right bounding box corner.`
			`*/`
			`const SizeVector & outputDims = outputDataPtr->getTensorDesc().getDims();`
			`if( outputDims.size() != 4 )`
			`throw std::logic_error(`
			`"Object detection network output dimensions should be 4, but was " + std::to_string( outputDims.size() ) );`
			`size_t objectSize = outputDims[3];`
			`if( objectSize != DETECTED_OBJECT_SIZE )`
			`throw std::logic_error(`
			`"Object detection network output layer last dimension should be " +`
			`std::to_string( DETECTED_OBJECT_SIZE ) + "; got " + std::to_string( objectSize ) );`
			`_max_results = outputDims[2];`
			`outputDataPtr->setPrecision( Precision::FP32 );`

			`return network;`
			`}`


			`std::vector< object_detection::Result > object_detection::fetch_results()`
			`{`
			`std::vector< Result > results;`
			`const float detections = _request->GetBlob( _output_layer_name )->buffer().as<float >();`

			`for( size_t i = 0; i < _max_results; i++ )`
			`{`
			`float image_id = detections[i * DETECTED_OBJECT_SIZE + 0];`
			`if( image_id < 0 )`
			`break;`

			`// [image_id, label, confidence, x_min, y_min, x_max, y_max]`
			`Result r;`
			`r.label = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 1]);`
			`r.confidence = detections[i * DETECTED_OBJECT_SIZE + 2];`
			`if( r.confidence <= _detection_threshold && !doRawOutputMessages )`
			`continue;`
			`r.location.x = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 3] * _width);`
			`r.location.y = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 4] * _height);`
			`r.location.width = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 5] * _width - r.location.x);`
			`r.location.height = static_cast<int>(detections[i * DETECTED_OBJECT_SIZE + 6] * _height - r.location.y);`

			`if( doRawOutputMessages )`
			`{`
			`LOG(DEBUG)`
			`<< "[" << i << "," << r.label << "] element, prob = " << r.confidence`
			`<< " (" << r.location.x << "," << r.location.y << ")-(" << r.location.width << ","`
			`<< r.location.height << ")"`
			`<< ((r.confidence > _detection_threshold) ? " WILL BE RENDERED!" : "");`
			`}`

			`if( r.confidence > _detection_threshold )`
			`results.push_back( r );`
			`}`

			`return results;`
			`}`
			`}`