JeVois  1.23
JeVois Smart Embedded Machine Vision Toolkit
Share this page:
Loading...
Searching...
No Matches
CLIP.H
Go to the documentation of this file.
1// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2//
3// JeVois Smart Embedded Machine Vision Toolkit - Copyright (C) 2024 by Laurent Itti, the University of Southern
4// California (USC), and iLab at USC. See http://iLab.usc.edu and http://jevois.org for information about this project.
5//
6// This file is part of the JeVois Smart Embedded Machine Vision Toolkit. This program is free software; you can
7// redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
8// Foundation, version 2. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
9// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
10// License for more details. You should have received a copy of the GNU General Public License along with this program;
11// if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
12//
13// Contact information: Laurent Itti - 3641 Watt Way, HNB-07A - Los Angeles, CA 90089-2520 - USA.
14// Tel: +1 213 740 3527 - itti@pollux.usc.edu - http://iLab.usc.edu - http://jevois.org
15// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
16/*! \file */
17
18#pragma once
19
20#ifdef JEVOIS_PRO
21
22#include <opencv2/opencv.hpp>
23
24struct clip_ctx;
25
26namespace jevois
27{
28 namespace dnn
29 {
30 //! Interface to a CLIP model used to compute text and image embeddings
31 /*! The CLIP model runs on CPU using clip.cpp and ggml. It is used to compute text or image embeddings for
32 open-world object detection models like YOLO-JeVois. The embeddings are stored in float cv::Mat with size 1x512
33 for easy concatenation of several embeddings to be given as input to YOLO-JeVois as a 1xCx512 tensor for C
34 object detection classes. \ingroup dnn */
35 class CLIP
36 {
37 public:
38 //! Construct and load a model from disk; modelpath is the location of the model to load
39 CLIP(std::string const & modelpath);
40
41 //! Virtual destructor for safe inheritance
42 virtual ~CLIP();
43
44 //! Freeze/unfreeze parameters that users should not change while running (presumably doit=true freezes, false unfreezes -- confirm in CLIP.C)
45 void freeze(bool doit);
46
47 //! Get embedding for the text in txt, typically as a 1x512 float matrix (depends on clip model version)
48 cv::Mat textEmbedding(std::string const & txt);
49
50 //! Get text embedding size, useful if we need to know it before getting an embedding, or 0 if no text encoder
51 int textEmbeddingSize() const;
52
53 //! Get embedding for some RGB uint8 packed image img, typically as a 1x512 float matrix
54 /*! Any image size is ok, the image will be rescaled and normalized to match what the CLIP model wants. */
55 cv::Mat imageEmbedding(cv::Mat const & img);
56
57 //! Get image embedding size, useful if we need to know it before getting an embedding, or 0 if no image encoder
58 int imageEmbeddingSize() const;
59
60 //! Compute cosine similarity between two embeddings emb1 and emb2 (NOTE(review): presumably both must have the same size -- confirm in CLIP.C)
61 float similarity(cv::Mat const & emb1, cv::Mat const & emb2) const;
62
63 private:
64 struct clip_ctx * itsCtx = nullptr; // Our clip.cpp context (opaque, forward-declared type). NOTE(review): raw pointer in a class with a user-declared destructor and no declared copy operations -- confirm ownership in CLIP.C; if the destructor frees it, copying a CLIP would double-free
65 };
66
67
68 } // namespace dnn
69} // namespace jevois
70
71#endif // JEVOIS_PRO
Interface to a CLIP model used to compute text and image embeddings.
Definition CLIP.H:36
void freeze(bool doit)
Freeze/unfreeze parameters that users should not change while running.
float similarity(cv::Mat const &emb1, cv::Mat const &emb2) const
Compute cosine similarity between two embeddings.
Definition CLIP.C:109
virtual ~CLIP()
Virtual destructor for safe inheritance.
Definition CLIP.C:27
cv::Mat textEmbedding(std::string const &txt)
Get embedding for some text, typically as a 1x512 float matrix (depends on clip model version)
Definition CLIP.C:46
cv::Mat imageEmbedding(cv::Mat const &img)
Get embedding for some RGB uint8 packed image, typically as a 1x512 float matrix.
Definition CLIP.C:74
int textEmbeddingSize() const
Get text embedding size, useful if we need to know it before getting an embedding, or 0 if no text encoder.
Definition CLIP.C:66
int imageEmbeddingSize() const
Get image embedding size, useful if we need to know it before getting an embedding, or 0 if no image encoder.
Definition CLIP.C:101
Main namespace for all JeVois classes and functions.
Definition Concepts.dox:2