JeVois  1.23
JeVois Smart Embedded Machine Vision Toolkit
Share this page:
Loading...
Searching...
No Matches
CLIP.C
Go to the documentation of this file.
1// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2//
3// JeVois Smart Embedded Machine Vision Toolkit - Copyright (C) 2024 by Laurent Itti, the University of Southern
4// California (USC), and iLab at USC. See http://iLab.usc.edu and http://jevois.org for information about this project.
5//
6// This file is part of the JeVois Smart Embedded Machine Vision Toolkit. This program is free software; you can
7// redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software
8// Foundation, version 2. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
9// without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
10// License for more details. You should have received a copy of the GNU General Public License along with this program;
11// if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
12//
13// Contact information: Laurent Itti - 3641 Watt Way, HNB-07A - Los Angeles, CA 90089-2520 - USA.
14// Tel: +1 213 740 3527 - itti@pollux.usc.edu - http://iLab.usc.edu - http://jevois.org
15// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
16/*! \file */
17
18#ifdef JEVOIS_PRO
19
20#include <jevois/DNN/CLIP.H>
21#include <jevois/Debug/Log.H>
22#include <clip.cpp/clip.h>
23
24#define CLIP_THREADS 4
25
26// ####################################################################################################
28{
29 if (itsCtx) clip_free(itsCtx);
30}
31
32// ####################################################################################################
33jevois::dnn::CLIP::CLIP(std::string const & fname)
34{
35 if (itsCtx) clip_free(itsCtx);
36
37 LINFO("Loading CLIP model " << fname << " ...");
38
39 itsCtx = clip_model_load(fname.c_str(), 0 /* verbosity */);
40 if (itsCtx == nullptr) LFATAL("Failed to load model from " << fname);
41
42 LINFO("CLIP model ready.");
43}
44
45// ####################################################################################################
46cv::Mat jevois::dnn::CLIP::textEmbedding(std::string const & txt)
47{
48 // First get tokens from text:
49 clip_tokens tokens;
50 if (!clip_tokenize(itsCtx, txt.c_str(), &tokens)) LFATAL("Failed to tokenize [" << txt << ']');
51
52 // Then get embedding from the tokens:
53 int const vec_dim = clip_get_text_hparams(itsCtx)->projection_dim;
54 cv::Mat ret(1, vec_dim, CV_32F); float * vec = (float *)ret.data;
55
56 if (!clip_text_encode(itsCtx, CLIP_THREADS, &tokens, vec, false)) LFATAL("Failed to encode text [" << txt << ']');
57
58 // Standardize to unit norm, as expected by YOLO-JeVois:
59 float const norm = cv::norm(ret);
60 ret /= norm;
61
62 return ret;
63}
64
65// ####################################################################################################
67{
68 if (itsCtx == nullptr) LFATAL("No CLIP model loaded");
69 if (clip_model_has_text_encoder(itsCtx) == false) return 0;
70 return clip_get_text_hparams(itsCtx)->projection_dim;
71}
72
73// ####################################################################################################
74cv::Mat jevois::dnn::CLIP::imageEmbedding(cv::Mat const & img)
75{
76 if (itsCtx == nullptr) LFATAL("No CLIP model loaded");
77 if (img.type() != CV_8UC3) LFATAL("input image must be CV_8UC3 in RGB order");
78
79 // Create a clip image from our cv::Mat with zero copy:
80 clip_image_u8 const img_input
81 { img.cols, img.rows, const_cast<uint8_t *>(img.data), size_t(img.rows * img.cols * 3) };
82
83 // Pre-process image to float32 RGB with bilinear interpolation and value normalization:
84 clip_image_f32 img_res;
85 if (!clip_image_preprocess(itsCtx, &img_input, &img_res)) LFATAL("Failed to pre-process image for CLIP");
86
87 // Get the embedding for the image:
88 const int vec_dim = clip_get_vision_hparams(itsCtx)->projection_dim;
89 cv::Mat ret(1, vec_dim, CV_32F);
90 clip_image_encode(itsCtx, CLIP_THREADS, &img_res, (float *)ret.data, false);
91 clip_image_f32_clean(&img_res); // NOTE: do not free img_input, its pixel data is owned by cv::Mat img
92
93 // Standardize to unit norm, as expected by YOLO-JeVois:
94 float const norm = cv::norm(ret);
95 ret /= norm;
96
97 return ret;
98}
99
100// ####################################################################################################
102{
103 if (itsCtx == nullptr) LFATAL("No CLIP model loaded");
104 if (clip_model_has_vision_encoder(itsCtx) == false) return 0;
105 return clip_get_vision_hparams(itsCtx)->projection_dim;
106}
107
108// ####################################################################################################
109float jevois::dnn::CLIP::similarity(cv::Mat const & emb1, cv::Mat const & emb2) const
110{
111 size_t const vec_dim = emb1.total();
112 if (vec_dim != emb2.total()) LFATAL("Mismatched embedding sizes: " << vec_dim << " vs. " << emb2.total());
113 if (emb1.type() != CV_32F || emb2.type() != CV_32F) LFATAL("Embedding type must be CV_32F");
114
115 return clip_similarity_score((float const *)(emb1.data), (float const *)(emb2.data), vec_dim);
116}
117
118#endif // JEVOIS_PRO
#define CLIP_THREADS
Definition CLIP.C:24
float similarity(cv::Mat const &emb1, cv::Mat const &emb2) const
Compute cosine similarity between two embeddings.
Definition CLIP.C:109
virtual ~CLIP()
Virtual destructor for safe inheritance.
Definition CLIP.C:27
cv::Mat textEmbedding(std::string const &txt)
Get embedding for some text, typically as a 1x512 float matrix (depends on clip model version)
Definition CLIP.C:46
cv::Mat imageEmbedding(cv::Mat const &img)
Get embedding for some RGB uint8 packed image, typically as a 1x512 float matrix.
Definition CLIP.C:74
int textEmbeddingSize() const
Get text embedding size, useful if we need to know it before getting an embedding,...
Definition CLIP.C:66
int imageEmbeddingSize() const
Get image embedding size, useful if we need to know it before getting an embedding,...
Definition CLIP.C:101
CLIP(std::string const &modelpath)
Construct and load a model from disk.
Definition CLIP.C:33
#define LFATAL(msg)
Convenience macro for users to print out console or syslog messages, FATAL level.
Definition Log.H:230
#define LINFO(msg)
Convenience macro for users to print out console or syslog messages, INFO level.
Definition Log.H:194