22#include <clip.cpp/clip.h>
29 if (itsCtx) clip_free(itsCtx);
35 if (itsCtx) clip_free(itsCtx);
37 LINFO(
"Loading CLIP model " << fname <<
" ...");
39 itsCtx = clip_model_load(fname.c_str(), 0 );
40 if (itsCtx ==
nullptr)
LFATAL(
"Failed to load model from " << fname);
42 LINFO(
"CLIP model ready.");
50 if (!clip_tokenize(itsCtx, txt.c_str(), &tokens))
LFATAL(
"Failed to tokenize [" << txt <<
']');
53 int const vec_dim = clip_get_text_hparams(itsCtx)->projection_dim;
54 cv::Mat ret(1, vec_dim, CV_32F);
float * vec = (
float *)ret.data;
56 if (!clip_text_encode(itsCtx,
CLIP_THREADS, &tokens, vec,
false))
LFATAL(
"Failed to encode text [" << txt <<
']');
59 float const norm = cv::norm(ret);
68 if (itsCtx ==
nullptr)
LFATAL(
"No CLIP model loaded");
69 if (clip_model_has_text_encoder(itsCtx) ==
false)
return 0;
70 return clip_get_text_hparams(itsCtx)->projection_dim;
76 if (itsCtx ==
nullptr)
LFATAL(
"No CLIP model loaded");
77 if (img.type() != CV_8UC3)
LFATAL(
"input image must be CV_8UC3 in RGB order");
80 clip_image_u8
const img_input
81 { img.cols, img.rows,
const_cast<uint8_t *
>(img.data),
size_t(img.rows * img.cols * 3) };
84 clip_image_f32 img_res;
85 if (!clip_image_preprocess(itsCtx, &img_input, &img_res))
LFATAL(
"Failed to pre-process image for CLIP");
88 const int vec_dim = clip_get_vision_hparams(itsCtx)->projection_dim;
89 cv::Mat ret(1, vec_dim, CV_32F);
90 clip_image_encode(itsCtx,
CLIP_THREADS, &img_res, (
float *)ret.data,
false);
91 clip_image_f32_clean(&img_res);
94 float const norm = cv::norm(ret);
103 if (itsCtx ==
nullptr)
LFATAL(
"No CLIP model loaded");
104 if (clip_model_has_vision_encoder(itsCtx) ==
false)
return 0;
105 return clip_get_vision_hparams(itsCtx)->projection_dim;
111 size_t const vec_dim = emb1.total();
112 if (vec_dim != emb2.total())
LFATAL(
"Mismatched embedding sizes: " << vec_dim <<
" vs. " << emb2.total());
113 if (emb1.type() != CV_32F || emb2.type() != CV_32F)
LFATAL(
"Embedding type must be CV_32F");
115 return clip_similarity_score((
float const *)(emb1.data), (
float const *)(emb2.data), vec_dim);
float similarity(cv::Mat const &emb1, cv::Mat const &emb2) const
Compute cosine similarity between two embeddings.
virtual ~CLIP()
Virtual destructor for safe inheritance.
cv::Mat textEmbedding(std::string const &txt)
Get embedding for some text, typically as a 1x512 float matrix (depends on the CLIP model version).
cv::Mat imageEmbedding(cv::Mat const &img)
Get embedding for some RGB uint8 packed image, typically as a 1x512 float matrix.
int textEmbeddingSize() const
Get text embedding size, useful if we need to know it before getting an embedding. Returns 0 if the loaded model has no text encoder.
int imageEmbeddingSize() const
Get image embedding size, useful if we need to know it before getting an embedding. Returns 0 if the loaded model has no vision encoder.
CLIP(std::string const &modelpath)
Construct and load a model from disk.
#define LFATAL(msg)
Convenience macro for users to print out console or syslog messages, FATAL level.
#define LINFO(msg)
Convenience macro for users to print out console or syslog messages, INFO level.