JeVoisBase  1.23
JeVois Smart Embedded Machine Vision Toolkit Base Modules
yolo-jevois-export.py
#!/usr/bin/env python3

# Adapted from https://github.com/ibaiGorordo/ONNX-YOLO-World-Open-Vocabulary-Object-Detection
# MIT License

from copy import deepcopy
import torch
import onnx
import os
from argparse import ArgumentParser
from ultralytics import YOLOWorld
from torchviz import make_dot
import subprocess

class ModelExporter(torch.nn.Module):
    def __init__(self, yoloModel, device='cpu'):
        super(ModelExporter, self).__init__()
        model = deepcopy(yoloModel).to(device)
        for p in model.parameters():
            p.requires_grad = False
        model.eval()
        model.float()
        model = model.fuse()

        self.model = model
        self.device = device

    def forward(self, x, txt_feats):
        return self.model.predict(x, txt_feats=txt_feats)

    def export(self, output_dir, model_name, img_width, img_height, num_classes):
        print(f"JEVOIS: exporting {model_name} to ONNX {img_width}x{img_height}-{num_classes}c")
        x = torch.randn(1, 3, img_height, img_width, requires_grad=False).to(self.device)
        txt_feats = torch.randn(1, num_classes, 512, requires_grad=False).to(self.device)
        prefix = model_name.split('-')[0] # yolov8s, yolov8l, etc

        # First export full model:
        with torch.no_grad():
            torch.onnx.export(self,
                              (x, txt_feats),
                              "tmp_fullmodel.onnx",
                              do_constant_folding=True,
                              opset_version=17,
                              verbose=False,
                              input_names=["images", "txt_feats"],
                              output_names=["output"])

        # JEVOIS: approach 1 (for 8-bit quantization): extract the text-processing portion, to be run
        # only when class names are changed. This network computes 5 tensors from the CLIP embeddings, which will be
        # multiplied with vision tensors as the model runs:
        innames = ["txt_feats"]
        outnames = ["/model.12/attn/Transpose_1_output_0",
                    "/model.15/attn/Transpose_1_output_0",
                    "/model.18/attn/Transpose_1_output_0",
                    "/model.21/attn/Transpose_1_output_0",
                    "/model.22/cv4.0/Div_output_0"]
        outpath = os.path.join(output_dir, f"{prefix}-jevois-{img_width}x{img_height}-{num_classes}c-txt.onnx")
        print(f" JEVOIS: extracting text model to {outpath}")
        onnx.utils.extract_model("tmp_fullmodel.onnx", outpath, innames, outnames)

        # JEVOIS: approach 1: Then extract the vision-processing portion (to be quantized), taking the 5 tensors from
        # text processing as inputs, in addition to the input image:
        innames = ["images"] + outnames
        outnames = ["/model.22/cv2.0/cv2.0.2/Conv_output_0", "/model.22/cv4.0/Add_output_0",
                    "/model.22/cv2.1/cv2.1.2/Conv_output_0", "/model.22/cv4.1/Add_output_0",
                    "/model.22/cv2.2/cv2.2.2/Conv_output_0", "/model.22/cv4.2/Add_output_0"]
        outpath = os.path.join(output_dir, f"{prefix}-jevois-{img_width}x{img_height}-{num_classes}c-img.onnx")
        print(f" JEVOIS: extracting image model to {outpath}")
        onnx.utils.extract_model("tmp_fullmodel.onnx", outpath, innames, outnames)

        # The Div input has variable size; fix it to 1xCx512:
        subprocess.run(["python", "-m", "onnxruntime.tools.make_dynamic_shape_fixed",
                        "--input_name", "/model.22/cv4.0/Div_output_0",
                        "--input_shape", f"1,{num_classes},512", outpath, outpath])

        # JEVOIS: approach 2 (for 16-bit quantization, slower at runtime): only truncate the model to yield split
        # outputs (6 tensors for boxes and class scores at 3 strides), taking image and CLIP embeddings as input:
        innames = ["images", "txt_feats"]
        outpath = os.path.join(output_dir, f"{prefix}-jevois-{img_width}x{img_height}-{num_classes}c.onnx")
        print(f" JEVOIS: extracting combo model to {outpath}")
        onnx.utils.extract_model("tmp_fullmodel.onnx", outpath, innames, outnames)

        os.remove("tmp_fullmodel.onnx")

def main():
    parser = ArgumentParser()
    parser.add_argument("--img_width", type=int, default=512)
    parser.add_argument("--img_height", type=int, default=288)
    parser.add_argument("--num_classes", type=int, default=-1)
    parser.add_argument("--model_name", type=str, default="yolov8s-worldv2.pt")
    parser.add_argument("--output_dir", type=str, default="")
    parser.add_argument("--device", type=str, default=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    args = parser.parse_args()
    img_width = args.img_width
    img_height = args.img_height
    num_classes = args.num_classes
    model_name = args.model_name
    output_dir = args.output_dir
    device = args.device

    if num_classes > 0:
        nclass = [num_classes]
    else:
        nclass = [1, 8, 16, 32, 64]

    for nc in nclass:
        yoloModel = YOLOWorld(model_name)
        yoloModel.set_classes(["person"] * nc)

        #print(yoloModel)

        export_model = ModelExporter(yoloModel.model, device)
        export_model.export(output_dir, model_name, img_width, img_height, nc)

if __name__ == "__main__":
    main()
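Usage note: the command below is an illustrative invocation (the weights file and resolution are the script defaults; the class count of 16 is just an example). With --num_classes left at its default of -1, the script instead exports variants for 1, 8, 16, 32, and 64 classes.

    python3 yolo-jevois-export.py --model_name yolov8s-worldv2.pt --img_width 512 --img_height 288 --num_classes 16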
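To show how the two files from approach 1 are meant to chain together, here is a minimal sanity-check sketch using onnxruntime. It assumes a 512x288, 16-class export of yolov8s-worldv2.pt (hence the hypothetical file names) and feeds random data in place of real image pixels and CLIP text embeddings; it is not part of the export script.

    import numpy as np
    import onnxruntime as ort

    # Hypothetical file names produced by a 512x288, 16-class export of yolov8s-worldv2.pt:
    txt_sess = ort.InferenceSession("yolov8s-jevois-512x288-16c-txt.onnx")
    img_sess = ort.InferenceSession("yolov8s-jevois-512x288-16c-img.onnx")

    # Text model: run only when class names change; input is 1 x num_classes x 512 CLIP
    # embeddings (random here, standing in for real ones). It yields the 5 tensors
    # listed in export():
    txt_feats = np.random.randn(1, 16, 512).astype(np.float32)
    txt_outs = txt_sess.run(None, {"txt_feats": txt_feats})

    # Vision model: runs on every frame; takes the image plus the 5 text tensors as inputs
    # (the text model's output names match the vision model's input names by construction):
    feed = {"images": np.random.randn(1, 3, 288, 512).astype(np.float32)}
    for out_info, val in zip(txt_sess.get_outputs(), txt_outs):
        feed[out_info.name] = val

    img_outs = img_sess.run(None, feed)
    print([o.shape for o in img_outs])  # 6 outputs: box and class-score tensors at 3 strides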