basedoc/create-yolo-jevois-npu-dataset_8py_source.html

#!/usr/bin/env python3


import numpy as np

import os

from PIL import Image

import random

import subprocess

import onnxruntime as ort


def create_dataset(width, height, nc, numimages):

    numtexts = 365 # number of category labels in the quantization dataset

    imgdir='Objects365/images/val'

    txtdir='Objects365/labels/val'


    txt_embeds = np.ndarray((numtexts, 512), np.float32)


    images = np.ndarray((numimages, 3, height, width), np.float32)

    txt = np.ndarray((numimages, nc, 512), np.float32)


    txt0 = np.ndarray((numimages, 4, 32, nc), np.float32)

    txt1 = np.ndarray((numimages, 2, 32, nc), np.float32)

    txt2 = np.ndarray((numimages, 4, 32, nc), np.float32)

    txt3 = np.ndarray((numimages, 8, 32, nc), np.float32)

    txt4 = np.ndarray((numimages, nc, 512), np.float32)


    # read all the text embeddings into one array, for the 365 object labels

    print("Loading all text embeddings...")

    idx = 0

    for i in range(numtexts):

        fname = f"text_vec_{i}.npy"

        if os.path.isfile(fname):

            e = np.load(fname)

            e /= np.linalg.norm(e[0,:]) # our model expects unit-norm text features

            txt_embeds[idx, :] = e

            idx += 1

        else:

            print(f"Cannot load {fname}")

            exit(1)


    # Read each image and get nc labels, in randomized order but including as many of that image's labels as possible:

    print("Processing images...")

    allimages = list(filter(lambda f: f.endswith('.jpg'), os.listdir(imgdir)))

    selected_images = random.sample(allimages, numimages)


    # load the oonx text processor

    session = ort.InferenceSession(f"yolov8s-jevois-{width}x{height}-{nc}c-txt.onnx")


    idx = 0

    for imgname in selected_images:

        fname = os.path.join(imgdir, imgname)

        if not os.path.isfile(fname):

            print(f"Cannot load {fname}")

            exit(1)


        image = Image.open(fname).resize((width, height));

        arr = np.array(image).astype(np.float32)

        arr = (arr - 0.0) / 255.0 # pre-processing. Here: mean=[0 0 0], scale=1/255 but varies by model

        arr = np.transpose(arr, (2, 0, 1))

        images[idx, :, :, :] = arr


        # Get the categories present in that image:

        labelpath = os.path.join(txtdir, imgname.replace('.jpg', '.txt'))

        x = subprocess.run("cat " + str(labelpath) + " | awk '{ print $1 }' | sort -n | uniq",

                           shell=True, capture_output=True, text=True)

        categs_str = x.stdout.splitlines()

        categs = [int(c) for c in categs_str]


        print(f"categs in {labelpath}: {categs}")


        random.shuffle(categs)

        extra = list(range(numtexts))

        random.shuffle(extra)

        categs += extra

        categs = categs[0:nc]


        print(f"selected categs in {labelpath}: {categs}")


        for i in range(nc):

            txt[idx, i, :] = txt_embeds[categs[i]]


        # Run the text processing model:

        txtin = txt[idx, :, :]

        txtin = np.expand_dims(txtin, axis=0)


        outs = session.run(None, { "txt_feats": txtin })

        txt0[idx, :, :, :] = outs[0]

        txt1[idx, :, :, :] = outs[1]

        txt2[idx, :, :, :] = outs[2]

        txt3[idx, :, :, :] = outs[3]

        txt4[idx, :, :] = outs[4]


        idx += 1


    with open(f"objects365-images-{width}x{height}-{nc}.npy", 'wb') as f: np.save(f, images)

    with open(f"objects365-texts-{width}x{height}-{nc}.npy", 'wb') as f: np.save(f, txt)

    with open(f"objects365-txt0-{width}x{height}-{nc}.npy", 'wb') as f: np.save(f, txt0)

    with open(f"objects365-txt1-{width}x{height}-{nc}.npy", 'wb') as f: np.save(f, txt1)

    with open(f"objects365-txt2-{width}x{height}-{nc}.npy", 'wb') as f: np.save(f, txt2)

    with open(f"objects365-txt3-{width}x{height}-{nc}.npy", 'wb') as f: np.save(f, txt3)

    with open(f"objects365-txt4-{width}x{height}-{nc}.npy", 'wb') as f: np.save(f, txt4)


classes=[1, 8, 16, 32, 64]


numimages = 5000 # 20000 ok with 512x288; 80000 possible but crashes NPU converter

for nc in classes:

    create_dataset(512, 288, nc, numimages)

    create_dataset(1024, 576, nc, numimages)


create-yolo-jevois-npu-dataset.create_dataset
create_dataset(width, height, nc, numimages)
Definition create-yolo-jevois-npu-dataset.py:10