|
|
from typing import Optional

import fiftyone as fo
import torchvision
import typer
from groundingdino.util.inference import load_model, load_image, predict
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
def main(
    image_directory: str = 'test_grounding_dino',
    text_prompt: str = 'bus, car',
    box_threshold: float = 0.15,
    text_threshold: float = 0.10,
    export_dataset: bool = False,
    view_dataset: bool = False,
    export_annotated_images: bool = True,
    weights_path: str = "groundingdino_swint_ogc.pth",
    config_path: str = "../../GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
    subsample: Optional[int] = None,
):
    """Run GroundingDINO zero-shot object detection over a directory of images.

    Each image is labelled with the comma-separated classes in ``text_prompt``;
    results are stored on a FiftyOne dataset and optionally viewed in the
    FiftyOne app, exported as a COCO dataset, and/or rendered to annotated
    images on disk.

    Args:
        image_directory: directory of input images.
        text_prompt: comma-separated class names to detect.
        box_threshold: minimum box confidence for a detection.
        text_threshold: minimum text-match confidence for a detection.
        export_dataset: if True, export detections as a COCO dataset.
        view_dataset: if True, launch the FiftyOne app and block until closed.
        export_annotated_images: if True, save images with drawn boxes.
        weights_path: path to the GroundingDINO checkpoint.
        config_path: path to the GroundingDINO model config.
        subsample: if given and smaller than the dataset, process only a
            random subset of this many images.
    """
    model = load_model(config_path, weights_path)

    dataset = fo.Dataset.from_images_dir(image_directory)

    # Optionally work on a random subset; clone() materializes the view so
    # the rest of the pipeline operates on a standalone dataset.
    if subsample is not None and subsample < len(dataset):
        dataset = dataset.take(subsample).clone()

    for sample in tqdm(dataset):
        image_source, image = load_image(sample.filepath)

        boxes, logits, phrases = predict(
            model=model,
            image=image,
            caption=text_prompt,
            box_threshold=box_threshold,
            text_threshold=text_threshold,
        )

        detections = []
        for box, logit, phrase in zip(boxes, logits, phrases):
            # GroundingDINO yields normalized [cx, cy, w, h] boxes; FiftyOne
            # expects relative [top-left-x, top-left-y, width, height].
            rel_box = torchvision.ops.box_convert(box, 'cxcywh', 'xywh')

            detections.append(
                fo.Detection(
                    label=phrase,
                    # Convert tensors to plain Python types so the sample
                    # serializes cleanly into the FiftyOne backend.
                    bounding_box=rel_box.tolist(),
                    confidence=float(logit),
                ))

        # Store detections in a field name of your choice
        sample["detections"] = fo.Detections(detections=detections)
        sample.save()

    # loads the voxel fiftyone UI ready for viewing the dataset.
    if view_dataset:
        session = fo.launch_app(dataset)
        session.wait()

    # exports COCO dataset ready for training
    if export_dataset:
        dataset.export(
            'coco_dataset',
            dataset_type=fo.types.COCODetectionDataset,
        )

    # saves bounding boxes plotted on the input images to disk
    if export_annotated_images:
        dataset.draw_labels(
            'images_with_bounding_boxes',
            label_fields=['detections']
        )
|
|
|
|
|
|
|
|
|
# CLI entry point: Typer turns main()'s keyword arguments into command-line
# options (e.g. --image-directory, --box-threshold).
if __name__ == '__main__':
    typer.run(main)
|