Delete tool
tool/__pycache__/detector.cpython-311.pyc
DELETED (binary file, 6.36 kB)

tool/__pycache__/segmentor.cpython-311.pyc
DELETED (binary file, 5.54 kB)

tool/__pycache__/transfer_tools.cpython-311.pyc
DELETED (binary file, 3.53 kB)
tool/detector.py
DELETED
@@ -1,93 +0,0 @@
import torch
import numpy as np
import cv2
import PIL

from groundingdino.models import build_model as build_grounding_dino
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
from groundingdino.util.inference import annotate, load_image, predict
import groundingdino.datasets.transforms as T

from torchvision.ops import box_convert

class Detector:
    def __init__(self, device):
        config_file = "src/groundingdino/groundingdino/config/GroundingDINO_SwinT_OGC.py"
        grounding_dino_ckpt = './ckpt/groundingdino_swint_ogc.pth'
        args = SLConfig.fromfile(config_file)
        args.device = device
        self.device = device
        self.gd = build_grounding_dino(args)

        checkpoint = torch.load(grounding_dino_ckpt, map_location='cpu')
        log = self.gd.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
        print("Model loaded from {} \n => {}".format(grounding_dino_ckpt, log))
        self.gd.eval()

    def image_transform_grounding(self, init_image):
        transform = T.Compose([
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        image, _ = transform(init_image, None)  # 3, h, w
        return init_image, image

    def image_transform_grounding_for_vis(self, init_image):
        transform = T.Compose([
            T.RandomResize([800], max_size=1333),
        ])
        image, _ = transform(init_image, None)  # 3, h, w
        return image

    def transfer_boxes_format(self, boxes, height, width):
        # scale normalized cxcywh boxes to pixel coordinates, then convert to xyxy corners
        boxes = boxes * torch.Tensor([width, height, width, height])
        boxes = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy")

        transfered_boxes = []
        for i in range(len(boxes)):
            box = boxes[i]
            transfered_box = [[int(box[0]), int(box[1])], [int(box[2]), int(box[3])]]
            transfered_boxes.append(transfered_box)

        transfered_boxes = np.array(transfered_boxes)
        return transfered_boxes

    @torch.no_grad()
    def run_grounding(self, origin_frame, grounding_caption, box_threshold, text_threshold):
        '''
        return:
            annotated_frame: np.ndarray
            transfered_boxes: np.ndarray of shape [N, 2, 2], one [[x0, y0], [x1, y1]] pair per box
        '''
        height, width, _ = origin_frame.shape
        img_pil = PIL.Image.fromarray(origin_frame)
        re_width, re_height = img_pil.size
        _, image_tensor = self.image_transform_grounding(img_pil)
        # img_pil = self.image_transform_grounding_for_vis(img_pil)

        # run grounding
        boxes, logits, phrases = predict(self.gd, image_tensor, grounding_caption, box_threshold, text_threshold, device=self.device)
        annotated_frame = annotate(image_source=np.asarray(img_pil), boxes=boxes, logits=logits, phrases=phrases)[:, :, ::-1]
        annotated_frame = cv2.resize(annotated_frame, (width, height), interpolation=cv2.INTER_LINEAR)

        # transfer boxes to SAM format
        transfered_boxes = self.transfer_boxes_format(boxes, re_height, re_width)
        return annotated_frame, transfered_boxes

if __name__ == "__main__":
    detector = Detector("cuda")
    origin_frame = cv2.imread('./debug/point.png')
    origin_frame = cv2.cvtColor(origin_frame, cv2.COLOR_BGR2RGB)
    grounding_caption = "swan.water"
    box_threshold = 0.25
    text_threshold = 0.25

    annotated_frame, boxes = detector.run_grounding(origin_frame, grounding_caption, box_threshold, text_threshold)
    cv2.imwrite('./debug/x.png', annotated_frame)

    for i in range(len(boxes)):
        bbox = boxes[i]
        origin_frame = cv2.rectangle(origin_frame, bbox[0], bbox[1], (0, 0, 255))
    cv2.imwrite('./debug/bbox_frame.png', origin_frame)
tool/segmentor.py
DELETED
@@ -1,96 +0,0 @@
import torch
import cv2
import numpy as np
from sam.segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator

class Segmentor:
    def __init__(self, sam_args):
        """
        sam_args:
            sam_checkpoint: path of SAM checkpoint
            generator_args: args for everything_generator
            gpu_id: device
        """
        self.device = sam_args["gpu_id"]
        self.sam = sam_model_registry[sam_args["model_type"]](checkpoint=sam_args["sam_checkpoint"])
        self.sam.to(device=self.device)
        self.everything_generator = SamAutomaticMaskGenerator(model=self.sam, **sam_args['generator_args'])
        self.interactive_predictor = self.everything_generator.predictor
        self.have_embedded = False

    @torch.no_grad()
    def set_image(self, image):
        # calculate the embedding only once per frame
        if not self.have_embedded:
            self.interactive_predictor.set_image(image)
            self.have_embedded = True

    @torch.no_grad()
    def interactive_predict(self, prompts, mode, multimask=True):
        assert self.have_embedded, 'the image embedding for SAM must be set before predicting.'

        if mode == 'point':
            masks, scores, logits = self.interactive_predictor.predict(point_coords=prompts['point_coords'],
                                                                       point_labels=prompts['point_modes'],
                                                                       multimask_output=multimask)
        elif mode == 'mask':
            masks, scores, logits = self.interactive_predictor.predict(mask_input=prompts['mask_prompt'],
                                                                       multimask_output=multimask)
        elif mode == 'point_mask':
            masks, scores, logits = self.interactive_predictor.predict(point_coords=prompts['point_coords'],
                                                                       point_labels=prompts['point_modes'],
                                                                       mask_input=prompts['mask_prompt'],
                                                                       multimask_output=multimask)

        return masks, scores, logits

    @torch.no_grad()
    def segment_with_click(self, origin_frame, coords, modes, multimask=True):
        '''
        return:
            mask: one-hot
        '''
        self.set_image(origin_frame)

        # first pass: point prompts only
        prompts = {
            'point_coords': coords,
            'point_modes': modes,
        }
        masks, scores, logits = self.interactive_predict(prompts, 'point', multimask)
        mask, logit = masks[np.argmax(scores)], logits[np.argmax(scores), :, :]
        # second pass: refine with the best low-res mask logit as an extra prompt
        prompts = {
            'point_coords': coords,
            'point_modes': modes,
            'mask_prompt': logit[None, :, :]
        }
        masks, scores, logits = self.interactive_predict(prompts, 'point_mask', multimask)
        mask = masks[np.argmax(scores)]

        return mask.astype(np.uint8)

    def segment_with_box(self, origin_frame, bbox, reset_image=False):
        if reset_image:
            self.interactive_predictor.set_image(origin_frame)
        else:
            self.set_image(origin_frame)
        # coord = np.array([[int((bbox[1][0] - bbox[0][0]) / 2.), int((bbox[1][1] - bbox[0][1]) / 2)]])
        # point_label = np.array([1])

        masks, scores, logits = self.interactive_predictor.predict(
            point_coords=None,
            point_labels=None,
            box=np.array([bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]]),
            multimask_output=True
        )
        mask, logit = masks[np.argmax(scores)], logits[np.argmax(scores), :, :]

        # refine with the best low-res mask logit, as in segment_with_click
        masks, scores, logits = self.interactive_predictor.predict(
            point_coords=None,
            point_labels=None,
            box=np.array([[bbox[0][0], bbox[0][1], bbox[1][0], bbox[1][1]]]),
            mask_input=logit[None, :, :],
            multimask_output=True
        )
        mask = masks[np.argmax(scores)]

        return [mask]
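
Example (not part of the deleted file): a minimal usage sketch for Segmentor, run against the module as it was before this deletion. The checkpoint path, model type, and generator settings below are assumptions, not values taken from this repo:

import cv2
from tool.segmentor import Segmentor

sam_args = {
    "model_type": "vit_b",                            # assumed SAM variant
    "sam_checkpoint": "./ckpt/sam_vit_b_01ec64.pth",  # hypothetical path
    "gpu_id": "cuda:0",
    "generator_args": {"points_per_side": 16},        # assumed setting
}
segmentor = Segmentor(sam_args)

frame = cv2.cvtColor(cv2.imread("./debug/frame.jpg"), cv2.COLOR_BGR2RGB)
# box prompt in the same [[x0, y0], [x1, y1]] format Detector.run_grounding returns
mask = segmentor.segment_with_box(frame, [[100, 50], [400, 300]])[0]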
tool/transfer_tools.py
DELETED
@@ -1,51 +0,0 @@
import cv2
import numpy as np

def mask2bbox(mask):
    if len(np.where(mask > 0)[0]) == 0:
        print('mask is empty')
        return np.array([[0, 0], [0, 0]]).astype(np.int64)

    # project the mask onto each axis; nonzero entries mark occupied columns/rows
    x_ = np.sum(mask, axis=0)
    y_ = np.sum(mask, axis=1)

    x0 = np.min(np.nonzero(x_)[0])
    x1 = np.max(np.nonzero(x_)[0])
    y0 = np.min(np.nonzero(y_)[0])
    y1 = np.max(np.nonzero(y_)[0])

    return np.array([[x0, y0], [x1, y1]]).astype(np.int64)

def draw_outline(mask, frame):
    _, binary_mask = cv2.threshold(mask, 0, 255, cv2.THRESH_BINARY)

    contours, _ = cv2.findContours(binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    cv2.drawContours(frame, contours, -1, (0, 0, 255), 2)

    return frame

def draw_points(points, modes, frame):
    neg_points = points[np.argwhere(modes == 0)[:, 0]]
    pos_points = points[np.argwhere(modes == 1)[:, 0]]

    for i in range(len(neg_points)):
        point = neg_points[i]
        cv2.circle(frame, (point[0], point[1]), 8, (255, 80, 80), -1)

    for i in range(len(pos_points)):
        point = pos_points[i]
        cv2.circle(frame, (point[0], point[1]), 8, (0, 153, 255), -1)

    return frame

if __name__ == '__main__':
    mask = cv2.imread('./debug/mask.jpg', cv2.IMREAD_GRAYSCALE)
    frame = cv2.imread('./debug/frame.jpg')
    draw_frame = draw_outline(mask, frame)

    cv2.imwrite('./debug/outline.jpg', draw_frame)

    # bbox = mask2bbox(mask)
    # draw_0 = cv2.rectangle(mask, bbox[0], bbox[1], (0, 0, 255))
    # cv2.imwrite('./debug/rect.png', draw_0)