Spaces:

3dlg-hcvc
/

opdmulti-demo

Sleeping

App Files Files Community

atwang committed on Sep 27, 2023

Commit

6d737eb

1 Parent(s): 5ceacf4

local app demo is working

Browse files

Files changed (12) hide show

README.md +40 -0
app.py +42 -22
dev-requirements.txt +0 -3
examples/174-8460.png +0 -0
examples/174-8460_d.png +0 -0
examples/187-0.png +0 -0
examples/187-0_d.png +0 -0
examples/187-23040.png +0 -0
examples/187-23040_d.png +0 -0
inference.py +7 -346
requirements.txt +3 -1
visualization.py +353 -0

README.md CHANGED Viewed

@@ -11,3 +11,43 @@ license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Installation
+To set up the environment, run the following (recommended in a virtual environment):
+```
+# install base requirements
+pip install -r pre-requirements.txt
+pip install -r requirements.txt
+cd mask2former/modeling/pixel_decoder/ops
+python setup.py build install
+# Option A: running locally only
+pip install open3d==0.17.0
+# Option B: running over ssh connection / headless environment
+# in a separate folder
+git clone https://github.com/isl-org/Open3D.git
+cd Open3D/
+mkdir build && cd build
+cmake -DENABLE_HEADLESS_RENDERING=ON -DBUILD_GUI=OFF -DBUILD_WEBRTC=OFF -DUSE_SYSTEM_GLEW=OFF -DUSE_SYSTEM_GLFW=OFF ..
+make -j$(nproc)
+make install-pip-package
+# to test custom build
+cd ../examples/python/visualization/
+python headless_rendering.py
+```
+The setup with pre-requirements.txt and requirements.txt resolves the issue that certain packages need to be installed
+prior to others. By default, most additional packages should be added to requirements.txt.
+## Usage
+To start the application locally, run
+```
+gradio app.py
+```
+You can view the app on the specified port (usually 7860). To run over an ssh connection, set up port forwarding using
+`-L 7860:localhost:7860` when you create your ssh connection. Note that you will need to build Open3D with headless
+rendering enabled (Option B above) for this to work.

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ from inference import main, setup_cfg
 # internal settings
 NUM_PROCESSES = 1
-CROP = True
 SCORE_THRESHOLD = 0.8
 MAX_PARTS = 5
 ARGS = SimpleNamespace(
@@ -24,6 +24,7 @@ ARGS = SimpleNamespace(
     output=".output",
     cpu=True,
 )
 outputs = []
@@ -52,16 +53,6 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_sample
                         images[file].append(os.path.join(sub_path, image_file))
         return images
-    def get_generator(images):
-        def gen():
-            while True:
-                for im in images:
-                    time.sleep(0.025)
-                    yield im
-                time.sleep(3)
-        return gen
     # clear old predictions
     for path in os.listdir(ARGS.output):
         full_path = os.path.join(ARGS.output, path)
@@ -89,15 +80,32 @@ def predict(rgb_image: str, depth_image: str, intrinsics: np.ndarray, num_sample
     # process output
     # TODO: may want to select these in decreasing order of score
     image_files = find_images(ARGS.output)
-    output = []
     for count, part in enumerate(image_files):
         if count < MAX_PARTS:
-            # output.append(gr.update(value=get_generator([Image.open(im) for im in image_files[part]]), visible=True))
-            output.append(get_generator([Image.open(im) for im in image_files[part]]))
-    # while len(output) < MAX_PARTS:
-    #     output.append(gr.update(visible=False))
-    yield from output[0]()
 with gr.Blocks() as demo:
@@ -145,7 +153,7 @@ with gr.Blocks() as demo:
         interactive=True,
     )
     num_samples = gr.Number(
-        value=10,
         label="Number of samples",
         show_label=True,
         interactive=True,
@@ -154,16 +162,28 @@ with gr.Blocks() as demo:
         maximum=20,
     )
     submit_btn = gr.Button("Run model")
     # TODO: do we want to set a maximum limit on how many parts we render? We could also show the number of components
     # identified.
-    # images = [gr.Image(type="pil", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
-    image = gr.Image(type="pil", visible=True)
-    # TODO: maybe need to use a queue here so we don't overload the instance
     submit_btn.click(
-        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=image, api_name="run_model"
     )
 demo.queue(api_open=False)

 # internal settings
 NUM_PROCESSES = 1
+CROP = False
 SCORE_THRESHOLD = 0.8
 MAX_PARTS = 5
 ARGS = SimpleNamespace(
     output=".output",
     cpu=True,
 )
+NUM_SAMPLES = 10
 outputs = []
                         images[file].append(os.path.join(sub_path, image_file))
         return images
     # clear old predictions
     for path in os.listdir(ARGS.output):
         full_path = os.path.join(ARGS.output, path)
     # process output
     # TODO: may want to select these in decreasing order of score
     image_files = find_images(ARGS.output)
+    outputs = []
     for count, part in enumerate(image_files):
         if count < MAX_PARTS:
+            outputs.append([Image.open(im) for im in image_files[part]])
+    return [
+        *[gr.update(value=out[0], visible=True) for out in outputs],
+        *[gr.update(visible=False) for _ in range(MAX_PARTS - len(outputs))],
+    ]
+def get_trigger(idx: int, fps: int = 40, oscillate: bool = True):
+    def iter_images(*args, **kwargs):
+        if idx < len(outputs):
+            for im in outputs[idx]:
+                time.sleep(1.0 / fps)
+                yield im
+            if oscillate:
+                for im in reversed(outputs[idx]):
+                    time.sleep(1.0 / fps)
+                    yield im
+        else:
+            raise ValueError("Could not find any images to load into this module.")
+    return iter_images
 with gr.Blocks() as demo:
         interactive=True,
     )
     num_samples = gr.Number(
+        value=NUM_SAMPLES,
         label="Number of samples",
         show_label=True,
         interactive=True,
         maximum=20,
     )
+    examples = gr.Examples(
+        examples=[
+            ["examples/59-4860.png", "examples/59-4860_d.png"],
+            ["examples/174-8460.png", "examples/174-8460_d.png"],
+            ["examples/187-0.png", "examples/187-0_d.png"],
+            ["examples/187-23040.png", "examples/187-23040_d.png"],
+        ],
+        inputs=[rgb_image, depth_image],
+        api_name=False,
+        examples_per_page=2,
+    )
     submit_btn = gr.Button("Run model")
     # TODO: do we want to set a maximum limit on how many parts we render? We could also show the number of components
     # identified.
+    images = [gr.Image(type="pil", label=f"Part {idx + 1}", visible=False) for idx in range(MAX_PARTS)]
+    for idx, image_comp in enumerate(images):
+        image_comp.select(get_trigger(idx), inputs=[], outputs=image_comp, api_name=False)
     submit_btn.click(
+        fn=predict, inputs=[rgb_image, depth_image, intrinsics, num_samples], outputs=images, api_name=False
     )
 demo.queue(api_open=False)

dev-requirements.txt DELETED Viewed

@@ -1,3 +0,0 @@
-black==23.9.1
-gradio==3.44.3
-huggingface-hub==0.17.2

examples/174-8460.png ADDED Viewed

examples/174-8460_d.png ADDED Viewed

examples/187-0.png ADDED Viewed

examples/187-0_d.png ADDED Viewed

examples/187-23040.png ADDED Viewed

examples/187-23040_d.png ADDED Viewed

inference.py CHANGED Viewed

@@ -19,7 +19,6 @@ import argparse
 import logging
 import os
 import time
-from copy import deepcopy
 from typing import Any
 import imageio
@@ -34,13 +33,19 @@ from detectron2.projects.deeplab import add_deeplab_config
 from detectron2.structures import instances
 from detectron2.utils import comm
 from detectron2.utils.logger import setup_logger
-from PIL import Image, ImageChops
 from mask2former import (
     add_maskformer2_config,
     add_motionnet_config,
 )
 from utilities import prediction_to_json
 # import based on torch version. Required for model loading. Code is taken from fvcore.common.checkpoint, in order to
 # replicate model loading without the overhead of setting up an OPDTrainer
@@ -63,9 +68,7 @@ TYPE_CLASSIFICATION = {
     1: "translation",
 }
-POINT_COLOR = [1, 0, 0]  # red for demonstration
 ARROW_COLOR = [0, 1, 0]  # green
-IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
 def get_parser() -> argparse.ArgumentParser:
@@ -336,348 +339,6 @@ def predict(model: nn.Module, inp: list[dict[str, Any]]) -> list[dict[str, insta
     return out
-def generate_rotation_visualization(
-    pcd: o3d.geometry.PointCloud,
-    axis_arrow: o3d.geometry.TriangleMesh,
-    mask: np.ndarray,
-    axis_vector: np.ndarray,
-    origin: np.ndarray,
-    range_min: float,
-    range_max: float,
-    num_samples: int,
-    output_dir: str,
-) -> None:
-    """
-    Generate visualization files for a rotation motion of a part.
-    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
-    :param axis_arrow: mesh object representing axis arrow of rotation to be rendered in visualization
-    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
-    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
-    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
-    :param range_min: float representing the minimum range of motion in radians
-    :param range_max: float representing the maximum range of motion in radians
-    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
-    :param output_dir: string path to directory in which to save visualization output
-    """
-    angle_in_radians = np.linspace(range_min, range_max, num_samples)
-    angles_in_degrees = angle_in_radians * 180 / np.pi
-    for idx, angle_in_degrees in enumerate(angles_in_degrees):
-        # Make a copy of your original point cloud and arrow for each rotation
-        rotated_pcd = deepcopy(pcd)
-        rotated_arrow = deepcopy(axis_arrow)
-        angle_rad = np.radians(angle_in_degrees)
-        rotated_pcd = rotate_part(rotated_pcd, mask, axis_vector, origin, angle_rad)
-        # Create a Visualizer object for each rotation
-        vis = o3d.visualization.Visualizer()
-        vis.create_window()
-        # Add the rotated geometries
-        vis.add_geometry(rotated_pcd)
-        vis.add_geometry(rotated_arrow)
-        # Apply the additional rotation around x-axis if desired
-        angle_x = np.pi * 5.5 / 5  # 198 degrees
-        rotation_matrix = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
-        rotated_pcd.rotate(rotation_matrix, center=rotated_pcd.get_center())
-        rotated_arrow.rotate(rotation_matrix, center=rotated_pcd.get_center())
-        # Capture and save the image
-        output_filename = f"{output_dir}/{idx}.png"
-        vis.capture_screen_image(output_filename, do_render=True)
-        vis.destroy_window()
-def generate_translation_visualization(
-    pcd: o3d.geometry.PointCloud,
-    axis_arrow: o3d.geometry.TriangleMesh,
-    mask: np.ndarray,
-    end: np.ndarray,
-    range_min: float,
-    range_max: float,
-    num_samples: int,
-    output_dir: str,
-) -> None:
-    """
-    Generate visualization files for a translation motion of a part.
-    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
-    :param axis_arrow: mesh object representing axis arrow of translation to be rendered in visualization
-    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
-    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of translation
-    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of translation
-    :param range_min: float representing the minimum range of motion
-    :param range_max: float representing the maximum range of motion
-    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
-    :param output_dir: string path to directory in which to save visualization output
-    """
-    translate_distances = np.linspace(range_min, range_max, num_samples)
-    for idx, translate_distance in enumerate(translate_distances):
-        translated_pcd = deepcopy(pcd)
-        translated_arrow = deepcopy(axis_arrow)
-        translated_pcd = translate_part(translated_pcd, mask, end, translate_distance.item())
-        # Create a Visualizer object for each rotation
-        vis = o3d.visualization.Visualizer()
-        vis.create_window()
-        # Add the translated geometries
-        vis.add_geometry(translated_pcd)
-        vis.add_geometry(translated_arrow)
-        # Apply the additional rotation around x-axis if desired
-        # TODO: not sure why we need this rotation for the translation, and when it would be desired
-        angle_x = np.pi * 5.5 / 5  # 198 degrees
-        R = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
-        translated_pcd.rotate(R, center=translated_pcd.get_center())
-        translated_arrow.rotate(R, center=translated_pcd.get_center())
-        # Capture and save the image
-        output_filename = f"{output_dir}/{idx}.png"
-        vis.capture_screen_image(output_filename, do_render=True)
-        vis.destroy_window()
-def get_rotation_matrix_from_vectors(vec1: np.ndarray, vec2: np.ndarray) -> np.ndarray:
-    """
-    Find the rotation matrix that aligns vec1 to vec2
-    :param vec1: A 3d "source" vector
-    :param vec2: A 3d "destination" vector
-    :return: A transform matrix (3x3) which when applied to vec1, aligns it with vec2.
-    """
-    a, b = (vec1 / np.linalg.norm(vec1)).reshape(3), (vec2 / np.linalg.norm(vec2)).reshape(3)
-    v = np.cross(a, b)
-    c = np.dot(a, b)
-    s = np.linalg.norm(v)
-    kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
-    rotation_matrix = np.eye(3) + kmat + kmat.dot(kmat) * ((1 - c) / (s**2))
-    return rotation_matrix
-def draw_line(start_point: np.ndarray, end_point: np.ndarray) -> o3d.geometry.TriangleMesh:
-    """
-    Generate 3D mesh representing axis from start_point to end_point.
-    :param start_point: np.ndarray of dimensions (3, ) representing the start point of the axis
-    :param end_point: np.ndarray of dimensions (3, ) representing the end point of the axis
-    :return: mesh object representing axis from start to end
-    """
-    # Compute direction vector and normalize it
-    direction_vector = end_point - start_point
-    normalized_vector = direction_vector / np.linalg.norm(direction_vector)
-    # Compute the rotation matrix to align the Z-axis with the desired direction
-    target_vector = np.array([0, 0, 1])
-    rot_mat = get_rotation_matrix_from_vectors(target_vector, normalized_vector)
-    # Create the cylinder (shaft of the arrow)
-    cylinder_length = 0.9  # 90% of the total arrow length, you can adjust as needed
-    cylinder_radius = 0.01  # Adjust the thickness of the arrow shaft
-    cylinder = o3d.geometry.TriangleMesh.create_cylinder(radius=cylinder_radius, height=cylinder_length)
-    # Move base of cylinder to origin, rotate, then translate to start_point
-    cylinder.translate([0, 0, 0])
-    cylinder.rotate(rot_mat, center=[0, 0, 0])
-    cylinder.translate(start_point)
-    # Create the cone (head of the arrow)
-    cone_height = 0.1  # 10% of the total arrow length, adjust as needed
-    cone_radius = 0.03  # Adjust the size of the arrowhead
-    cone = o3d.geometry.TriangleMesh.create_cone(radius=cone_radius, height=cone_height)
-    # Move base of cone to origin, rotate, then translate to end of cylinder
-    cone.translate([-0, 0, 0])
-    cone.rotate(rot_mat, center=[0, 0, 0])
-    cone.translate(start_point + normalized_vector * 0.4)
-    arrow = cylinder + cone
-    return arrow
-def rotate_part(
-    pcd: o3d.geometry.PointCloud, mask: np.ndarray, axis_vector: np.ndarray, origin: np.ndarray, angle_rad: float
-) -> o3d.geometry.PointCloud:
-    """
-    Generate rotated point cloud of mask based on provided angle around axis.
-    :param pcd: point cloud object representing points of image
-    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
-    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
-    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
-    :param angle_rad: angle in radians to rotate mask part
-    :return: point cloud object after rotation of masked part
-    """
-    # Get the coordinates of the point cloud as a numpy array
-    points_np = np.asarray(pcd.points)
-    # Convert point cloud colors to numpy array for easier manipulation
-    colors_np = np.asarray(pcd.colors)
-    # Create skew-symmetric matrix from end
-    K = np.array(
-        [
-            [0, -axis_vector[2], axis_vector[1]],
-            [axis_vector[2], 0, -axis_vector[0]],
-            [-axis_vector[1], axis_vector[0], 0],
-        ]
-    )
-    # Compute rotation matrix using Rodrigues' formula
-    R = np.eye(3) + np.sin(angle_rad) * K + (1 - np.cos(angle_rad)) * np.dot(K, K)
-    # Iterate over the mask and rotate the points corresponding to the object pixels
-    for i in range(mask.shape[0]):
-        for j in range(mask.shape[1]):
-            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
-                point_index = i * mask.shape[1] + j
-                # Translate the point such that the rotation origin is at the world origin
-                translated_point = points_np[point_index] - origin
-                # Rotate the translated point
-                rotated_point = np.dot(R, translated_point)
-                # Translate the point back
-                points_np[point_index] = rotated_point + origin
-                colors_np[point_index] = POINT_COLOR
-    # Update the point cloud's coordinates
-    pcd.points = o3d.utility.Vector3dVector(points_np)
-    # Update point cloud colors
-    pcd.colors = o3d.utility.Vector3dVector(colors_np)
-    return pcd
-def translate_part(pcd, mask, axis_vector, distance):
-    """
-    Generate translated point cloud of mask based on provided angle around axis.
-    :param pcd: point cloud object representing points of image
-    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
-    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of translation
-    :param distance: distance within coordinate system to translate mask part
-    :return: point cloud object after translation of masked part
-    """
-    normalized_vector = axis_vector / np.linalg.norm(axis_vector)
-    translation_vector = normalized_vector * distance
-    # Convert point cloud colors to numpy array for easier manipulation
-    colors_np = np.asarray(pcd.colors)
-    # Get the coordinates of the point cloud as a numpy array
-    points_np = np.asarray(pcd.points)
-    # Iterate over the mask and assign the color to the points corresponding to the object pixels
-    for i in range(mask.shape[0]):
-        for j in range(mask.shape[1]):
-            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
-                point_index = i * mask.shape[1] + j
-                colors_np[point_index] = POINT_COLOR
-                points_np[point_index] += translation_vector
-    # Update point cloud colors
-    pcd.colors = o3d.utility.Vector3dVector(colors_np)
-    # Update the point cloud's coordinates
-    pcd.points = o3d.utility.Vector3dVector(points_np)
-    return pcd
-def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
-    """
-    Trim white spaces from all images in the given path and save new images to folder.
-    :param images_path: local path to folder containing all images. Images must have the extension ".png", ".jpg", or
-    ".jpeg".
-    :param save_path: local path to folder in which to save trimmed images
-    :param identical: if True, will apply same crop to all images, else each image will have its whitespace trimmed
-    independently. Note that in the latter case, each image may have a slightly different size.
-    """
-    def get_trim(im):
-        """Trim whitespace from an image and return the cropped image."""
-        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
-        diff = ImageChops.difference(im, bg)
-        diff = ImageChops.add(diff, diff, 2.0, -100)
-        bbox = diff.getbbox()
-        return bbox
-    if identical:  #
-        images = []
-        optimal_box = None
-        # load all images
-        for image_file in sorted(os.listdir(images_path)):
-            if image_file.endswith(IMAGE_EXTENSIONS):
-                image_path = os.path.join(images_path, image_file)
-                images.append(Image.open(image_path))
-        # find optimal box size
-        for im in images:
-            bbox = get_trim(im)
-            if bbox is None:
-                bbox = (0, 0, im.size[0], im.size[1])  # bound entire image
-            if optimal_box is None:
-                optimal_box = bbox
-            else:
-                optimal_box = (
-                    min(optimal_box[0], bbox[0]),
-                    min(optimal_box[1], bbox[1]),
-                    max(optimal_box[2], bbox[2]),
-                    max(optimal_box[3], bbox[3]),
-                )
-        # apply cropping, if optimal box was found
-        for idx, im in enumerate(images):
-            im.crop(optimal_box)
-            im.save(os.path.join(save_path, f"{idx}.png"))
-            im.close()
-    else:  # trim each image separately
-        for image_file in os.listdir(images_path):
-            if image_file.endswith(IMAGE_EXTENSIONS):
-                image_path = os.path.join(images_path, image_file)
-                with Image.open(image_path) as im:
-                    bbox = get_trim(im)
-                    trimmed = im.crop(bbox) if bbox else im
-                    trimmed.save(os.path.join(save_path, image_file))
-def create_gif(image_folder_path: str, num_samples: int, gif_filename: str = "output.gif") -> None:
-    """
-    Create gif out of folder of images and save to file.
-    :param image_folder_path: path to folder containing images (non-recursive). Assumes images are named as {i}.png for
-    each of i from 0 to num_samples.
-    :param num_samples: number of sampled images to compile into gif.
-    :param gif_filename: filename for gif, defaults to "output.gif"
-    """
-    # Generate a list of image filenames (assuming the images are saved as 0.png, 1.png, etc.)
-    image_files = [f"{image_folder_path}/{i}.png" for i in range(num_samples)]
-    # Read the images using imageio
-    images = [imageio.imread(image_file) for image_file in image_files]
-    assert all(
-        images[0].shape == im.shape for im in images
-    ), f"Found some images with a different shape: {[im.shape for im in images]}"
-    # Save images as a gif
-    gif_output_path = f"{image_folder_path}/{gif_filename}"
-    imageio.mimsave(gif_output_path, images, duration=0.1)
-    return
 def main(
     cfg: CfgNode,
     rgb_image: str,

 import logging
 import os
 import time
 from typing import Any
 import imageio
 from detectron2.structures import instances
 from detectron2.utils import comm
 from detectron2.utils.logger import setup_logger
 from mask2former import (
     add_maskformer2_config,
     add_motionnet_config,
 )
 from utilities import prediction_to_json
+from visualization import (
+    draw_line,
+    generate_rotation_visualization,
+    generate_translation_visualization,
+    batch_trim,
+    create_gif,
+)
 # import based on torch version. Required for model loading. Code is taken from fvcore.common.checkpoint, in order to
 # replicate model loading without the overhead of setting up an OPDTrainer
     1: "translation",
 }
 ARROW_COLOR = [0, 1, 0]  # green
 def get_parser() -> argparse.ArgumentParser:
     return out
 def main(
     cfg: CfgNode,
     rgb_image: str,

requirements.txt CHANGED Viewed

@@ -1,6 +1,5 @@
 h5py==3.9.0
 imageio==2.31.3
-open3d==0.17.0
 opencv-python==4.8.0.76
 pandas==2.1.0
 pycocotools==2.0.7
@@ -8,5 +7,8 @@ scikit-image==0.21.0
 scikit-learn==1.3.0
 scipy==1.11.2
 timm==0.9.7
 detectron2 @ git+https://github.com/facebookresearch/detectron2.git@fc9c33b1f6e5d4c37bbb46dde19af41afc1ddb2a
 -e mask2former/modeling/pixel_decoder/ops/

 h5py==3.9.0
 imageio==2.31.3
 opencv-python==4.8.0.76
 pandas==2.1.0
 pycocotools==2.0.7
 scikit-learn==1.3.0
 scipy==1.11.2
 timm==0.9.7
+black==23.9.1
+gradio==3.44.3
+huggingface-hub==0.17.2
 detectron2 @ git+https://github.com/facebookresearch/detectron2.git@fc9c33b1f6e5d4c37bbb46dde19af41afc1ddb2a
 -e mask2former/modeling/pixel_decoder/ops/

visualization.py ADDED Viewed

	@@ -0,0 +1,353 @@

+import os
+from copy import deepcopy
+import imageio
+import open3d as o3d
+import numpy as np
+from PIL import Image, ImageChops
+POINT_COLOR = [1, 0, 0]  # red for demonstration
+ARROW_COLOR = [0, 1, 0]  # green
+IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")
+def generate_rotation_visualization(
+    pcd: o3d.geometry.PointCloud,
+    axis_arrow: o3d.geometry.TriangleMesh,
+    mask: np.ndarray,
+    axis_vector: np.ndarray,
+    origin: np.ndarray,
+    range_min: float,
+    range_max: float,
+    num_samples: int,
+    output_dir: str,
+) -> None:
+    """
+    Generate visualization files for a rotation motion of a part.
+    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
+    :param axis_arrow: mesh object representing axis arrow of rotation to be rendered in visualization
+    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
+    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
+    :param range_min: float representing the minimum range of motion in radians
+    :param range_max: float representing the maximum range of motion in radians
+    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
+    :param output_dir: string path to directory in which to save visualization output
+    """
+    angle_in_radians = np.linspace(range_min, range_max, num_samples)
+    angles_in_degrees = angle_in_radians * 180 / np.pi
+    for idx, angle_in_degrees in enumerate(angles_in_degrees):
+        # Make a copy of your original point cloud and arrow for each rotation
+        rotated_pcd = deepcopy(pcd)
+        rotated_arrow = deepcopy(axis_arrow)
+        angle_rad = np.radians(angle_in_degrees)
+        rotated_pcd = rotate_part(rotated_pcd, mask, axis_vector, origin, angle_rad)
+        # Create a Visualizer object for each rotation
+        vis = o3d.visualization.Visualizer()
+        vis.create_window(visible=False)
+        # Add the rotated geometries
+        vis.add_geometry(rotated_pcd)
+        vis.add_geometry(rotated_arrow)
+        # Apply the additional rotation around x-axis if desired
+        angle_x = np.pi * 5.5 / 5  # 198 degrees
+        rotation_matrix = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
+        rotated_pcd.rotate(rotation_matrix, center=rotated_pcd.get_center())
+        rotated_arrow.rotate(rotation_matrix, center=rotated_pcd.get_center())
+        # Capture and save the image
+        output_filename = f"{output_dir}/{idx}.png"
+        vis.capture_screen_image(output_filename, do_render=True)
+        vis.destroy_window()
+def generate_translation_visualization(
+    pcd: o3d.geometry.PointCloud,
+    axis_arrow: o3d.geometry.TriangleMesh,
+    mask: np.ndarray,
+    end: np.ndarray,
+    range_min: float,
+    range_max: float,
+    num_samples: int,
+    output_dir: str,
+) -> None:
+    """
+    Generate visualization files for a translation motion of a part.
+    :param pcd: point cloud object representing 2D image input (RGBD) as a point cloud
+    :param axis_arrow: mesh object representing axis arrow of translation to be rendered in visualization
+    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
+    :param end: np.array of dimensions (3, ) representing the vector of the axis of translation; it is passed to
+    translate_part as the axis vector along which the masked part is moved
+    :param range_min: float representing the minimum range of motion
+    :param range_max: float representing the maximum range of motion
+    :param num_samples: number of sample states to visualize in between range_min and range_max of motion
+    :param output_dir: string path to directory in which to save visualization output
+    """
+    translate_distances = np.linspace(range_min, range_max, num_samples)
+    for idx, translate_distance in enumerate(translate_distances):
+        translated_pcd = deepcopy(pcd)
+        translated_arrow = deepcopy(axis_arrow)
+        translated_pcd = translate_part(translated_pcd, mask, end, translate_distance.item())
+        # Create a Visualizer object for each translation
+        vis = o3d.visualization.Visualizer()
+        vis.create_window(visible=False)
+        # Add the translated geometries
+        vis.add_geometry(translated_pcd)
+        vis.add_geometry(translated_arrow)
+        # Apply the additional rotation around x-axis if desired
+        # TODO: not sure why we need this rotation for the translation, and when it would be desired
+        angle_x = np.pi * 5.5 / 5  # 198 degrees
+        R = o3d.geometry.get_rotation_matrix_from_axis_angle(np.asarray([1, 0, 0]) * angle_x)
+        translated_pcd.rotate(R, center=translated_pcd.get_center())
+        translated_arrow.rotate(R, center=translated_pcd.get_center())
+        # Capture and save the image
+        output_filename = f"{output_dir}/{idx}.png"
+        vis.capture_screen_image(output_filename, do_render=True)
+        vis.destroy_window()
+def get_rotation_matrix_from_vectors(vec1: np.ndarray, vec2: np.ndarray) -> np.ndarray:
+    """
+    Find the rotation matrix that aligns vec1 to vec2
+    :param vec1: A 3d "source" vector
+    :param vec2: A 3d "destination" vector
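+    :return: A transform matrix (3x3) which when applied to vec1, aligns it with vec2. NOTE: if vec1 and vec2 are
+    parallel or anti-parallel, their cross product is zero, so s == 0 and the formula below divides by zero.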
+    """
+    a, b = (vec1 / np.linalg.norm(vec1)).reshape(3), (vec2 / np.linalg.norm(vec2)).reshape(3)
+    v = np.cross(a, b)
+    c = np.dot(a, b)
+    s = np.linalg.norm(v)
+    kmat = np.array([[0, -v[2], v[1]], [v[2], 0, -v[0]], [-v[1], v[0], 0]])
+    rotation_matrix = np.eye(3) + kmat + kmat.dot(kmat) * ((1 - c) / (s**2))
+    return rotation_matrix
+def draw_line(start_point: np.ndarray, end_point: np.ndarray) -> o3d.geometry.TriangleMesh:
+    """
+    Generate 3D mesh representing axis from start_point to end_point.
+    :param start_point: np.ndarray of dimensions (3, ) representing the start point of the axis
+    :param end_point: np.ndarray of dimensions (3, ) representing the end point of the axis
+    :return: mesh object representing axis from start to end
+    """
+    # Compute direction vector and normalize it
+    direction_vector = end_point - start_point
+    normalized_vector = direction_vector / np.linalg.norm(direction_vector)
+    # Compute the rotation matrix to align the Z-axis with the desired direction
+    target_vector = np.array([0, 0, 1])
+    rot_mat = get_rotation_matrix_from_vectors(target_vector, normalized_vector)
+    # Create the cylinder (shaft of the arrow)
+    cylinder_length = 0.9  # 90% of the total arrow length, you can adjust as needed
+    cylinder_radius = 0.01  # Adjust the thickness of the arrow shaft
+    cylinder = o3d.geometry.TriangleMesh.create_cylinder(radius=cylinder_radius, height=cylinder_length)
+    # Move base of cylinder to origin, rotate, then translate to start_point
+    cylinder.translate([0, 0, 0])
+    cylinder.rotate(rot_mat, center=[0, 0, 0])
+    cylinder.translate(start_point)
+    # Create the cone (head of the arrow)
+    cone_height = 0.1  # 10% of the total arrow length, adjust as needed
+    cone_radius = 0.03  # Adjust the size of the arrowhead
+    cone = o3d.geometry.TriangleMesh.create_cone(radius=cone_radius, height=cone_height)
+    # Move base of cone to origin, rotate, then translate to end of cylinder
+    cone.translate([-0, 0, 0])
+    cone.rotate(rot_mat, center=[0, 0, 0])
+    cone.translate(start_point + normalized_vector * 0.4)
+    arrow = cylinder + cone
+    return arrow
+def rotate_part(
+    pcd: o3d.geometry.PointCloud, mask: np.ndarray, axis_vector: np.ndarray, origin: np.ndarray, angle_rad: float
+) -> o3d.geometry.PointCloud:
+    """
+    Generate rotated point cloud of mask based on provided angle around axis.
+    :param pcd: point cloud object representing points of image
+    :param mask: mask np.array of dimensions (height, width) representing the part to be rotated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of rotation
+    :param origin: np.array of dimensions (3, ) representing the origin point of the axis of rotation
+    :param angle_rad: angle in radians to rotate mask part
+    :return: point cloud object after rotation of masked part
+    """
+    # Get the coordinates of the point cloud as a numpy array
+    points_np = np.asarray(pcd.points)
+    # Convert point cloud colors to numpy array for easier manipulation
+    colors_np = np.asarray(pcd.colors)
+    # Create skew-symmetric matrix from end
+    K = np.array(
+        [
+            [0, -axis_vector[2], axis_vector[1]],
+            [axis_vector[2], 0, -axis_vector[0]],
+            [-axis_vector[1], axis_vector[0], 0],
+        ]
+    )
+    # Compute rotation matrix using Rodrigues' formula
+    R = np.eye(3) + np.sin(angle_rad) * K + (1 - np.cos(angle_rad)) * np.dot(K, K)
+    # Iterate over the mask and rotate the points corresponding to the object pixels
+    for i in range(mask.shape[0]):
+        for j in range(mask.shape[1]):
+            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
+                point_index = i * mask.shape[1] + j
+                # Translate the point such that the rotation origin is at the world origin
+                translated_point = points_np[point_index] - origin
+                # Rotate the translated point
+                rotated_point = np.dot(R, translated_point)
+                # Translate the point back
+                points_np[point_index] = rotated_point + origin
+                colors_np[point_index] = POINT_COLOR
+    # Update the point cloud's coordinates
+    pcd.points = o3d.utility.Vector3dVector(points_np)
+    # Update point cloud colors
+    pcd.colors = o3d.utility.Vector3dVector(colors_np)
+    return pcd
+def translate_part(pcd, mask, axis_vector, distance):
+    """
+    Generate translated point cloud of mask based on provided angle around axis.
+    :param pcd: point cloud object representing points of image
+    :param mask: mask np.array of dimensions (height, width) representing the part to be translated in the image
+    :param axis_vector: np.array of dimensions (3, ) representing the vector of the axis of translation
+    :param distance: distance within coordinate system to translate mask part
+    :return: point cloud object after translation of masked part
+    """
+    normalized_vector = axis_vector / np.linalg.norm(axis_vector)
+    translation_vector = normalized_vector * distance
+    # Convert point cloud colors to numpy array for easier manipulation
+    colors_np = np.asarray(pcd.colors)
+    # Get the coordinates of the point cloud as a numpy array
+    points_np = np.asarray(pcd.points)
+    # Iterate over the mask and assign the color to the points corresponding to the object pixels
+    for i in range(mask.shape[0]):
+        for j in range(mask.shape[1]):
+            if mask[i, j] > 0:  # This condition checks if the pixel belongs to the object
+                point_index = i * mask.shape[1] + j
+                colors_np[point_index] = POINT_COLOR
+                points_np[point_index] += translation_vector
+    # Update point cloud colors
+    pcd.colors = o3d.utility.Vector3dVector(colors_np)
+    # Update the point cloud's coordinates
+    pcd.points = o3d.utility.Vector3dVector(points_np)
+    return pcd
+def batch_trim(images_path: str, save_path: str, identical: bool = False) -> None:
+    """
+    Trim white spaces from all images in the given path and save new images to folder.
+    :param images_path: local path to folder containing all images. Images must have the extension ".png", ".jpg", or
+    ".jpeg".
+    :param save_path: local path to folder in which to save trimmed images
+    :param identical: if True, will apply same crop to all images, else each image will have its whitespace trimmed
+    independently. Note that in the latter case, each image may have a slightly different size.
+    """
+    def get_trim(im):
+        """Trim whitespace from an image and return the cropped image."""
+        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
+        diff = ImageChops.difference(im, bg)
+        diff = ImageChops.add(diff, diff, 2.0, -100)
+        bbox = diff.getbbox()
+        return bbox
+    if identical:  #
+        images = []
+        optimal_box = None
+        # load all images
+        for image_file in sorted(os.listdir(images_path)):
+            if image_file.endswith(IMAGE_EXTENSIONS):
+                image_path = os.path.join(images_path, image_file)
+                images.append(Image.open(image_path))
+        # find optimal box size
+        for im in images:
+            bbox = get_trim(im)
+            if bbox is None:
+                bbox = (0, 0, im.size[0], im.size[1])  # bound entire image
+            if optimal_box is None:
+                optimal_box = bbox
+            else:
+                optimal_box = (
+                    min(optimal_box[0], bbox[0]),
+                    min(optimal_box[1], bbox[1]),
+                    max(optimal_box[2], bbox[2]),
+                    max(optimal_box[3], bbox[3]),
+                )
+        # apply cropping, if optimal box was found
+        for idx, im in enumerate(images):
+            im.crop(optimal_box)
+            im.save(os.path.join(save_path, f"{idx}.png"))
+            im.close()
+    else:  # trim each image separately
+        for image_file in os.listdir(images_path):
+            if image_file.endswith(IMAGE_EXTENSIONS):
+                image_path = os.path.join(images_path, image_file)
+                with Image.open(image_path) as im:
+                    bbox = get_trim(im)
+                    trimmed = im.crop(bbox) if bbox else im
+                    trimmed.save(os.path.join(save_path, image_file))
+def create_gif(image_folder_path: str, num_samples: int, gif_filename: str = "output.gif") -> None:
+    """
+    Create gif out of folder of images and save to file.
+    :param image_folder_path: path to folder containing images (non-recursive). Assumes images are named as {i}.png for
+    each of i from 0 to num_samples.
+    :param num_samples: number of sampled images to compile into gif.
+    :param gif_filename: filename for gif, defaults to "output.gif"
+    """
+    # Generate a list of image filenames (assuming the images are saved as 0.png, 1.png, etc.)
+    image_files = [f"{image_folder_path}/{i}.png" for i in range(num_samples)]
+    # Read the images using imageio
+    images = [imageio.imread(image_file) for image_file in image_files]
+    assert all(
+        images[0].shape == im.shape for im in images
+    ), f"Found some images with a different shape: {[im.shape for im in images]}"
+    # Save images as a gif
+    gif_output_path = f"{image_folder_path}/{gif_filename}"
+    imageio.mimsave(gif_output_path, images, duration=0.1)
+    return