Add xyz and ipd dataset for BOP robotics track #165

Draft · wants to merge 5 commits into base: master
Changes from 4 commits
2 changes: 1 addition & 1 deletion bop_toolkit_lib/config.py
@@ -23,7 +23,7 @@
######## Extended ########

# Folder for outputs (e.g. visualizations).
-output_path = r"/path/to/output/folder"
+output_path = r"/tmp"

# For offscreen C++ rendering: Path to the build folder of bop_renderer (github.com/thodan/bop_renderer).
bop_renderer_path = r"/path/to/bop_renderer/build"
126 changes: 116 additions & 10 deletions bop_toolkit_lib/dataset_params.py
@@ -89,6 +89,8 @@ def get_model_params(datasets_path, dataset_name, model_type=None):
"hopev2": list(range(1, 29)),
"hot3d": list(range(1, 34)),
"handal": list(range(1, 41)),
"ipd": list(range(0,21)),
"xyz": list(range(1,18))
}[dataset_name]

# IDs of objects with ambiguous views evaluated using the ADI pose error
@@ -110,6 +112,8 @@ def get_model_params(datasets_path, dataset_name, model_type=None):
"hopev2": [],
"hot3d": [1, 2, 3, 5, 22, 24, 25, 29, 30, 32],
"handal": [26, 35, 36, 37, 38, 39, 40],
"ipd": [],
"xyz": []
}[dataset_name]
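
For reference, the new ID ranges imply the following model filenames under the standard BOP "models" layout; note that IPD object IDs start at 0, unlike most BOP datasets. A minimal sketch (filenames are illustrative, assuming the usual obj_{obj_id:06d}.ply convention):

    # Minimal sketch, assuming the standard BOP models layout.
    ipd_obj_ids = list(range(0, 21))  # 21 objects, IDs 0-20
    xyz_obj_ids = list(range(1, 18))  # 17 objects, IDs 1-17
    for obj_id in ipd_obj_ids[:2]:
        print("obj_{:06d}.ply".format(obj_id))  # obj_000000.ply, obj_000001.ply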

# T-LESS includes two types of object models, CAD and reconstructed.
@@ -419,7 +423,7 @@ def get_split_params(datasets_path, dataset_name, split, split_type=None):
p["aria_eval_modality"] = "rgb"
def hot3d_eval_modality(scene_id):
if scene_id in p["test_quest3_scene_ids"] or scene_id in p["train_quest3_scene_ids"]:
return p["quest3_eval_modality"]
return p["aria_eval_modality"]
elif scene_id in p["test_aria_scene_ids"] or scene_id in p["train_aria_scene_ids"]:
return p["aria_eval_modality"]
else:
@@ -433,6 +437,95 @@ def hot3d_eval_modality(scene_id):
"gray2": "jpg",
}

if split == "test":
p["depth_range"] = None # Not calculated yet.
p["azimuth_range"] = None # Not calculated yet.
p["elev_range"] = None # Not calculated yet.

supported_error_types = ["ad", "add", "adi", "mssd", "mspd"]
elif dataset_name == "ipd":
modalities_have_separate_annotations = True
p["im_modalities"] = ["rgb_photoneo", "depth_photoneo"]
p["test_scene_ids"] = list(range(0,1))
# p["test_aria_scene_ids"] = list(range(3365, 3832))
p["scene_ids"] = {
"test": p["test_scene_ids"], # test_quest3 + test_aria
"train": p["test_scene_ids"], # train_quest3 + train_aria
"train_pbr": p["test_scene_ids"], # train_quest3 + train_aria
}[split]
# p["im_size"] = (2400, 2400)
# p["im_size"] = (1936, 1216)

p["photoneo_im_size"] = (2064, 1544)
p["im_size"] = p["photoneo_im_size"]



def ipd_eval_modality(scene_id):
return "rgb_photoneo"

p["eval_modality"] = ipd_eval_modality

exts = {
"rgb_photoneo": ".png",
"depth_photoneo": ".png",
}


if split == "test":
p["depth_range"] = None # Not calculated yet.
p["azimuth_range"] = None # Not calculated yet.
p["elev_range"] = None # Not calculated yet.

supported_error_types = ["ad", "add", "adi", "mssd", "mspd"]

elif dataset_name == "xyz":
modalities_have_separate_annotations = True
p["im_modalities"] = ["gray_photoneo", "depth_photoneo", "gray_xyz", "depth_xyz", "rgb_realsense", "depth_realsense"]
p["test_scene_ids"] = list(range(1,87))
# p["test_aria_scene_ids"] = list(range(3365, 3832))
p["scene_ids"] = {
"test": p["test_scene_ids"], # test_quest3 + test_aria
"train": p["test_scene_ids"], # train_quest3 + train_aria
"train_pbr": list(range(50)), # train_quest3 + train_aria
}[split]

# These are probably mixed up in the real data!
p["photoneo_im_size"] = (1440, 1080)
p["realsense_im_size"] = (1280, 720)
p["xyz_im_size"] = (2064, 1544)
# pbr im size
p["im_size"] = p["photoneo_im_size"]

def xyz_eval_modality(scene_id):
return "gray_xyz"

p["eval_modality"] = xyz_eval_modality

if "pbr" == split_type:
# The PBR data is in classical BOP format without sensor names.
p["eval_modality"] = None
modalities_have_separate_annotations = False
# def hot3d_eval_modality(scene_id):
# if scene_id in p["test_quest3_scene_ids"] or scene_id in p["train_quest3_scene_ids"]:
# return p["quest3_eval_modality"]
# elif scene_id in p["test_aria_scene_ids"] or scene_id in p["train_aria_scene_ids"]:
# return p["aria_eval_modality"]
# else:
# raise ValueError("scene_id {} not part of hot3d valid scenes".format(scene_id))

# p["eval_modality"] = hot3d_eval_modality

exts = {
"gray_photoneo": ".png",
"depth_photoneo": ".png",
"gray_xyz": ".png",
"depth_xyz": ".png",
"rgb_realsense": ".png",
"depth_realsense": ".png",
}
rgb_ext = ".png"

if split == "test":
p["depth_range"] = None # Not calculated yet.
p["azimuth_range"] = None # Not calculated yet.
@@ -500,7 +593,14 @@ def hot3d_eval_modality(scene_id):

else:
    assert exts is not None, "Need to set 'exts' for dataset {}".format(dataset_name)
    present_scene_id = get_present_scene_ids(p)[0]
    for moda in p["im_modalities"]:
        sensor_moda = moda
        if not os.path.exists(join(
            split_path, "{:06d}".format(present_scene_id), "scene_gt_{}.json".format(moda)
        )):
            # If the modalities of a sensor have aligned extrinsics/intrinsics,
            # their annotations are combined in one file named after the sensor
            # only (e.g. "scene_gt_photoneo.json").
            sensor_moda = moda[(moda.find("_") + 1):]
p.update(
{
# Path template to modality image.
@@ -509,33 +609,34 @@
),
# Path template to a file with per-image camera parameters.
"scene_camera_{}_tpath".format(moda): join(
split_path, "{scene_id:06d}", "scene_camera_{}.json".format(moda)
split_path, "{scene_id:06d}", "scene_camera_{}.json".format(sensor_moda)
),
# Path template to a file with GT annotations.
"scene_gt_{}_tpath".format(moda): join(
-    split_path, "{scene_id:06d}", "scene_gt_{}.json".format(moda)
+    split_path, "{scene_id:06d}", "scene_gt_{}.json".format(sensor_moda)
),
# Path template to a file with meta information about the GT annotations.
"scene_gt_info_{}_tpath".format(moda): join(
-    split_path, "{scene_id:06d}", "scene_gt_info_{}.json".format(moda)
+    split_path, "{scene_id:06d}", "scene_gt_info_{}.json".format(sensor_moda)
),
# Path template to a file with the coco GT annotations.
"scene_gt_coco_{}_tpath".format(moda): join(
-    split_path, "{scene_id:06d}", "scene_gt_coco_{}.json".format(moda)
+    split_path, "{scene_id:06d}", "scene_gt_coco_{}.json".format(sensor_moda)
),
# Path template to a mask of the full object silhouette.
"mask_{}_tpath".format(moda): join(
-    split_path, "{scene_id:06d}", "mask_{}".format(moda), "{im_id:06d}_{gt_id:06d}.png"
+    split_path, "{scene_id:06d}", "mask_{}".format(sensor_moda), "{im_id:06d}_{gt_id:06d}.png"
),
# Path template to a mask of the visible part of an object silhouette.
"mask_visib_{}_tpath".format(moda): join(
split_path,
"{scene_id:06d}",
"mask_visib_{}".format(moda),
"mask_visib_{}".format(sensor_moda),
"{im_id:06d}_{gt_id:06d}.png",
),
}
)

return p
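
The sensor_moda fallback above keys purely off which annotation files exist on disk. A standalone sketch of the same check, with illustrative paths and file names:

    import os

    def resolve_sensor_moda(scene_dir, moda):
        # Mirrors the fallback in get_split_params: prefer the per-modality
        # annotation file (e.g. scene_gt_rgb_photoneo.json); if it is absent,
        # the modalities share aligned extrinsics/intrinsics and are combined
        # per sensor (e.g. scene_gt_photoneo.json).
        if os.path.exists(os.path.join(scene_dir, "scene_gt_{}.json".format(moda))):
            return moda
        return moda[moda.find("_") + 1:]

    # resolve_sensor_moda("/data/ipd/test/000000", "rgb_photoneo")
    # -> "rgb_photoneo" if scene_gt_rgb_photoneo.json exists, else "photoneo"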

@@ -559,11 +660,13 @@ def scene_tpaths_keys(eval_modality, scene_id=None):

tpath_keys = [
"scene_gt_tpath", "scene_gt_info_tpath", "scene_camera_tpath",
"scene_gt_coco_tpath", "mask_tpath", "mask_visib_tpath"
"scene_gt_coco_tpath", "mask_tpath", "mask_visib_tpath", "rgb_tpath",
"gray_tpath", "depth_tpath"
]
tpath_keys_multi = [
"scene_gt_{}_tpath", "scene_gt_info_{}_tpath", "scene_camera_{}_tpath",
"scene_gt_coco_{}_tpath", "mask_{}_tpath", "mask_visib_{}_tpath"
"scene_gt_coco_{}_tpath", "mask_{}_tpath", "mask_visib_{}_tpath", "{}_tpath",
"{}_tpath", "depth_{}_tpath"
]

assert len(tpath_keys) == len(tpath_keys_multi)
@@ -580,7 +683,10 @@ def scene_tpaths_keys(eval_modality, scene_id=None):
tpath_keys_dic[key] = key_multi.format(eval_modality[scene_id])
else:
raise ValueError("eval_modality type not supported, either None, str, callable or dictionary")

+    # TODO: Find a nicer solution, e.g. split modality and sensor throughout the bop toolkit.
+    # Strip the color modality from the resolved depth key,
+    # e.g. "depth_rgb_photoneo_tpath" -> "depth_photoneo_tpath".
+    parts = tpath_keys_dic["depth_tpath"].split("_")
+    if len(parts) > 2:  # Only when a modality/sensor is embedded in the key.
+        parts.pop(1)
+        tpath_keys_dic["depth_tpath"] = "_".join(parts)
return tpath_keys_dic
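
Worked through by hand, the depth-key rewrite above behaves as follows (values are illustrative):

    # Illustrative walk-through of the depth_tpath rewrite.
    eval_modality = "rgb_photoneo"
    depth_key = "depth_{}_tpath".format(eval_modality)  # 'depth_rgb_photoneo_tpath'
    parts = depth_key.split("_")  # ['depth', 'rgb', 'photoneo', 'tpath']
    parts.pop(1)                  # drop the color modality
    print("_".join(parts))        # 'depth_photoneo_tpath'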


1 change: 1 addition & 0 deletions bop_toolkit_lib/inout.py
@@ -621,6 +621,7 @@ def load_ply(path):
"float": ("f", 4),
"double": ("d", 8),
"int": ("i", 4),
"uint": ("I", 4),
"uchar": ("B", 1),
}
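
The new "uint" entry lets load_ply handle exporters that declare 32-bit unsigned properties, which commonly appear in face index lists; it maps to the struct code "I" (4 bytes). A hypothetical header excerpt that the parser can now read:

    ply
    format binary_little_endian 1.0
    element vertex 1024
    property float x
    property float y
    property float z
    element face 2048
    property list uchar uint vertex_indices
    end_header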

2 changes: 2 additions & 0 deletions bop_toolkit_lib/visualization.py
@@ -268,6 +268,8 @@ def vis_object_poses(
{"name": "min diff", "fmt": ":.3f", "val": np.min(depth_diff_valid)},
{"name": "max diff", "fmt": ":.3f", "val": np.max(depth_diff_valid)},
{"name": "mean diff", "fmt": ":.3f", "val": np.mean(depth_diff_valid)},
{"name": "median diff", "fmt": ":.3f", "val": np.median(np.abs(depth_diff_valid))},
{"name": "25 percentile", "fmt": ":.3f", "val": np.percentile(np.abs(depth_diff_valid), 25)},
]
depth_diff_vis = write_text_on_image(depth_diff_vis, depth_info)
inout.save_im(vis_depth_diff_path, depth_diff_vis)
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@ kiwisolver==1.3.1
matplotlib==2.2.4
imageio==2.5.0
pypng==0.0.19
-Cython==0.29.24
+Cython>=0.29.24
PyOpenGL==3.1.0
triangle>=20190115.2
glumpy==1.1.0
56 changes: 43 additions & 13 deletions scripts/vis_gt_poses.py
@@ -35,7 +35,7 @@
################################################################################
p = {
# See dataset_params.py for options.
"dataset": "lm",
"dataset": "ipd",
# Dataset split. Options: 'train', 'val', 'test'.
"dataset_split": "test",
# Dataset split type. None = default. See dataset_params.py for options.
@@ -50,6 +50,10 @@
"scene_ids": [],
"im_ids": [],
"gt_ids": [],
+    # Which sensor to visualize. If empty, the evaluation modality set in
+    # dataset_params.py is used (this is also the right choice for PBR images
+    # and the classical BOP core datasets). For the new multi-sensor datasets,
+    # set this to a sensor name, e.g. "photoneo".
+    "sensor": "",

# ---------------------------------------------------------------------------------
# Next parameters apply only to classical BOP19 datasets (not the H3 BOP24 format)
@@ -91,7 +95,7 @@
raise ImportError("Missing hand_tracking_toolkit dependency, mandatory for HOT3D dataset.")

# if HOT3D dataset is used, next parameters are set
if p["dataset"] == "hot3d":
if p["dataset"] in ["hot3d"]:
p["vis_rgb"] = True
p["vis_rgb_resolve_visib"] = False
p["vis_depth_diff"] = False
@@ -104,6 +108,11 @@
model_type = "eval" # None = default.
dp_model = dataset_params.get_model_params(p["datasets_path"], p["dataset"], model_type)

+# Find the color modality of the specified sensor.
+if p["sensor"]:
+    sensor_mods = [mod.split("_")[0] for mod in dp_split["im_modalities"] if p["sensor"] in mod]
+    p["modality"] = [mod for mod in sensor_mods if any(col in mod for col in ["rgb", "gray"])][0]

# Load colors.
colors_path = os.path.join(os.path.dirname(visualization.__file__), "colors.json")
colors = inout.load_json(colors_path)
@@ -142,11 +151,15 @@
aria_im_size = dp_split["aria_im_size"][dp_split["aria_eval_modality"]]
quest3_ren = renderer_htt.RendererHtt(quest3_im_size, p["renderer_type"], shading="flat")
aria_ren = renderer_htt.RendererHtt(aria_im_size, p["renderer_type"], shading="flat")
-else:  # classical BOP format
+elif p["sensor"]:  # classical BOP format
+    width, height = dp_split["{}_im_size".format(p["sensor"])]
+else:
     width, height = dp_split["im_size"]
-    ren = renderer.create_renderer(
-        width, height, p["renderer_type"], mode=renderer_mode, shading="flat"
-    )
+ren = renderer.create_renderer(
+    width, height, p["renderer_type"], mode=renderer_mode, shading="flat"
+)

# Load object models.
models = {}
@@ -164,7 +177,11 @@

scene_ids = dataset_params.get_present_scene_ids(dp_split)
for scene_id in scene_ids:
-    tpath_keys = dataset_params.scene_tpaths_keys(dp_split["eval_modality"], scene_id)
+    if p["sensor"]:
+        tpath_keys = dataset_params.scene_tpaths_keys("{}_{}".format(p["modality"], p["sensor"]))
+    else:
+        tpath_keys = dataset_params.scene_tpaths_keys(dp_split["eval_modality"], scene_id)

if p["dataset"] == "hot3d": # for other dataset the renderer does not change
# find which renderer to use (quest3 or aria)
if scene_id in dp_split["test_quest3_scene_ids"] or scene_id in dp_split["train_quest3_scene_ids"]:
@@ -224,10 +241,16 @@
}
)

if p["dataset"] == "hot3d":
if p["dataset"] in ["hot3d", "ipd", "xyz"]:
# load the image of the eval modality

img_path = dp_split[tpath_keys["rgb_tpath"]].format(scene_id=scene_id, im_id=im_id)
if not os.path.exists(img_path):
print("rbg path {} does not exist, looking for gray images".format(img_path))
img_path = dp_split[tpath_keys["gray_tpath"]].format(scene_id=scene_id, im_id=im_id)
rgb = inout.load_im(
dp_split[dp_split["eval_modality"](scene_id) + "_tpath"].format(scene_id=scene_id, im_id=im_id)
# dp_split[dp_split["eval_modality"](scene_id) + "_tpath"].format(scene_id=scene_id, im_id=im_id)
img_path
)
# if image is grayscale (quest3), convert it to 3 channels
if rgb.ndim == 2:
@@ -249,32 +272,39 @@
raise ValueError("RGB nor gray images are available.")

depth = None
if p["dataset"] != "hot3d":
if p["dataset"] not in ["hot3d"]:
if p["vis_depth_diff"] or (p["vis_rgb"] and p["vis_rgb_resolve_visib"]):
depth = inout.load_depth(
dp_split["depth_tpath"].format(scene_id=scene_id, im_id=im_id)
dp_split[tpath_keys["depth_tpath"]].format(scene_id=scene_id, im_id=im_id)
)
depth *= scene_camera[im_id]["depth_scale"] # Convert to [mm].


# Path to the output RGB visualization.
vis_rgb_path = None
if p["vis_rgb"]:
+    split = p["dataset_split"] if not p["sensor"] else p["dataset_split"] + "_{}".format(p["sensor"])
vis_rgb_path = p["vis_rgb_tpath"].format(
vis_path=p["vis_path"],
dataset=p["dataset"],
-        split=p["dataset_split"],
+        split=split,
scene_id=scene_id,
im_id=im_id,
)

# Path to the output depth difference visualization.
vis_depth_diff_path = None
if p["dataset"] != "hot3d":
+    split = p["dataset_split"] if not p["sensor"] else p["dataset_split"] + "_{}".format(p["sensor"])
if p["vis_depth_diff"]:
vis_depth_diff_path = p["vis_depth_diff_tpath"].format(
vis_path=p["vis_path"],
dataset=p["dataset"],
-        split=p["dataset_split"],
+        split=split,
scene_id=scene_id,
im_id=im_id,
)