diff --git a/common/perception_utils/CMakeLists.txt b/common/perception_utils/CMakeLists.txt
index d4bbe928c0be..18ef18303fb1 100644
--- a/common/perception_utils/CMakeLists.txt
+++ b/common/perception_utils/CMakeLists.txt
@@ -4,4 +4,13 @@ project(perception_utils)
 find_package(autoware_cmake REQUIRED)
 autoware_package()

+ament_auto_add_library(${PROJECT_NAME} SHARED
+  src/run_length_encoder.cpp
+)
+
+find_package(OpenCV REQUIRED)
+target_link_libraries(${PROJECT_NAME}
+  ${OpenCV_LIBS}
+)
+
 ament_auto_package()
diff --git a/common/perception_utils/include/perception_utils/run_length_encoder.hpp b/common/perception_utils/include/perception_utils/run_length_encoder.hpp
new file mode 100644
index 000000000000..18b5f21854e3
--- /dev/null
+++ b/common/perception_utils/include/perception_utils/run_length_encoder.hpp
@@ -0,0 +1,29 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PERCEPTION_UTILS__RUN_LENGTH_ENCODER_HPP_
+#define PERCEPTION_UTILS__RUN_LENGTH_ENCODER_HPP_
+
+#include <opencv2/opencv.hpp>
+
+#include <utility>
+#include <vector>
+
+namespace perception_utils
+{
+std::vector<std::pair<uint8_t, int>> runLengthEncoder(const cv::Mat & mask);
+cv::Mat runLengthDecoder(const std::vector<uint8_t> & rle_data, const int rows, const int cols);
+}  // namespace perception_utils
+
+#endif  // PERCEPTION_UTILS__RUN_LENGTH_ENCODER_HPP_
diff --git a/common/perception_utils/package.xml b/common/perception_utils/package.xml
index 9d5fbf40a4e8..2e65dce1aab0 100644
--- a/common/perception_utils/package.xml
+++ b/common/perception_utils/package.xml
@@ -12,6 +12,7 @@
   <buildtool_depend>ament_cmake_auto</buildtool_depend>
   <buildtool_depend>autoware_cmake</buildtool_depend>

+  <depend>libopencv-dev</depend>
   <depend>rclcpp</depend>
diff --git a/common/perception_utils/src/run_length_encoder.cpp b/common/perception_utils/src/run_length_encoder.cpp
new file mode 100644
index 000000000000..fb7f5ba33b84
--- /dev/null
+++ b/common/perception_utils/src/run_length_encoder.cpp
@@ -0,0 +1,65 @@
+// Copyright 2024 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
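+// Note: both the producer and the consumers added in this patch serialize each
+// run produced below as sizeof(uint8_t) + sizeof(int) bytes, i.e. one class-id
+// byte followed by one run-length int in host byte order (see
+// tensorrt_yolox_node.cpp and segmentation_pointcloud_fusion/node.cpp).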
+ +#include "perception_utils/run_length_encoder.hpp" + +namespace perception_utils +{ + +std::vector> runLengthEncoder(const cv::Mat & image) +{ + std::vector> compressed_data; + const int rows = image.rows; + const int cols = image.cols; + compressed_data.emplace_back(image.at(0, 0), 0); + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + uint8_t current_value = image.at(i, j); + if (compressed_data.back().first == current_value) { + ++compressed_data.back().second; + } else { + compressed_data.emplace_back(current_value, 1); + } + } + } + return compressed_data; +} + +cv::Mat runLengthDecoder(const std::vector & rle_data, const int rows, const int cols) +{ + cv::Mat mask(rows, cols, CV_8UC1, cv::Scalar(0)); + int idx = 0; + int step = sizeof(uint8_t) + sizeof(int); + for (size_t i = 0; i < rle_data.size(); i += step) { + uint8_t value; + int length; + std::memcpy(&value, &rle_data[i], sizeof(uint8_t)); + std::memcpy( + &length, &rle_data[i + 1], + sizeof( + int)); // under the condition that we know rle_data[i] only consume 1 element of the vector + for (int j = 0; j < length; ++j) { + int row_idx = static_cast(idx / cols); + int col_idx = static_cast(idx % cols); + mask.at(row_idx, col_idx) = value; + idx++; + if (idx > rows * cols) { + break; + } + } + } + return mask; +} + +} // namespace perception_utils diff --git a/common/tensorrt_common/include/tensorrt_common/tensorrt_common.hpp b/common/tensorrt_common/include/tensorrt_common/tensorrt_common.hpp index aabaea7ca633..6691c1fb9e97 100644 --- a/common/tensorrt_common/include/tensorrt_common/tensorrt_common.hpp +++ b/common/tensorrt_common/include/tensorrt_common/tensorrt_common.hpp @@ -15,7 +15,9 @@ #ifndef TENSORRT_COMMON__TENSORRT_COMMON_HPP_ #define TENSORRT_COMMON__TENSORRT_COMMON_HPP_ +#ifndef YOLOX_STANDALONE #include +#endif #include #include @@ -86,6 +88,7 @@ struct BuildConfig profile_per_layer(profile_per_layer), clip_value(clip_value) { +#ifndef YOLOX_STANDALONE if ( std::find(valid_calib_type.begin(), valid_calib_type.end(), calib_type_str) == valid_calib_type.end()) { @@ -95,6 +98,7 @@ struct BuildConfig << "Default calibration type will be used: MinMax" << std::endl; std::cerr << message.str(); } +#endif } }; diff --git a/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml b/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml index 754d07c6d99b..df7733b3102b 100644 --- a/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml +++ b/launch/tier4_perception_launch/launch/object_recognition/detection/detection.launch.xml @@ -3,6 +3,7 @@ + @@ -60,6 +61,7 @@ + diff --git a/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_lidar_detector.launch.xml b/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_lidar_detector.launch.xml index fe0ba0e614e6..bbc8545fc39c 100644 --- a/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_lidar_detector.launch.xml +++ b/launch/tier4_perception_launch/launch/object_recognition/detection/detector/camera_lidar_detector.launch.xml @@ -9,6 +9,7 @@ + @@ -104,14 +105,24 @@ + + + + + + + + + + - + diff --git a/launch/tier4_perception_launch/launch/perception.launch.xml b/launch/tier4_perception_launch/launch/perception.launch.xml index dd80ab5b2cde..40ab96043e98 100644 --- a/launch/tier4_perception_launch/launch/perception.launch.xml +++ 
b/launch/tier4_perception_launch/launch/perception.launch.xml @@ -79,6 +79,7 @@ + + diff --git a/perception/image_projection_based_fusion/config/segmentation_pointcloud_fusion.param.yaml b/perception/image_projection_based_fusion/config/segmentation_pointcloud_fusion.param.yaml index 2120a909cd67..418a3190d25d 100644 --- a/perception/image_projection_based_fusion/config/segmentation_pointcloud_fusion.param.yaml +++ b/perception/image_projection_based_fusion/config/segmentation_pointcloud_fusion.param.yaml @@ -1,39 +1,27 @@ /**: ros__parameters: # if the semantic label is applied for pointcloud filtering + filter_semantic_label_target: - [ - true, # road - true, # sidewalk - true, # building - true, # wall - true, # fence - true, # pole - true, # traffic_light - true, # traffic_sign - true, # vegetation - true, # terrain - true, # sky - false, # person - false, # ride - false, # car - false, # truck - false, # bus - false, # train - false, # motorcycle - false, # bicycle - false, # others - ] - # the maximum distance of pointcloud to be applied filter, - # this is selected based on semantic segmentation model accuracy, - # calibration accuracy and unknown reaction distance + UNKNOWN: false + BUILDING: true + WALL: true + OBSTACLE: false + TRAFFIC_LIGHT: false + TRAFFIC_SIGN: false + PERSON: false + VEHICLE: false + BIKE: false + ROAD: true + SIDEWALK: false + ROAD_PAINT: false + CURBSTONE: false + CROSSWALK: false + VEGETATION: true + SKY: false + + # the maximum distance of pointcloud to be applied filter filter_distance_threshold: 60.0 - # debug - debug_mode: false - filter_scope_min_x: -100.0 - filter_scope_max_x: 100.0 - filter_scope_min_y: -100.0 - filter_scope_max_y: 100.0 - filter_scope_min_z: -100.0 - filter_scope_max_z: 100.0 + # Avoid using debug mask in case of multiple camera semantic segmentation fusion + is_publish_debug_mask: false diff --git a/perception/image_projection_based_fusion/docs/segmentation-pointcloud-fusion.md b/perception/image_projection_based_fusion/docs/segmentation-pointcloud-fusion.md index d59e804f1228..3c469ac15c6e 100644 --- a/perception/image_projection_based_fusion/docs/segmentation-pointcloud-fusion.md +++ b/perception/image_projection_based_fusion/docs/segmentation-pointcloud-fusion.md @@ -32,9 +32,7 @@ The node `segmentation_pointcloud_fusion` is a package for filtering pointcloud ### Core Parameters -| Name | Type | Description | -| ------------- | ---- | ------------------------ | -| `rois_number` | int | the number of input rois | +{{ json_to_markdown("perception/image_projection_based_fusion/schema/segmentation_pointcloud_fusion.schema.json") }} ## Assumptions / Known limits diff --git a/perception/image_projection_based_fusion/include/image_projection_based_fusion/segmentation_pointcloud_fusion/node.hpp b/perception/image_projection_based_fusion/include/image_projection_based_fusion/segmentation_pointcloud_fusion/node.hpp index c458c17bed79..cf6a7bb6ca87 100644 --- a/perception/image_projection_based_fusion/include/image_projection_based_fusion/segmentation_pointcloud_fusion/node.hpp +++ b/perception/image_projection_based_fusion/include/image_projection_based_fusion/segmentation_pointcloud_fusion/node.hpp @@ -17,7 +17,11 @@ #include "image_projection_based_fusion/fusion_node.hpp" +#include + #include +#include +#include #include #if __has_include() @@ -34,7 +38,17 @@ class SegmentPointCloudFusionNode : public FusionNode::SharedPtr pub_pointcloud_ptr_; std::vector filter_semantic_label_target_; float filter_distance_threshold_; - /* data */ + 
// declare list of semantic label target, depend on trained data of yolox segmentation model + std::vector> filter_semantic_label_target_list_ = { + {"UNKNOWN", false}, {"BUILDING", false}, {"WALL", false}, {"OBSTACLE", false}, + {"TRAFFIC_LIGHT", false}, {"TRAFFIC_SIGN", false}, {"PERSON", false}, {"VEHICLE", false}, + {"BIKE", false}, {"ROAD", false}, {"SIDEWALK", false}, {"ROAD_PAINT", false}, + {"CURBSTONE", false}, {"CROSSWALK", false}, {"VEGETATION", false}, {"SKY", false}}; + + image_transport::Publisher pub_debug_mask_ptr_; + bool is_publish_debug_mask_; + std::unordered_set filter_global_offset_set_; + public: explicit SegmentPointCloudFusionNode(const rclcpp::NodeOptions & options); diff --git a/perception/image_projection_based_fusion/package.xml b/perception/image_projection_based_fusion/package.xml index 8d5a2ef1fe51..257f0c193648 100644 --- a/perception/image_projection_based_fusion/package.xml +++ b/perception/image_projection_based_fusion/package.xml @@ -27,6 +27,7 @@ object_recognition_utils pcl_conversions pcl_ros + perception_utils rclcpp rclcpp_components sensor_msgs diff --git a/perception/image_projection_based_fusion/schema/segmentation_pointcloud_fusion.schema.json b/perception/image_projection_based_fusion/schema/segmentation_pointcloud_fusion.schema.json new file mode 100644 index 000000000000..a21ad583af46 --- /dev/null +++ b/perception/image_projection_based_fusion/schema/segmentation_pointcloud_fusion.schema.json @@ -0,0 +1,143 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for Segmentation Pointcloud Fusion Node", + "type": "object", + "definitions": { + "segmentation_pointcloud_fusion": { + "type": "object", + "properties": { + "filter_semantic_label_target": { + "type": "object", + "properties": { + "UNKNOWN": { + "type": "boolean", + "description": "If true, UNKNOWN class of semantic will be filtered.", + "default": false + }, + "BUILDING": { + "type": "boolean", + "description": "If true, BUILDING class of semantic will be filtered.", + "default": true + }, + "WALL": { + "type": "boolean", + "description": "If true, WALL class of semantic will be filtered.", + "default": true + }, + "OBSTACLE": { + "type": "boolean", + "description": "If true, OBSTACLE class of semantic will be filtered.", + "default": false + }, + "TRAFFIC_LIGHT": { + "type": "boolean", + "description": "If true, TRAFFIC_LIGHT class of semantic will be filtered.", + "default": false + }, + "TRAFFIC_SIGN": { + "type": "boolean", + "description": "If true, TRAFFIC_SIGN class of semantic will be filtered.", + "default": false + }, + "PERSON": { + "type": "boolean", + "description": "If true, PERSON class of semantic will be filtered.", + "default": false + }, + "VEHICLE": { + "type": "boolean", + "description": "If true, VEHICLE class of semantic will be filtered.", + "default": false + }, + "BIKE": { + "type": "boolean", + "description": "If true, BIKE class of semantic will be filtered.", + "default": false + }, + "ROAD": { + "type": "boolean", + "description": "If true, ROAD class of semantic will be filtered.", + "default": true + }, + "SIDEWALK": { + "type": "boolean", + "description": "If true, SIDEWALK class of semantic will be filtered.", + "default": false + }, + "ROAD_PAINT": { + "type": "boolean", + "description": "If true, ROAD_PAINT class of semantic will be filtered.", + "default": false + }, + "CURBSTONE": { + "type": "boolean", + "description": "If true, CURBSTONE class of semantic will be filtered.", + "default": false + }, + 
"CROSSWALK": { + "type": "boolean", + "description": "If true, CROSSWALK class of semantic will be filtered.", + "default": false + }, + "VEGETATION": { + "type": "boolean", + "description": "If true, VEGETATION class of semantic will be filtered.", + "default": true + }, + "SKY": { + "type": "boolean", + "description": "If true, SKY class of semantic will be filtered.", + "default": false + } + }, + "required": [ + "UNKNOWN", + "BUILDING", + "WALL", + "OBSTACLE", + "TRAFFIC_LIGHT", + "TRAFFIC_SIGN", + "PERSON", + "VEHICLE", + "BIKE", + "ROAD", + "SIDEWALK", + "ROAD_PAINT", + "CURBSTONE", + "CROSSWALK", + "VEGETATION", + "SKY" + ] + }, + "filter_distance_threshold": { + "type": "number", + "description": "A maximum distance of pointcloud to apply filter [m].", + "default": 60.0, + "minimum": 0.0 + }, + "is_publish_debug_mask": { + "type": "boolean", + "description": "If true, debug mask image will be published.", + "default": false + } + }, + "required": [ + "filter_semantic_label_target", + "filter_distance_threshold", + "is_publish_debug_mask" + ] + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/segmentation_pointcloud_fusion" + } + }, + "required": ["ros__parameters"] + } + }, + "required": ["/**"] +} diff --git a/perception/image_projection_based_fusion/src/segmentation_pointcloud_fusion/node.cpp b/perception/image_projection_based_fusion/src/segmentation_pointcloud_fusion/node.cpp index 0096b91f7a3b..44481a97fe56 100644 --- a/perception/image_projection_based_fusion/src/segmentation_pointcloud_fusion/node.cpp +++ b/perception/image_projection_based_fusion/src/segmentation_pointcloud_fusion/node.cpp @@ -17,6 +17,8 @@ #include "image_projection_based_fusion/utils/geometry.hpp" #include "image_projection_based_fusion/utils/utils.hpp" +#include + #ifdef ROS_DISTRO_GALACTIC #include #include @@ -31,8 +33,15 @@ SegmentPointCloudFusionNode::SegmentPointCloudFusionNode(const rclcpp::NodeOptio : FusionNode("segmentation_pointcloud_fusion", options) { filter_distance_threshold_ = declare_parameter("filter_distance_threshold"); - filter_semantic_label_target_ = - declare_parameter>("filter_semantic_label_target"); + for (auto & item : filter_semantic_label_target_list_) { + item.second = declare_parameter("filter_semantic_label_target." 
+ item.first); + } + for (const auto & item : filter_semantic_label_target_list_) { + RCLCPP_INFO( + this->get_logger(), "filter_semantic_label_target: %s %d", item.first.c_str(), item.second); + } + is_publish_debug_mask_ = declare_parameter("is_publish_debug_mask"); + pub_debug_mask_ptr_ = image_transport::create_publisher(this, "~/debug/mask"); } void SegmentPointCloudFusionNode::preprocess(__attribute__((unused)) PointCloud2 & pointcloud_msg) @@ -40,10 +49,31 @@ void SegmentPointCloudFusionNode::preprocess(__attribute__((unused)) PointCloud2 return; } -void SegmentPointCloudFusionNode::postprocess(__attribute__((unused)) PointCloud2 & pointcloud_msg) +void SegmentPointCloudFusionNode::postprocess(PointCloud2 & pointcloud_msg) { + auto original_cloud = std::make_shared(pointcloud_msg); + + int point_step = original_cloud->point_step; + size_t output_pointcloud_size = 0; + pointcloud_msg.data.clear(); + pointcloud_msg.data.resize(original_cloud->data.size()); + + for (size_t global_offset = 0; global_offset < original_cloud->data.size(); + global_offset += point_step) { + if (filter_global_offset_set_.find(global_offset) == filter_global_offset_set_.end()) { + copyPointCloud( + *original_cloud, point_step, global_offset, pointcloud_msg, output_pointcloud_size); + } + } + + pointcloud_msg.data.resize(output_pointcloud_size); + pointcloud_msg.row_step = output_pointcloud_size / pointcloud_msg.height; + pointcloud_msg.width = output_pointcloud_size / pointcloud_msg.point_step / pointcloud_msg.height; + + filter_global_offset_set_.clear(); return; } + void SegmentPointCloudFusionNode::fuseOnSingleImage( const PointCloud2 & input_pointcloud_msg, __attribute__((unused)) const std::size_t image_id, [[maybe_unused]] const Image & input_mask, __attribute__((unused)) const CameraInfo & camera_info, @@ -52,19 +82,23 @@ void SegmentPointCloudFusionNode::fuseOnSingleImage( if (input_pointcloud_msg.data.empty()) { return; } - cv_bridge::CvImagePtr in_image_ptr; - try { - in_image_ptr = cv_bridge::toCvCopy( - std::make_shared(input_mask), input_mask.encoding); - } catch (const std::exception & e) { - RCLCPP_ERROR(this->get_logger(), "cv_bridge exception:%s", e.what()); + if (input_mask.height == 0 || input_mask.width == 0) { return; } + std::vector mask_data(input_mask.data.begin(), input_mask.data.end()); + cv::Mat mask = perception_utils::runLengthDecoder(mask_data, input_mask.height, input_mask.width); - cv::Mat mask = in_image_ptr->image; - if (mask.cols == 0 || mask.rows == 0) { - return; + // publish debug mask + if (is_publish_debug_mask_) { + sensor_msgs::msg::Image::SharedPtr debug_mask_msg = + cv_bridge::CvImage(std_msgs::msg::Header(), "mono8", mask).toImageMsg(); + debug_mask_msg->header = input_mask.header; + pub_debug_mask_ptr_.publish(debug_mask_msg); } + const int orig_width = camera_info.width; + const int orig_height = camera_info.height; + // resize mask to the same size as the camera image + cv::resize(mask, mask, cv::Size(orig_width, orig_height), 0, 0, cv::INTER_NEAREST); Eigen::Matrix4d projection; projection << camera_info.p.at(0), camera_info.p.at(1), camera_info.p.at(2), camera_info.p.at(3), camera_info.p.at(4), camera_info.p.at(5), camera_info.p.at(6), camera_info.p.at(7), @@ -89,15 +123,7 @@ void SegmentPointCloudFusionNode::fuseOnSingleImage( int x_offset = input_pointcloud_msg.fields[pcl::getFieldIndex(input_pointcloud_msg, "x")].offset; int y_offset = input_pointcloud_msg.fields[pcl::getFieldIndex(input_pointcloud_msg, "y")].offset; int z_offset = 
input_pointcloud_msg.fields[pcl::getFieldIndex(input_pointcloud_msg, "z")].offset;
-  size_t output_pointcloud_size = 0;
-  output_cloud.data.clear();
-  output_cloud.data.resize(input_pointcloud_msg.data.size());
-  output_cloud.fields = input_pointcloud_msg.fields;
-  output_cloud.header = input_pointcloud_msg.header;
-  output_cloud.height = input_pointcloud_msg.height;
-  output_cloud.point_step = input_pointcloud_msg.point_step;
-  output_cloud.is_bigendian = input_pointcloud_msg.is_bigendian;
-  output_cloud.is_dense = input_pointcloud_msg.is_dense;
+
   for (size_t global_offset = 0; global_offset < transformed_cloud.data.size();
        global_offset += point_step) {
     float transformed_x =
       *reinterpret_cast<float *>(&transformed_cloud.data[global_offset + x_offset]);
@@ -108,8 +134,6 @@ void SegmentPointCloudFusionNode::fuseOnSingleImage(
     float transformed_y =
       *reinterpret_cast<float *>(&transformed_cloud.data[global_offset + y_offset]);
     float transformed_z =
       *reinterpret_cast<float *>(&transformed_cloud.data[global_offset + z_offset]);
     // skip filtering pointcloud behind the camera or too far from camera
     if (transformed_z <= 0.0 || transformed_z > filter_distance_threshold_) {
-      copyPointCloud(
-        input_pointcloud_msg, point_step, global_offset, output_cloud, output_pointcloud_size);
       continue;
     }
@@ -122,8 +146,6 @@ void SegmentPointCloudFusionNode::fuseOnSingleImage(
       normalized_projected_point.x() > 0 && normalized_projected_point.x() < camera_info.width &&
       normalized_projected_point.y() > 0 && normalized_projected_point.y() < camera_info.height;
     if (!is_inside_image) {
-      copyPointCloud(
-        input_pointcloud_msg, point_step, global_offset, output_cloud, output_pointcloud_size);
       continue;
     }
@@ -131,20 +153,14 @@ void SegmentPointCloudFusionNode::fuseOnSingleImage(
     uint8_t semantic_id = mask.at<uint8_t>(
       static_cast<uint16_t>(normalized_projected_point.y()),
       static_cast<uint16_t>(normalized_projected_point.x()));
-    if (static_cast<size_t>(semantic_id) >= filter_semantic_label_target_.size()) {
-      copyPointCloud(
-        input_pointcloud_msg, point_step, global_offset, output_cloud, output_pointcloud_size);
+    if (
+      static_cast<size_t>(semantic_id) >= filter_semantic_label_target_list_.size() ||
+      !filter_semantic_label_target_list_.at(semantic_id).second) {
       continue;
     }
-    if (!filter_semantic_label_target_.at(semantic_id)) {
-      copyPointCloud(
-        input_pointcloud_msg, point_step, global_offset, output_cloud, output_pointcloud_size);
-    }
-  }
-  output_cloud.data.resize(output_pointcloud_size);
-  output_cloud.row_step = output_pointcloud_size / output_cloud.height;
-  output_cloud.width = output_pointcloud_size / output_cloud.point_step / output_cloud.height;
+    filter_global_offset_set_.insert(global_offset);
+  }
 }

 bool SegmentPointCloudFusionNode::out_of_scope(__attribute__((unused))
diff --git a/perception/tensorrt_yolox/README.md b/perception/tensorrt_yolox/README.md
index ca407b1ff681..af88e73cc04a 100644
--- a/perception/tensorrt_yolox/README.md
+++ b/perception/tensorrt_yolox/README.md
@@ -2,13 +2,13 @@

 ## Purpose

-This package detects target objects e.g., cars, trucks, bicycles, and pedestrians on a image based on [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX) model.
+This package detects target objects, e.g., cars, trucks, bicycles, and pedestrians, and segments targets such as cars, trucks, buses, pedestrians, buildings, vegetation, roads, and sidewalks in an image, based on a [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX) model with a multi-header structure.
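Stepping back to the `segmentation_pointcloud_fusion` hunks above: filtering is now two-phase. `fuseOnSingleImage()` only records the byte offsets of points that project onto a filtered semantic class, and `postprocess()` compacts the cloud once after every camera mask has been fused. A minimal sketch of that compaction, where `compactCloud` is a hypothetical stand-in for `copyPointCloud()` plus the resize logic in `postprocess()`, not this patch's code:

```cpp
#include <cstddef>
#include <cstring>
#include <unordered_set>
#include <vector>

// Keep every point whose byte offset was not marked for filtering; each point
// occupies point_step consecutive bytes, mirroring sensor_msgs::msg::PointCloud2.
std::vector<unsigned char> compactCloud(
  const std::vector<unsigned char> & data, const std::size_t point_step,
  const std::unordered_set<std::size_t> & filter_global_offset_set)
{
  std::vector<unsigned char> out(data.size());
  std::size_t output_size = 0;
  for (std::size_t offset = 0; offset + point_step <= data.size(); offset += point_step) {
    if (filter_global_offset_set.count(offset) != 0) {
      continue;  // this point projected onto a filtered semantic class
    }
    std::memcpy(out.data() + output_size, data.data() + offset, point_step);
    output_size += point_step;
  }
  out.resize(output_size);  // width and row_step are then recomputed from this size
  return out;
}
```

Deferring the removal this way means a point filtered by any one camera stays filtered, instead of being copied back into the output by the other cameras' passes.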
## Inner-workings / Algorithms ### Cite - + Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, Jian Sun, "YOLOX: Exceeding YOLO Series in 2021", arXiv preprint arXiv:2107.08430, 2021 [[ref](https://arxiv.org/abs/2107.08430)] @@ -22,10 +22,12 @@ Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, Jian Sun, "YOLOX: Exceeding YOLO Se ### Output -| Name | Type | Description | -| ------------- | -------------------------------------------------- | -------------------------------------------------- | -| `out/objects` | `tier4_perception_msgs/DetectedObjectsWithFeature` | The detected objects with 2D bounding boxes | -| `out/image` | `sensor_msgs/Image` | The image with 2D bounding boxes for visualization | +| Name | Type | Description | +| ---------------- | -------------------------------------------------- | ------------------------------------------------------------------- | +| `out/objects` | `tier4_perception_msgs/DetectedObjectsWithFeature` | The detected objects with 2D bounding boxes | +| `out/image` | `sensor_msgs/Image` | The image with 2D bounding boxes for visualization | +| `out/mask` | `sensor_msgs/Image` | The semantic segmentation mask | +| `out/color_mask` | `sensor_msgs/Image` | The colorized image of semantic segmentation mask for visualization | ## Parameters @@ -40,20 +42,33 @@ Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, Jian Sun, "YOLOX: Exceeding YOLO Se ### Node Parameters -| Name | Type | Default Value | Description | -| ----------------------------- | ------ | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `model_path` | string | "" | The onnx file name for yolox model | -| `label_path` | string | "" | The label file with label names for detected objects written on it | -| `precision` | string | "fp16" | The inference mode: "fp32", "fp16", "int8" | -| `build_only` | bool | false | shutdown node after TensorRT engine file is built | -| `calibration_algorithm` | string | "MinMax" | Calibration algorithm to be used for quantization when precision==int8. Valid value is one of: Entropy",("Legacy" \| "Percentile"), "MinMax"] | -| `dla_core_id` | int | -1 | If positive ID value is specified, the node assign inference task to the DLA core | -| `quantize_first_layer` | bool | false | If true, set the operating precision for the first (input) layer to be fp16. This option is valid only when precision==int8 | -| `quantize_last_layer` | bool | false | If true, set the operating precision for the last (output) layer to be fp16. This option is valid only when precision==int8 | -| `profile_per_layer` | bool | false | If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purpose. | -| `clip_value` | double | 0.0 | If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and used to manually specify the dynamic range instead of using any calibration | -| `preprocess_on_gpu` | bool | true | If true, pre-processing is performed on GPU | -| `calibration_image_list_path` | string | "" | Path to a file which contains path to images. Those images will be used for int8 quantization. 
|
+| Name | Type | Default Value | Description |
+| ---- | ---- | ------------- | ----------- |
+| `model_path` | string | "" | The onnx file name for yolox model |
+| `model_name` | string | "" | The yolox model name: <br/> "yolox-sPlus-T4-960x960-pseudo-finetune" for detection only, could reduce resource and processing_time <br/> "yolox-sPlus-opt-pseudoV2-T4-960x960-T4-seg16cls" for multi-task including semantic segmentation |
+| `label_path` | string | "" | The label file with label names for detected objects written on it |
+| `precision` | string | "fp16" | The inference mode: "fp32", "fp16", "int8" |
+| `build_only` | bool | false | shut down the node after the TensorRT engine file is built |
+| `calibration_algorithm` | string | "MinMax" | Calibration algorithm to be used for quantization when precision==int8. Valid value is one of: [Entropy, (Legacy \| Percentile), MinMax] |
+| `dla_core_id` | int | -1 | If positive ID value is specified, the node assigns the inference task to the DLA core |
+| `quantize_first_layer` | bool | false | If true, set the operating precision for the first (input) layer to be fp16. This option is valid only when precision==int8 |
+| `quantize_last_layer` | bool | false | If true, set the operating precision for the last (output) layer to be fp16. This option is valid only when precision==int8 |
+| `profile_per_layer` | bool | false | If true, profiler function will be enabled. Since the profile function may affect execution speed, it is recommended to set this flag true only for development purposes. |
+| `clip_value` | double | 0.0 | If positive value is specified, the value of each layer output will be clipped between [0.0, clip_value]. This option is valid only when precision==int8 and is used to manually specify the dynamic range instead of using any calibration |
+| `preprocess_on_gpu` | bool | true | If true, pre-processing is performed on GPU |
+| `calibration_image_list_path` | string | "" | Path to a file which contains path to images. Those images will be used for int8 quantization. |
+| `yolox_s_plus_opt_param_path` | string | "" | Path to parameter file |
+| `is_publish_color_mask` | bool | false | If true, publish color mask for result visualization |
+| `is_roi_overlap_segment` | bool | true | If true, overlay detected object rois onto the semantic segmentation mask to avoid over-filtering the pointcloud, especially for small objects |
+| `overlap_roi_score_threshold` | float | 0.3 | Minimum existence_probability of a detected roi considered to replace the segmentation |
+| `roi_overlay_segment_label.UNKNOWN` | bool | true | If true, unknown objects roi will be overlaid onto semantic segmentation mask. |
+| `roi_overlay_segment_label.CAR` | bool | false | If true, car objects roi will be overlaid onto semantic segmentation mask. |
+| `roi_overlay_segment_label.TRUCK` | bool | false | If true, truck objects roi will be overlaid onto semantic segmentation mask. |
+| `roi_overlay_segment_label.BUS` | bool | false | If true, bus objects roi will be overlaid onto semantic segmentation mask. |
+| `roi_overlay_segment_label.MOTORCYCLE` | bool | true | If true, motorcycle objects roi will be overlaid onto semantic segmentation mask. |
+| `roi_overlay_segment_label.BICYCLE` | bool | true | If true, bicycle objects roi will be overlaid onto semantic segmentation mask. |
+| `roi_overlay_segment_label.PEDESTRIAN` | bool | true | If true, pedestrian objects roi will be overlaid onto semantic segmentation mask. |
+| `roi_overlay_segment_label.ANIMAL` | bool | true | If true, animal objects roi will be overlaid onto semantic segmentation mask. |
## Assumptions / Known limits

@@ -69,6 +84,27 @@
The label contained in detected 2D bounding boxes (i.e., `out/objects`) will be either one of the followings:

If other labels (case insensitive) are contained in the file specified via the `label_file` parameter,
those are labeled as `UNKNOWN`, while detected rectangles are drawn in the visualization result (`out/image`).

+The semantic segmentation mask is a gray image in which each pixel value is the index of one of the following classes:
+
+| index | semantic name    |
+| ----- | ---------------- |
+| 0     | road             |
+| 1     | building         |
+| 2     | wall             |
+| 3     | obstacle         |
+| 4     | traffic_light    |
+| 5     | traffic_sign     |
+| 6     | person           |
+| 7     | vehicle          |
+| 8     | bike             |
+| 9     | road             |
+| 10    | sidewalk         |
+| 11    | roadPaint        |
+| 12    | curbstone        |
+| 13    | crosswalk_others |
+| 14    | vegetation       |
+| 15    | sky              |
+
## Onnx model

A sample model (named `yolox-tiny.onnx`) is downloaded by ansible script on env preparation stage, if not, please, follow [Manual downloading of artifacts](https://github.com/autowarefoundation/autoware/tree/main/ansible/roles/artifacts).

@@ -79,11 +115,12 @@ hence these parameters are ignored when users specify ONNX models including this
This package accepts both `EfficientNMS_TRT` attached ONNXs and
[models published from the official YOLOX repository](https://github.com/Megvii-BaseDetection/YOLOX/tree/main/demo/ONNXRuntime#download-onnx-models) (we referred to them as "plain" models).

-In addition to `yolox-tiny.onnx`, a custom model named `yolox-sPlus-opt.onnx` is either available.
-This model is based on YOLOX-s and tuned to perform more accurate detection with almost comparable execution speed with `yolox-tiny`.
+In addition to `yolox-tiny.onnx`, a custom model named `yolox-sPlus-opt-pseudoV2-T4-960x960-T4-seg16cls` is also available.
+This model has a multi-header structure based on YOLOX-s, tuned to perform more accurate detection with almost comparable execution speed to `yolox-tiny`.
To get better results with this model, users are recommended to use some specific running arguments
such as `precision:=int8`, `calibration_algorithm:=Entropy`, `clip_value:=6.0`.
Users can refer `launch/yolox_sPlus_opt.launch.xml` to see how this model can be used.
+Besides detection results, this model also outputs an image semantic segmentation result for pointcloud filtering purposes.

All models are automatically converted to TensorRT format.
These converted files will be saved in the same directory as specified ONNX files

@@ -146,7 +183,7 @@
Please refer [the official document](https://github.com/Megvii-BaseDetection/YOL

## Label file

-A sample label file (named `label.txt`)is also downloaded automatically during env preparation process
+A sample label file (named `label.txt`) and a semantic segmentation color map file (named `semseg_color_map.csv`) are also downloaded automatically during env preparation process
(**NOTE:** This file is incompatible with models that output labels for the COCO dataset (e.g., models from the official YOLOX repository)).

This file represents the correspondence between class index (integer outputted from YOLOX network) and
with labels according to the order in this file.
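Note that `out/mask` is published with the run-length layout from `perception_utils` rather than as a raw `mono8` image, so consumers have to decode it before use. A minimal round-trip sketch of that convention (a standalone example, not code from this patch):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

#include <opencv2/opencv.hpp>

#include "perception_utils/run_length_encoder.hpp"

// tensorrt_yolox_node packs (class id, run length) pairs into Image::data,
// and segmentation_pointcloud_fusion decodes them back into a cv::Mat.
int main()
{
  cv::Mat mask(480, 640, CV_8UC1, cv::Scalar(7));  // toy mask, class id 7 everywhere
  const auto runs = perception_utils::runLengthEncoder(mask);

  constexpr std::size_t step = sizeof(uint8_t) + sizeof(int);  // one id byte + one int length
  std::vector<uint8_t> payload(runs.size() * step);
  for (std::size_t i = 0; i < runs.size(); ++i) {
    std::memcpy(&payload[i * step], &runs[i].first, sizeof(uint8_t));
    std::memcpy(&payload[i * step + 1], &runs[i].second, sizeof(int));
  }

  const cv::Mat decoded = perception_utils::runLengthDecoder(payload, mask.rows, mask.cols);
  return cv::countNonZero(decoded != mask) == 0 ? 0 : 1;  // 0 on a lossless round trip
}
```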
- - +- diff --git a/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml b/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml index bc6717344209..57c1b40c44a4 100644 --- a/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml +++ b/perception/tensorrt_yolox/config/yolox_s_plus_opt.param.yaml @@ -1,7 +1,29 @@ +# cspell: ignore semseg /**: ros__parameters: + # refine segmentation mask by overlay roi class + # disable when sematic segmentation accuracy is good enough + is_roi_overlap_segment: true + + # minimum existence_probability of detected roi considered to replace segmentation + overlap_roi_score_threshold: 0.3 + + # publish color mask for result visualization + is_publish_color_mask: false + + roi_overlay_segment_label: + UNKNOWN : true + CAR : false + TRUCK : false + BUS : false + MOTORCYCLE : true + BICYCLE : true + PEDESTRIAN : true + ANIMAL: true + model_path: "$(var data_path)/tensorrt_yolox/$(var model_name).onnx" label_path: "$(var data_path)/tensorrt_yolox/label.txt" + color_map_path: "$(var data_path)/tensorrt_yolox/semseg_color_map.csv" score_threshold: 0.35 nms_threshold: 0.7 precision: "int8" # Operation precision to be used on inference. Valid value is one of: [fp32, fp16, int8]. diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/preprocess.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/preprocess.hpp index 3549ae35e70e..faac6de4e328 100644 --- a/perception/tensorrt_yolox/include/tensorrt_yolox/preprocess.hpp +++ b/perception/tensorrt_yolox/include/tensorrt_yolox/preprocess.hpp @@ -179,6 +179,21 @@ extern void crop_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( extern void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h, int s_c, int batch, float norm, cudaStream_t stream); -} // namespace tensorrt_yolox +/** + * @brief Argmax on GPU + * @param[out] dst processed image + * @param[in] src probability map + * @param[in] d_w width for output + * @param[in] d_h height for output + * @param[in] s_w width for input + * @param[in] s_h height for input + * @param[in] s_c channel for input + * @param[in] batch batch size + * @param[in] stream cuda stream + */ +extern void argmax_gpu( + unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, + cudaStream_t stream); +} // namespace tensorrt_yolox #endif // TENSORRT_YOLOX__PREPROCESS_HPP_ diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp index c42222a70c96..d287c8a44d4c 100644 --- a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp +++ b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox.hpp @@ -52,6 +52,13 @@ struct GridAndStride int stride; }; +typedef struct Colormap_ +{ + int id; + std::string name; + std::vector color; +} Colormap; + /** * @class TrtYoloX * @brief TensorRT YOLOX for faster inference @@ -85,7 +92,7 @@ class TrtYoloX const bool use_gpu_preprocess = false, std::string calibration_image_list_file = std::string(), const double norm_factor = 1.0, [[maybe_unused]] const std::string & cache_dir = "", const tensorrt_common::BatchConfig & batch_config = {1, 1, 1}, - const size_t max_workspace_size = (1 << 30)); + const size_t max_workspace_size = (1 << 30), const std::string & color_map_path = ""); /** * @brief Deconstruct TrtYoloX */ @@ -96,7 +103,9 @@ class TrtYoloX * @param[out] objects results for 
object detection * @param[in] images batched images */ - bool doInference(const std::vector & images, ObjectArrays & objects); + bool doInference( + const std::vector & images, ObjectArrays & objects, std::vector & masks, + std::vector & color_masks); /** * @brief run inference including pre-process and post-process @@ -130,6 +139,22 @@ class TrtYoloX */ void printProfiling(void); + /** + * @brief get num for multitask heads + */ + int getMultitaskNum(void); + + /** + * @brief get colorized masks from index using specific colormap + * @param[out] cmask colorized mask + * @param[in] index multitask index + * @param[in] colormap colormap for masks + */ + void getColorizedMask( + const std::vector & colormap, const cv::Mat & mask, + cv::Mat & colorized_mask); + inline std::vector getColorMap() { return sematic_color_map_; } + private: /** * @brief run preprocess including resizing, letterbox, NHWC2NCHW and toFloat on CPU @@ -177,7 +202,9 @@ class TrtYoloX const cv::Mat & images, int batch_size, ObjectArrays & objects); bool feedforward(const std::vector & images, ObjectArrays & objects); - bool feedforwardAndDecode(const std::vector & images, ObjectArrays & objects); + bool feedforwardAndDecode( + const std::vector & images, ObjectArrays & objects, std::vector & masks, + std::vector & color_masks); void decodeOutputs(float * prob, ObjectArray & objects, float scale, cv::Size & img_size) const; void generateGridsAndStride( const int target_w, const int target_h, const std::vector & strides, @@ -206,6 +233,26 @@ class TrtYoloX void nmsSortedBboxes( const ObjectArray & face_objects, std::vector & picked, float nms_threshold) const; + /** + * @brief get a mask image for a segmentation head + * @param[out] argmax argmax results + * @param[in] prob probability map + * @param[in] dims dimension for probability map + * @param[in] out_w mask width excluding letterbox + * @param[in] out_h mask height excluding letterbox + */ + cv::Mat getMaskImage(float * prob, nvinfer1::Dims dims, int out_w, int out_h); + + /** + * @brief get a mask image on GPUs for a segmentation head + * @param[out] mask image + * @param[in] prob probability map on device + * @param[in] out_w mask width excluding letterbox + * @param[in] out_h mask height excluding letterbox + * @param[in] b current batch + */ + cv::Mat getMaskImageGpu(float * d_prob, nvinfer1::Dims dims, int out_w, int out_h, int b); + std::unique_ptr trt_common_; std::vector input_h_; @@ -249,6 +296,20 @@ class TrtYoloX CudaUniquePtrHost roi_h_; // device pointer for ROI CudaUniquePtr roi_d_; + + // flag whether model has multitasks + int multitask_; + // buff size for segmentation heads + CudaUniquePtr segmentation_out_prob_d_; + CudaUniquePtrHost segmentation_out_prob_h_; + size_t segmentation_out_elem_num_; + size_t segmentation_out_elem_num_per_batch_; + std::vector segmentation_masks_; + // host buffer for argmax postprocessing on GPU + CudaUniquePtrHost argmax_buf_h_; + // device buffer for argmax postprocessing on GPU + CudaUniquePtr argmax_buf_d_; + std::vector sematic_color_map_; }; } // namespace tensorrt_yolox diff --git a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp index 6044148a932a..64332dffa834 100644 --- a/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp +++ b/perception/tensorrt_yolox/include/tensorrt_yolox/tensorrt_yolox_node.hpp @@ -15,6 +15,8 @@ #ifndef TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_ #define 
TENSORRT_YOLOX__TENSORRT_YOLOX_NODE_HPP_ +#include "object_recognition_utils/object_recognition_utils.hpp" + #include #include #include @@ -25,6 +27,7 @@ #include #include #include +#include #if __has_include() #include @@ -37,14 +40,35 @@ #include #include #include +#include #include namespace tensorrt_yolox { +// cspell: ignore Semseg using LabelMap = std::map; - +using Label = tier4_perception_msgs::msg::Semantic; class TrtYoloXNode : public rclcpp::Node { + struct RoiOverlaySemsegLabel + { + bool UNKNOWN; + bool CAR; + bool TRUCK; + bool BUS; + bool MOTORCYCLE; + bool BICYCLE; + bool PEDESTRIAN; + bool ANIMAL; + bool isOverlay(const uint8_t label) const + { + return (label == Label::UNKNOWN && UNKNOWN) || (label == Label::CAR && CAR) || + (label == Label::TRUCK && TRUCK) || (label == Label::BUS && BUS) || + (label == Label::ANIMAL && ANIMAL) || (label == Label::MOTORBIKE && MOTORCYCLE) || + (label == Label::BICYCLE && BICYCLE) || (label == Label::PEDESTRIAN && PEDESTRIAN); + }; + }; // struct RoiOverlaySemsegLabel + public: explicit TrtYoloXNode(const rclcpp::NodeOptions & node_options); @@ -53,8 +77,14 @@ class TrtYoloXNode : public rclcpp::Node void onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg); bool readLabelFile(const std::string & label_path); void replaceLabelMap(); - + void overlapSegmentByRoi( + const tensorrt_yolox::Object & object, cv::Mat & mask, const int width, const int height); + int mapRoiLabel2SegLabel(const int32_t roi_label_index); image_transport::Publisher image_pub_; + image_transport::Publisher mask_pub_; + + image_transport::Publisher color_mask_pub_; + rclcpp::Publisher::SharedPtr objects_pub_; image_transport::Subscriber image_sub_; @@ -63,6 +93,21 @@ class TrtYoloXNode : public rclcpp::Node LabelMap label_map_; std::unique_ptr trt_yolox_; + bool is_roi_overlap_segment_; + bool is_publish_color_mask_; + float overlap_roi_score_threshold_; + // TODO(badai-nguyen): change to function + std::map remap_roi_to_semantic_ = { + {"UNKNOWN", 3}, // other + {"ANIMAL", 0}, // other + {"PEDESTRIAN", 6}, // person + {"CAR", 7}, // car + {"TRUCK", 7}, // truck + {"BUS", 7}, // bus + {"BICYCLE", 8}, // bicycle + {"MOTORBIKE", 8}, // motorcycle + }; + RoiOverlaySemsegLabel roi_overlay_segment_labels_; std::unique_ptr> stop_watch_ptr_; std::unique_ptr debug_publisher_; }; diff --git a/perception/tensorrt_yolox/launch/yolox_s_plus_opt.launch.xml b/perception/tensorrt_yolox/launch/yolox_s_plus_opt.launch.xml index fa303e573bc6..e4436a0424be 100644 --- a/perception/tensorrt_yolox/launch/yolox_s_plus_opt.launch.xml +++ b/perception/tensorrt_yolox/launch/yolox_s_plus_opt.launch.xml @@ -1,9 +1,14 @@ - + - + + @@ -19,6 +24,7 @@ + diff --git a/perception/tensorrt_yolox/package.xml b/perception/tensorrt_yolox/package.xml index 8e5a0ef973ca..8a80d9d82d09 100644 --- a/perception/tensorrt_yolox/package.xml +++ b/perception/tensorrt_yolox/package.xml @@ -24,6 +24,7 @@ image_transport libopencv-dev object_recognition_utils + perception_utils rclcpp rclcpp_components sensor_msgs diff --git a/perception/tensorrt_yolox/src/preprocess.cu b/perception/tensorrt_yolox/src/preprocess.cu index 3c3087c536f1..f384de2975aa 100644 --- a/perception/tensorrt_yolox/src/preprocess.cu +++ b/perception/tensorrt_yolox/src/preprocess.cu @@ -594,4 +594,39 @@ void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu( cuda_gridsize(N), block, 0, stream>>>(N, dst, src, d_h, d_w, s_h, s_w, d_roi, norm, batch); } +__global__ void argmax_gpu_kernel( + int N, unsigned char * dst, float * src, int 
dst_h, int dst_w, int src_c, int src_h, int src_w, + int batch) +{ + // NHWC + int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + + if (index >= N) return; + int c = 0; + int w = index % dst_w; + int h = index / (dst_w); + + int b; + for (b = 0; b < batch; b++) { + float max_prob = 0.0; + int max_index = 0; + int dst_index = w + dst_w * h + b * dst_h * dst_w; + for (c = 0; c < src_c; c++) { + int src_index = w + src_w * h + c * src_h * src_w + b * src_c * src_h * src_w; + max_index = max_prob < src[src_index] ? c : max_index; + max_prob = max_prob < src[src_index] ? src[src_index] : max_prob; + } + dst[dst_index] = max_index; + } +} + +void argmax_gpu( + unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch, + cudaStream_t stream) +{ + int N = d_w * d_h; + argmax_gpu_kernel<<>>( + N, dst, src, d_h, d_w, s_c, s_h, s_w, batch); +} + } // namespace tensorrt_yolox diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp index 11f68f580acc..af6cecd2eff7 100644 --- a/perception/tensorrt_yolox/src/tensorrt_yolox.cpp +++ b/perception/tensorrt_yolox/src/tensorrt_yolox.cpp @@ -15,12 +15,18 @@ #include "cuda_utils/cuda_check_error.hpp" #include "cuda_utils/cuda_unique_ptr.hpp" +#include #include #include #include +#include + #include +#include #include +#include +#include #include #include #include @@ -97,6 +103,53 @@ std::vector loadImageList(const std::string & filename, const std:: } return fileList; } + +std::vector get_seg_colormap(const std::string & filename) +{ + std::vector seg_cmap; + if (filename != "not-specified") { + std::vector color_list = loadListFromTextFile(filename); + for (int i = 0; i < static_cast(color_list.size()); i++) { + if (i == 0) { + // Skip header + continue; + } + std::string colormapString = color_list[i]; + tensorrt_yolox::Colormap cmap; + std::vector rgb; + size_t npos = colormapString.find_first_of(','); + assert(npos != std::string::npos); + std::string substr = colormapString.substr(0, npos); + int id = static_cast(std::stoi(trim(substr))); + colormapString.erase(0, npos + 1); + + npos = colormapString.find_first_of(','); + assert(npos != std::string::npos); + substr = colormapString.substr(0, npos); + std::string name = (trim(substr)); + cmap.id = id; + cmap.name = name; + colormapString.erase(0, npos + 1); + while (!colormapString.empty()) { + size_t npos = colormapString.find_first_of(','); + if (npos != std::string::npos) { + substr = colormapString.substr(0, npos); + unsigned char c = (unsigned char)std::stoi(trim(substr)); + cmap.color.push_back(c); + colormapString.erase(0, npos + 1); + } else { + unsigned char c = (unsigned char)std::stoi(trim(colormapString)); + cmap.color.push_back(c); + break; + } + } + + seg_cmap.push_back(cmap); + } + } + return seg_cmap; +} + } // anonymous namespace namespace tensorrt_yolox @@ -106,12 +159,14 @@ TrtYoloX::TrtYoloX( const float score_threshold, const float nms_threshold, tensorrt_common::BuildConfig build_config, const bool use_gpu_preprocess, std::string calibration_image_list_path, const double norm_factor, [[maybe_unused]] const std::string & cache_dir, const tensorrt_common::BatchConfig & batch_config, - const size_t max_workspace_size) + const size_t max_workspace_size, const std::string & color_map_path) { src_width_ = -1; src_height_ = -1; norm_factor_ = norm_factor; batch_size_ = batch_config[2]; + multitask_ = 0; + sematic_color_map_ = get_seg_colormap(color_map_path); if (precision == 
"int8") { if (build_config.clip_value <= 0.0) { if (calibration_image_list_path.empty()) { @@ -196,9 +251,18 @@ TrtYoloX::TrtYoloX( needs_output_decode_ = false; break; default: + needs_output_decode_ = true; + // The following three values are considered only if the specified model is plain one + num_class_ = num_class; + score_threshold_ = score_threshold; + nms_threshold_ = nms_threshold; + // Todo : Support multiple segmentation heads + multitask_++; + /* std::stringstream s; s << "\"" << model_path << "\" is unsupported format"; throw std::runtime_error{s.str()}; + */ } // GPU memory allocation @@ -234,10 +298,31 @@ TrtYoloX::TrtYoloX( out_scores_d_ = cuda_utils::make_unique(batch_config[2] * max_detections_); out_classes_d_ = cuda_utils::make_unique(batch_config[2] * max_detections_); } + if (multitask_) { + // Allocate buffer for segmentation + segmentation_out_elem_num_ = 0; + for (int m = 0; m < multitask_; m++) { + const auto output_dims = + trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + size_t out_elem_num = std::accumulate( + output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); + out_elem_num = out_elem_num * batch_config[2]; + segmentation_out_elem_num_ += out_elem_num; + } + segmentation_out_elem_num_per_batch_ = + static_cast(segmentation_out_elem_num_ / batch_config[2]); + segmentation_out_prob_d_ = cuda_utils::make_unique(segmentation_out_elem_num_); + segmentation_out_prob_h_ = + cuda_utils::make_unique_host(segmentation_out_elem_num_, cudaHostAllocPortable); + } if (use_gpu_preprocess) { use_gpu_preprocess_ = true; image_buf_h_ = nullptr; image_buf_d_ = nullptr; + if (multitask_) { + argmax_buf_h_ = nullptr; + argmax_buf_d_ = nullptr; + } } else { use_gpu_preprocess_ = false; } @@ -252,6 +337,9 @@ TrtYoloX::~TrtYoloX() if (image_buf_d_) { image_buf_d_.reset(); } + if (argmax_buf_d_) { + argmax_buf_d_.reset(); + } } } @@ -294,6 +382,26 @@ void TrtYoloX::initPreprocessBuffer(int width, int height) width * height * 3 * batch_size_, cudaHostAllocWriteCombined); image_buf_d_ = cuda_utils::make_unique(width * height * 3 * batch_size_); } + if (multitask_) { + size_t argmax_out_elem_num = 0; + for (int m = 0; m < multitask_; m++) { + const auto output_dims = + trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + const float scale = std::min( + output_dims.d[3] / static_cast(width), + output_dims.d[2] / static_cast(height)); + int out_w = static_cast(width * scale); + int out_h = static_cast(height * scale); + // size_t out_elem_num = std::accumulate( + // output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); + // out_elem_num = out_elem_num * batch_size_; + size_t out_elem_num = out_w * out_h * batch_size_; + argmax_out_elem_num += out_elem_num; + } + argmax_buf_h_ = + cuda_utils::make_unique_host(argmax_out_elem_num, cudaHostAllocPortable); + argmax_buf_d_ = cuda_utils::make_unique(argmax_out_elem_num); + } } } @@ -321,6 +429,12 @@ void TrtYoloX::preprocessGpu(const std::vector & images) if (image_buf_d_) { image_buf_d_.reset(); } + if (argmax_buf_h_) { + argmax_buf_h_.reset(); + } + if (argmax_buf_d_) { + argmax_buf_d_.reset(); + } } } src_width_ = width; @@ -333,6 +447,7 @@ void TrtYoloX::preprocessGpu(const std::vector & images) const float input_height = static_cast(input_dims.d[2]); const float input_width = static_cast(input_dims.d[3]); int b = 0; + size_t argmax_out_elem_num = 0; for (const auto & image : images) { if (!image_buf_h_) { const float scale = 
std::min(input_width / image.cols, input_height / image.rows); @@ -348,7 +463,31 @@ void TrtYoloX::preprocessGpu(const std::vector & images) image_buf_h_.get() + index, &image.data[0], image.cols * image.rows * 3 * sizeof(unsigned char)); b++; + + if (multitask_) { + for (int m = 0; m < multitask_; m++) { + const auto output_dims = + trt_common_->getBindingDimensions(m + 2); // 0: input, 1: output for detections + const float scale = std::min( + output_dims.d[3] / static_cast(image.cols), + output_dims.d[2] / static_cast(image.rows)); + int out_w = static_cast(image.cols * scale); + int out_h = static_cast(image.rows * scale); + argmax_out_elem_num += out_w * out_h * batch_size; + } + } + } + + if (multitask_) { + if (!argmax_buf_h_) { + argmax_buf_h_ = + cuda_utils::make_unique_host(argmax_out_elem_num, cudaHostAllocPortable); + } + if (!argmax_buf_d_) { + argmax_buf_d_ = cuda_utils::make_unique(argmax_out_elem_num); + } } + // Copy into device memory CHECK_CUDA_ERROR(cudaMemcpyAsync( image_buf_d_.get(), image_buf_h_.get(), @@ -406,7 +545,9 @@ void TrtYoloX::preprocess(const std::vector & images) // No Need for Sync } -bool TrtYoloX::doInference(const std::vector & images, ObjectArrays & objects) +bool TrtYoloX::doInference( + const std::vector & images, ObjectArrays & objects, std::vector & masks, + [[maybe_unused]] std::vector & color_masks) { if (!trt_common_->isInitialized()) { return false; @@ -419,7 +560,7 @@ bool TrtYoloX::doInference(const std::vector & images, ObjectArrays & o } if (needs_output_decode_) { - return feedforwardAndDecode(images, objects); + return feedforwardAndDecode(images, objects, masks, color_masks); } else { return feedforward(images, objects); } @@ -659,6 +800,8 @@ void TrtYoloX::multiScalePreprocess(const cv::Mat & image, const std::vector & images, ObjectArrays & objects, const std::vector & rois) { + std::vector masks; + std::vector color_masks; if (!trt_common_->isInitialized()) { return false; } @@ -669,7 +812,7 @@ bool TrtYoloX::doInferenceWithRoi( } if (needs_output_decode_) { - return feedforwardAndDecode(images, objects); + return feedforwardAndDecode(images, objects, masks, color_masks); } else { return feedforward(images, objects); } @@ -747,10 +890,14 @@ bool TrtYoloX::feedforward(const std::vector & images, ObjectArrays & o return true; } -bool TrtYoloX::feedforwardAndDecode(const std::vector & images, ObjectArrays & objects) +bool TrtYoloX::feedforwardAndDecode( + const std::vector & images, ObjectArrays & objects, std::vector & out_masks, + [[maybe_unused]] std::vector & color_masks) { std::vector buffers = {input_d_.get(), out_prob_d_.get()}; - + if (multitask_) { + buffers = {input_d_.get(), out_prob_d_.get(), segmentation_out_prob_d_.get()}; + } trt_common_->enqueueV2(buffers.data(), *stream_, nullptr); const auto batch_size = images.size(); @@ -758,6 +905,11 @@ bool TrtYoloX::feedforwardAndDecode(const std::vector & images, ObjectA CHECK_CUDA_ERROR(cudaMemcpyAsync( out_prob_h_.get(), out_prob_d_.get(), sizeof(float) * out_elem_num_, cudaMemcpyDeviceToHost, *stream_)); + if (multitask_ && !use_gpu_preprocess_) { + CHECK_CUDA_ERROR(cudaMemcpyAsync( + segmentation_out_prob_h_.get(), segmentation_out_prob_d_.get(), + sizeof(float) * segmentation_out_elem_num_, cudaMemcpyDeviceToHost, *stream_)); + } cudaStreamSynchronize(*stream_); objects.clear(); @@ -766,7 +918,43 @@ bool TrtYoloX::feedforwardAndDecode(const std::vector & images, ObjectA float * batch_prob = out_prob_h_.get() + (i * out_elem_num_per_batch_); ObjectArray object_array; 
decodeOutputs(batch_prob, object_array, scales_[i], image_size); + // add refine mask using object objects.emplace_back(object_array); + if (multitask_) { + segmentation_masks_.clear(); + + size_t counter = 0; + int batch = + static_cast(segmentation_out_elem_num_ / segmentation_out_elem_num_per_batch_); + for (int m = 0; m < multitask_; m++) { + const auto output_dims = + trt_common_->getBindingDimensions(m + 2); // 0 : input, 1 : output for detections + size_t out_elem_num = std::accumulate( + output_dims.d + 1, output_dims.d + output_dims.nbDims, 1, std::multiplies()); + out_elem_num = out_elem_num * batch; + const float scale = std::min( + output_dims.d[3] / static_cast(image_size.width), + output_dims.d[2] / static_cast(image_size.height)); + int out_w = static_cast(image_size.width * scale); + int out_h = static_cast(image_size.height * scale); + cv::Mat mask; + if (use_gpu_preprocess_) { + float * d_segmentation_results = + segmentation_out_prob_d_.get() + (i * segmentation_out_elem_num_per_batch_); + mask = getMaskImageGpu(&(d_segmentation_results[counter]), output_dims, out_w, out_h, i); + } else { + float * segmentation_results = + segmentation_out_prob_h_.get() + (i * segmentation_out_elem_num_per_batch_); + mask = getMaskImage(&(segmentation_results[counter]), output_dims, out_w, out_h); + } + segmentation_masks_.emplace_back(std::move(mask)); + counter += out_elem_num; + } + // semantic segmentation was fixed as first task + out_masks.at(i) = segmentation_masks_.at(0); + } else { + continue; + } } return true; } @@ -1036,4 +1224,73 @@ void TrtYoloX::nmsSortedBboxes( } } +cv::Mat TrtYoloX::getMaskImageGpu(float * d_prob, nvinfer1::Dims dims, int out_w, int out_h, int b) +{ + // NCHW + int classes = dims.d[1]; + int height = dims.d[2]; + int width = dims.d[3]; + cv::Mat mask = cv::Mat::zeros(out_h, out_w, CV_8UC1); + int index = b * out_w * out_h; + argmax_gpu( + (unsigned char *)argmax_buf_d_.get() + index, d_prob, out_w, out_h, width, height, classes, 1, + *stream_); + CHECK_CUDA_ERROR(cudaMemcpyAsync( + argmax_buf_h_.get(), argmax_buf_d_.get(), sizeof(unsigned char) * 1 * out_w * out_h, + cudaMemcpyDeviceToHost, *stream_)); + cudaStreamSynchronize(*stream_); + std::memcpy(mask.data, argmax_buf_h_.get() + index, sizeof(unsigned char) * 1 * out_w * out_h); + return mask; +} + +cv::Mat TrtYoloX::getMaskImage(float * prob, nvinfer1::Dims dims, int out_w, int out_h) +{ + // NCHW + int classes = dims.d[1]; + int height = dims.d[2]; + int width = dims.d[3]; + cv::Mat mask = cv::Mat::zeros(out_h, out_w, CV_8UC1); + // argmax + // #pragma omp parallel for + for (int y = 0; y < out_h; y++) { + for (int x = 0; x < out_w; x++) { + float max = 0.0; + int index = 0; + for (int c = 0; c < classes; c++) { + float value = prob[c * height * width + y * width + x]; + if (max < value) { + max = value; + index = c; + } + } + mask.at(y, x) = index; + } + } + return mask; +} + +int TrtYoloX::getMultitaskNum(void) +{ + return multitask_; +} + +void TrtYoloX::getColorizedMask( + const std::vector & colormap, const cv::Mat & mask, cv::Mat & cmask) +{ + int width = mask.cols; + int height = mask.rows; + if ((cmask.cols != mask.cols) || (cmask.rows != mask.rows)) { + throw std::runtime_error("input and output image have difference size."); + return; + } + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + unsigned char id = mask.at(y, x); + cmask.at(y, x)[0] = colormap[id].color[2]; + cmask.at(y, x)[1] = colormap[id].color[1]; + cmask.at(y, x)[2] = colormap[id].color[0]; + } + } +} 
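For reference, both `argmax_gpu()` and `getMaskImage()` above read the probability map in NCHW order (`c * height * width + y * width + x`), despite the `// NHWC` comment in `argmax_gpu_kernel`. A CPU sketch of the same reduction, useful for unit-testing the kernel against a host result (a hypothetical helper, not part of this patch):

```cpp
#include <cassert>
#include <vector>

// For every output pixel, pick the class channel with the highest probability.
// `prob` is laid out as [class][height][width] for a single batch element.
std::vector<unsigned char> argmaxNchw(
  const std::vector<float> & prob, int classes, int height, int width)
{
  assert(static_cast<int>(prob.size()) == classes * height * width);
  std::vector<unsigned char> mask(height * width, 0);
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      float max_prob = 0.0f;
      int max_index = 0;
      for (int c = 0; c < classes; ++c) {
        const float v = prob[c * height * width + y * width + x];
        if (v > max_prob) {
          max_prob = v;
          max_index = c;
        }
      }
      mask[y * width + x] = static_cast<unsigned char>(max_index);
    }
  }
  return mask;
}
```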
diff --git a/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
index 4ee18b99e4bc..f4e544c11ffb 100644
--- a/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
+++ b/perception/tensorrt_yolox/src/tensorrt_yolox_node.cpp
@@ -15,6 +15,7 @@
 #include "tensorrt_yolox/tensorrt_yolox_node.hpp"

 #include "object_recognition_utils/object_classification.hpp"
+#include "perception_utils/run_length_encoder.hpp"

 #include
@@ -33,7 +34,7 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
    stop_watch_ptr_ =
      std::make_unique<tier4_autoware_utils::StopWatch<std::chrono::milliseconds>>();
    debug_publisher_ =
-      std::make_unique<tier4_autoware_utils::DebugPublisher>(this, "tensorrt_yolox");
+      std::make_unique<tier4_autoware_utils::DebugPublisher>(this, this->get_name());
    stop_watch_ptr_->tic("cyclic_time");
    stop_watch_ptr_->tic("processing_time");
  }
@@ -96,25 +97,51 @@ TrtYoloXNode::TrtYoloXNode(const rclcpp::NodeOptions & node_options)
    ("Path to a file which contains path to images."
     "Those images will be used for int8 quantization."));
+  std::string color_map_path = declare_parameter_with_description(
+    "color_map_path", "", ("Path to a file which contains the path to the color map."));
  if (!readLabelFile(label_path)) {
    RCLCPP_ERROR(this->get_logger(), "Could not find label file");
    rclcpp::shutdown();
  }
+
+  is_roi_overlap_segment_ = declare_parameter<bool>("is_roi_overlap_segment");
+  is_publish_color_mask_ = declare_parameter<bool>("is_publish_color_mask");
+  overlap_roi_score_threshold_ = declare_parameter<float>("overlap_roi_score_threshold");
+  roi_overlay_segment_labels_.UNKNOWN =
+    declare_parameter<bool>("roi_overlay_segment_label.UNKNOWN");
+  roi_overlay_segment_labels_.CAR = declare_parameter<bool>("roi_overlay_segment_label.CAR");
+  roi_overlay_segment_labels_.TRUCK = declare_parameter<bool>("roi_overlay_segment_label.TRUCK");
+  roi_overlay_segment_labels_.BUS = declare_parameter<bool>("roi_overlay_segment_label.BUS");
+  roi_overlay_segment_labels_.MOTORCYCLE =
+    declare_parameter<bool>("roi_overlay_segment_label.MOTORCYCLE");
+  roi_overlay_segment_labels_.BICYCLE =
+    declare_parameter<bool>("roi_overlay_segment_label.BICYCLE");
+  roi_overlay_segment_labels_.PEDESTRIAN =
+    declare_parameter<bool>("roi_overlay_segment_label.PEDESTRIAN");
+  roi_overlay_segment_labels_.ANIMAL = declare_parameter<bool>("roi_overlay_segment_label.ANIMAL");
  replaceLabelMap();

  tensorrt_common::BuildConfig build_config(
    calibration_algorithm, dla_core_id, quantize_first_layer, quantize_last_layer,
    profile_per_layer, clip_value);

+  const double norm_factor = 1.0;
+  const std::string cache_dir = "";
+  const tensorrt_common::BatchConfig batch_config{1, 1, 1};
+  const size_t max_workspace_size = (1 << 30);
+
  trt_yolox_ = std::make_unique<tensorrt_yolox::TrtYoloX>(
    model_path, precision, label_map_.size(), score_threshold, nms_threshold, build_config,
-    preprocess_on_gpu, calibration_image_list_path);
+    preprocess_on_gpu, calibration_image_list_path, norm_factor, cache_dir, batch_config,
+    max_workspace_size, color_map_path);

  timer_ =
    rclcpp::create_timer(this, get_clock(), 100ms, std::bind(&TrtYoloXNode::onConnect, this));

  objects_pub_ = this->create_publisher<tier4_perception_msgs::msg::DetectedObjectsWithFeature>(
    "~/out/objects", 1);
+  mask_pub_ = image_transport::create_publisher(this, "~/out/mask");
+  color_mask_pub_ = image_transport::create_publisher(this, "~/out/color_mask");
  image_pub_ = image_transport::create_publisher(this, "~/out/image");

  if (declare_parameter("build_only", false)) {
@@ -129,7 +156,8 @@ void TrtYoloXNode::onConnect()
  if (
    objects_pub_->get_subscription_count() == 0 &&
    objects_pub_->get_intra_process_subscription_count() == 0 &&
-    image_pub_.getNumSubscribers() == 0) {
+    image_pub_.getNumSubscribers() == 0 && mask_pub_.getNumSubscribers() == 0 &&
+    color_mask_pub_.getNumSubscribers() == 0) {
    image_sub_.shutdown();
  } else if (!image_sub_) {
    image_sub_ = image_transport::create_subscription(
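The eight `roi_overlay_segment_label.*` flags declared above land in `roi_overlay_segment_labels_`, whose `isOverlay()` gates the mask refinement further down. The struct itself is declared in `tensorrt_yolox_node.hpp` and is not part of this diff; the sketch below is a hypothetical reconstruction of its shape from the calls visible here, with placeholder label codes standing in for the real autoware classification constants.

```cpp
#include <cstdint>

// Hypothetical sketch only: the real struct lives in tensorrt_yolox_node.hpp and
// compares against the message-defined object classification constants, not the
// placeholder enum used here.
struct RoiOverlaySegmentLabel
{
  bool UNKNOWN = false;
  bool CAR = false;
  bool TRUCK = false;
  bool BUS = false;
  bool MOTORCYCLE = false;
  bool BICYCLE = false;
  bool PEDESTRIAN = false;
  bool ANIMAL = false;

  // Placeholder label codes for illustration.
  enum PlaceholderLabel : uint8_t {
    kUnknown = 0, kCar, kTruck, kBus, kMotorcycle, kBicycle, kPedestrian, kAnimal
  };

  // True when a ROI with this classification may overwrite the segmentation
  // mask (see overlapSegmentByRoi below).
  bool isOverlay(const uint8_t label) const
  {
    switch (label) {
      case kUnknown: return UNKNOWN;
      case kCar: return CAR;
      case kTruck: return TRUCK;
      case kBus: return BUS;
      case kMotorcycle: return MOTORCYCLE;
      case kBicycle: return BICYCLE;
      case kPedestrian: return PEDESTRIAN;
      case kAnimal: return ANIMAL;
      default: return false;
    }
  }
};
```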
@@ -154,10 +182,16 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
  const auto height = in_image_ptr->image.rows;

  tensorrt_yolox::ObjectArrays objects;
-  if (!trt_yolox_->doInference({in_image_ptr->image}, objects)) {
+  // cv::Size expects (width, height); these single-entry placeholders are
+  // overwritten by doInference when segmentation heads are present
+  std::vector<cv::Mat> masks = {cv::Mat(cv::Size(width, height), CV_8UC1, cv::Scalar(0))};
+  std::vector<cv::Mat> color_masks = {
+    cv::Mat(cv::Size(width, height), CV_8UC3, cv::Scalar(0, 0, 0))};
+
+  if (!trt_yolox_->doInference({in_image_ptr->image}, objects, masks, color_masks)) {
    RCLCPP_WARN(this->get_logger(), "Fail to inference");
    return;
  }
+  auto & mask = masks.at(0);
+
  for (const auto & yolox_object : objects.at(0)) {
    tier4_perception_msgs::msg::DetectedObjectWithFeature object;
    object.feature.roi.x_offset = yolox_object.x_offset;
@@ -177,9 +211,28 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
    cv::rectangle(
      in_image_ptr->image, cv::Point(left, top), cv::Point(right, bottom), cv::Scalar(0, 0, 255),
      3, 8, 0);
+    // Refine the segmentation mask by overwriting it with the ROI's class.
+    // This should be removed once the segmentation accuracy is high enough.
+    if (is_roi_overlap_segment_ && trt_yolox_->getMultitaskNum() > 0) {
+      overlapSegmentByRoi(yolox_object, mask, width, height);
+    }
  }
-  image_pub_.publish(in_image_ptr->toImageMsg());
+  if (trt_yolox_->getMultitaskNum() > 0) {
+    sensor_msgs::msg::Image::SharedPtr out_mask_msg =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::MONO8, mask)
+        .toImageMsg();
+    out_mask_msg->header = msg->header;
+    // Pack the mask as run-length encoded (uint8_t value, int32 length) pairs.
+    std::vector<std::pair<uint8_t, int>> compressed_data =
+      perception_utils::runLengthEncoder(mask);
+    int step = sizeof(uint8_t) + sizeof(int);
+    out_mask_msg->data.resize(static_cast<size_t>(compressed_data.size()) * step);
+    for (size_t i = 0; i < compressed_data.size(); ++i) {
+      std::memcpy(&out_mask_msg->data[i * step], &compressed_data.at(i).first, sizeof(uint8_t));
+      std::memcpy(&out_mask_msg->data[i * step + 1], &compressed_data.at(i).second, sizeof(int));
+    }
+    mask_pub_.publish(out_mask_msg);
+  }
+  image_pub_.publish(in_image_ptr->toImageMsg());

  out_objects.header = msg->header;
  objects_pub_->publish(out_objects);
@@ -198,6 +251,16 @@ void TrtYoloXNode::onImage(const sensor_msgs::msg::Image::ConstSharedPtr msg)
    debug_publisher_->publish<tier4_debug_msgs::msg::Float64Stamped>(
      "debug/pipeline_latency_ms", pipeline_latency_ms);
  }
+
+  if (is_publish_color_mask_ && trt_yolox_->getMultitaskNum() > 0) {
+    cv::Mat color_mask = cv::Mat::zeros(mask.rows, mask.cols, CV_8UC3);
+    trt_yolox_->getColorizedMask(trt_yolox_->getColorMap(), mask, color_mask);
+    sensor_msgs::msg::Image::SharedPtr output_color_mask_msg =
+      cv_bridge::CvImage(std_msgs::msg::Header(), sensor_msgs::image_encodings::BGR8, color_mask)
+        .toImageMsg();
+    output_color_mask_msg->header = msg->header;
+    color_mask_pub_.publish(output_color_mask_msg);
+  }
}

bool TrtYoloXNode::readLabelFile(const std::string & label_path)
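Since `~/out/mask` now carries run-length encoded bytes in the Image `data` field, while `height` and `width` still describe the decoded mask, consumers must decode before use with the `perception_utils::runLengthDecoder` added in this diff. A minimal subscriber sketch; the topic name and node boilerplate are illustrative only, and note that the decoded mask is at the network's output resolution, which is generally smaller than the camera image.

```cpp
#include "perception_utils/run_length_encoder.hpp"

#include <opencv2/core.hpp>
#include <rclcpp/rclcpp.hpp>
#include <sensor_msgs/msg/image.hpp>

#include <memory>

// Decodes the RLE-compressed mask topic back into a CV_8UC1 class-index image.
class MaskSubscriber : public rclcpp::Node
{
public:
  MaskSubscriber() : Node("mask_subscriber")
  {
    sub_ = create_subscription<sensor_msgs::msg::Image>(
      "/perception/object_recognition/detection/mask", 1,  // illustrative topic name
      [this](const sensor_msgs::msg::Image::ConstSharedPtr msg) {
        // data holds (uint8_t value, int32 length) pairs; height/width give the
        // decoded mask size.
        const cv::Mat mask =
          perception_utils::runLengthDecoder(msg->data, msg->height, msg->width);
        RCLCPP_INFO(get_logger(), "decoded mask: %d x %d", mask.cols, mask.rows);
      });
  }

private:
  rclcpp::Subscription<sensor_msgs::msg::Image>::SharedPtr sub_;
};

int main(int argc, char ** argv)
{
  rclcpp::init(argc, argv);
  rclcpp::spin(std::make_shared<MaskSubscriber>());
  rclcpp::shutdown();
  return 0;
}
```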
@@ -234,6 +297,37 @@ void TrtYoloXNode::replaceLabelMap()
  }
}

+int TrtYoloXNode::mapRoiLabel2SegLabel(const int32_t roi_label_index)
+{
+  if (roi_overlay_segment_labels_.isOverlay(static_cast<uint8_t>(roi_label_index))) {
+    std::string label = label_map_[roi_label_index];
+    return remap_roi_to_semantic_[label];
+  }
+  return -1;
+}
+
+void TrtYoloXNode::overlapSegmentByRoi(
+  const tensorrt_yolox::Object & roi_object, cv::Mat & mask, const int orig_width,
+  const int orig_height)
+{
+  if (roi_object.score < overlap_roi_score_threshold_) return;
+  int seg_class_index = mapRoiLabel2SegLabel(roi_object.type);
+  if (seg_class_index < 0) return;
+
+  // The mask may be at a lower resolution than the original image, so scale
+  // the ROI into mask coordinates before overwriting.
+  const float scale_x = static_cast<float>(mask.cols) / static_cast<float>(orig_width);
+  const float scale_y = static_cast<float>(mask.rows) / static_cast<float>(orig_height);
+  const int roi_width = static_cast<int>(roi_object.width * scale_x);
+  const int roi_height = static_cast<int>(roi_object.height * scale_y);
+  const int roi_x_offset = static_cast<int>(roi_object.x_offset * scale_x);
+  const int roi_y_offset = static_cast<int>(roi_object.y_offset * scale_y);
+
+  cv::Mat replace_roi(
+    cv::Size(roi_width, roi_height), mask.type(), static_cast<uint8_t>(seg_class_index));
+  replace_roi.copyTo(mask.colRange(roi_x_offset, roi_x_offset + roi_width)
+                       .rowRange(roi_y_offset, roi_y_offset + roi_height));
+}
+
 }  // namespace tensorrt_yolox

 #include "rclcpp_components/register_node_macro.hpp"
diff --git a/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp b/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp
index 0657f0096b07..360f41e470e3 100644
--- a/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp
+++ b/perception/tensorrt_yolox/src/yolox_single_image_inference_node.cpp
@@ -47,7 +47,9 @@ class YoloXSingleImageInferenceNode : public rclcpp::Node
    auto trt_yolox = std::make_unique<tensorrt_yolox::TrtYoloX>(model_path, precision);
    auto image = cv::imread(image_path);
    tensorrt_yolox::ObjectArrays objects;
-    trt_yolox->doInference({image}, objects);
+    std::vector<cv::Mat> masks;
+    std::vector<cv::Mat> color_masks;
+    trt_yolox->doInference({image}, objects, masks, color_masks);
    for (const auto & object : objects[0]) {
      const auto left = object.x_offset;
      const auto top = object.y_offset;
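To make the ROI-to-mask scaling in `overlapSegmentByRoi` concrete, here is a self-contained check with made-up numbers: a 1280x960 image paired with a 640x480 mask gives scale factors of 0.5, so a ROI at (100, 40) of size 200x80 overwrites the mask region starting at (50, 20) with size 100x40. All sizes and the class index 7 are illustrative.

```cpp
#include <cassert>
#include <opencv2/core.hpp>

int main()
{
  // Hypothetical sizes: full-resolution image vs. half-resolution mask.
  const int orig_width = 1280, orig_height = 960;
  cv::Mat mask = cv::Mat::zeros(480, 640, CV_8UC1);

  const float scale_x = static_cast<float>(mask.cols) / static_cast<float>(orig_width);
  const float scale_y = static_cast<float>(mask.rows) / static_cast<float>(orig_height);

  // ROI in original-image pixels (illustrative values).
  const int x_offset = 100, y_offset = 40, width = 200, height = 80;
  const int roi_x = static_cast<int>(x_offset * scale_x);  // 50
  const int roi_y = static_cast<int>(y_offset * scale_y);  // 20
  const int roi_w = static_cast<int>(width * scale_x);     // 100
  const int roi_h = static_cast<int>(height * scale_y);    // 40

  // Overwrite the sub-region with a segmentation class index, as the node does.
  cv::Mat replace_roi(cv::Size(roi_w, roi_h), mask.type(), cv::Scalar(7));
  replace_roi.copyTo(mask.colRange(roi_x, roi_x + roi_w).rowRange(roi_y, roi_y + roi_h));

  assert(mask.at<uint8_t>(roi_y, roi_x) == 7);              // inside the region
  assert(mask.at<uint8_t>(roi_y + roi_h, roi_x) == 0);      // just outside it
  return 0;
}
```

Because `floor(a) + floor(b) <= floor(a + b)` for non-negative values, the truncated offsets and sizes can never push the region past the mask bounds, so the `colRange`/`rowRange` views stay valid without explicit clamping.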