diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..ec4a626 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.pth filter=lfs diff=lfs merge=lfs -text diff --git a/demos/reid_demo_0.webm b/demos/reid_demo_0.webm new file mode 100644 index 0000000..d9f24af Binary files /dev/null and b/demos/reid_demo_0.webm differ diff --git a/demos/reid_demo_1.webm b/demos/reid_demo_1.webm new file mode 100644 index 0000000..abfd7bb Binary files /dev/null and b/demos/reid_demo_1.webm differ diff --git a/tracking_re_id/launch/reid_headless.launch.py b/tracking_re_id/launch/reid_headless.launch.py new file mode 100644 index 0000000..b7a0261 --- /dev/null +++ b/tracking_re_id/launch/reid_headless.launch.py @@ -0,0 +1,51 @@ +"""Reid pipeline in headless mode (no cv2 display windows). + +To view output: + ros2 run rqt_image_view rqt_image_view → select /reid/annotated + rviz2 → add MarkerArray on /reid/track_markers and /keypoint_markers +""" + +import os +from launch import LaunchDescription +from launch.actions import ExecuteProcess + + +def generate_launch_description(): + python_exe = os.path.expanduser('~/miniconda3/envs/mmpose/bin/python3') + keyreID_path = os.path.expanduser('~/KeyRe-ID') + + return LaunchDescription([ + + ExecuteProcess( + cmd=[ + python_exe, '-m', 'tracking_re_id.single_person_loc_node', + '--ros-args', + '-p', 'threshold:=0.3', + '-p', 'device:=cuda:0', + '-p', 'max_residual:=0.10', + '-p', 'headless:=true', + ], + output='screen', + env={**os.environ}, + ), + + ExecuteProcess( + cmd=[ + python_exe, '-m', 'tracking_re_id.reid_node', + '--ros-args', + '-p', f'keyreID_path:={keyreID_path}', + '-p', 'num_classes:=150', + '-p', 'camera_num:=2', + '-p', 'device:=cuda:0', + '-p', 'seq_len:=4', + '-p', 'kp_threshold:=0.3', + '-p', 'match_threshold:=0.65', + '-p', 'track_dist_px:=120.0', + '-p', 'track_timeout:=3.0', + '-p', 'headless:=true', + ], + output='screen', + env={**os.environ}, + ), + + ]) diff --git 
a/tracking_re_id/launch/reid_pipeline.launch.py b/tracking_re_id/launch/reid_pipeline.launch.py new file mode 100644 index 0000000..9ebba19 --- /dev/null +++ b/tracking_re_id/launch/reid_pipeline.launch.py @@ -0,0 +1,70 @@ +"""Launch the KeyRe-ID re-identification pipeline alongside the existing +stereo triangulation pipeline. + +Nodes started +───────────── + 1. single_person_loc_node (unchanged – stereo 3-D triangulation) + publishes: /keypoint_markers (MarkerArray) + /keypoints_3d (PointCloud2) + + 2. reid_node (self-contained – left-camera MMPose + KeyRe-ID) + publishes: /reid/annotated (Image) + /reid/track_markers (MarkerArray) + +The two nodes are independent: reid_node runs its own MMPose instance on +the left camera only and does not depend on single_person_loc_node output. +Run them together to get both 3-D triangulation and persistent person IDs, +or launch reid_node on its own if only re-identification is needed. + +Viewing the output +────────────────── + ros2 run rqt_image_view rqt_image_view → /reid/annotated + rviz2 → add MarkerArray /reid/track_markers and /keypoint_markers +""" + +import os +from launch import LaunchDescription +from launch.actions import ExecuteProcess + + +def generate_launch_description(): + python_exe = os.path.expanduser('~/miniconda3/envs/mmpose/bin/python3') + keyreID_path = os.path.expanduser('~/KeyRe-ID') + + return LaunchDescription([ + + # # ── 1. Stereo keypoint triangulator (3-D, unchanged) ───────────────── + # ExecuteProcess( + # cmd=[ + # python_exe, '-m', 'tracking_re_id.single_person_loc_node', + # '--ros-args', + # '-p', 'threshold:=0.3', + # '-p', 'device:=cuda:0', + # '-p', 'max_residual:=0.10', + # '-p', 'headless:=true', + # ], + # output='screen', + # env={**os.environ}, + # ), + + # ── 2. 
KeyRe-ID re-identification (self-contained) ─────────────────── + ExecuteProcess( + cmd=[ + python_exe, '-m', 'tracking_re_id.reid_node', + '--ros-args', + '-p', f'keyreID_path:={keyreID_path}', + '-p', 'num_classes:=150', + '-p', 'camera_num:=2', + '-p', 'device:=cuda:0', + '-p', 'seq_len:=4', + '-p', 'kp_threshold:=0.3', + '-p', 'match_threshold:=0.65', + '-p', 'track_dist_px:=120.0', + '-p', 'track_timeout:=3.0', + '-p', 'headless:=false', + ], + output='screen', + env={**os.environ}, + ), + + ]) diff --git a/tracking_re_id/setup.py b/tracking_re_id/setup.py index d8883fe..5456350 100644 --- a/tracking_re_id/setup.py +++ b/tracking_re_id/setup.py @@ -13,6 +13,7 @@ setup( ['resource/' + package_name]), ('share/' + package_name, ['package.xml']), (os.path.join('share', package_name, 'launch'), glob('launch/*.py')), + (os.path.join('share', package_name, 'weights'), glob('weights/*.pth')), ], install_requires=['setuptools'], zip_safe=True, @@ -30,6 +31,7 @@ setup( 'single_person_loc_node = tracking_re_id.single_person_loc_node:main', 'ground_plane_node = tracking_re_id.ground_plane_node:main', 'overlay_node = tracking_re_id.overlay_node:main', + 'reid_node = tracking_re_id.reid_node:main', ], }, ) diff --git a/tracking_re_id/tracking_re_id/reid_node.py b/tracking_re_id/tracking_re_id/reid_node.py new file mode 100644 index 0000000..cc02cfc --- /dev/null +++ b/tracking_re_id/tracking_re_id/reid_node.py @@ -0,0 +1,431 @@ +""" +reid_node.py + +Self-contained ROS 2 node: MMPose 2-D pose estimation on the left stereo +camera + KeyRe-ID person re-identification. 
+ +Pipeline (per frame) +──────────────────── + /stereo/left/image_raw + │ + ▼ + MMPoseInferencer (pose2d='human') + │ per-person keypoints (17, 2) + scores (17,) + ▼ + bbox extraction → person crop (256×128) + keypoint → 6-channel body-part heatmap (matches training pipeline) + │ + ▼ + Hungarian matching → per-track Tracklet buffer (deque, maxlen=seq_len) + │ + ▼ (once buffer full and person_id is None) + KeyRe-ID inference → feature embedding + │ + ▼ + Gallery cosine match → assign / register persistent person_id + │ + ▼ + /reid/annotated (Image) – left frame annotated with IDs + /reid/track_markers (MarkerArray) – labelled text markers for RViz + +Parameters +────────── + weights_path str path to iLIDSVIDbest_CMC.pth (required) + keyreID_path str path to KeyRe-ID source directory + num_classes int training split size (150 for iLIDS-VID split-0) + camera_num int cameras in training set (2 for iLIDS-VID) + device str 'cuda:0' or 'cpu' + seq_len int frames per tracklet clip (default 4) + kp_threshold float min keypoint confidence + match_threshold float cosine-similarity threshold for gallery match + track_dist_px float max centroid distance (px) to keep a track alive + track_timeout float seconds before an unseen track is dropped + headless bool suppress cv2 display window +""" + +import os +import sys +import time +import colorsys + +from ament_index_python.packages import get_package_share_directory + +import cv2 +import numpy as np +import rclpy +from rclpy.node import Node +from sensor_msgs.msg import Image +from visualization_msgs.msg import Marker, MarkerArray +from cv_bridge import CvBridge + +import torch + +from .reid_utils import ( + keypoints_to_heatmap, + keypoints_to_bbox, + clamp_bbox, + transform_keypoints_to_crop, + preprocess_crop, + preprocess_heatmap, + Tracklet, + Gallery, +) + + +# ── Hungarian matching ──────────────────────────────────────────────────────── + +def _hungarian_match(tracks: dict, detections: list, max_dist: float): + """ + 
Associate existing tracks to new per-frame detections by 2-D centroid + distance using the Hungarian algorithm. + + Returns: + matched : list of (track_id, det_index) + unmatched_tracks : track_ids with no detection this frame + unmatched_dets : detection indices with no existing track + """ + from scipy.optimize import linear_sum_assignment + + track_ids = list(tracks.keys()) + n_t, n_d = len(track_ids), len(detections) + + if n_t == 0: + return [], [], list(range(n_d)) + if n_d == 0: + return [], track_ids, [] + + cost = np.full((n_t, n_d), max_dist + 1.0) + for i, tid in enumerate(track_ids): + tc = tracks[tid].centroid + if tc is None: + continue + for j, det in enumerate(detections): + dc = det['centroid'] + if dc is not None: + cost[i, j] = float(np.hypot(tc[0] - dc[0], tc[1] - dc[1])) + + row_ind, col_ind = linear_sum_assignment(cost) + + matched, used_t, used_d = [], set(), set() + for ri, ci in zip(row_ind, col_ind): + if cost[ri, ci] <= max_dist: + matched.append((track_ids[ri], ci)) + used_t.add(track_ids[ri]) + used_d.add(ci) + + return ( + matched, + [tid for tid in track_ids if tid not in used_t], + [j for j in range(n_d) if j not in used_d], + ) + + +# ── Stable per-person colour ────────────────────────────────────────────────── + +def _id_colour(person_id: int) -> tuple: + r, g, b = colorsys.hsv_to_rgb(((person_id * 0.37) % 1.0), 0.9, 1.0) + return (int(b * 255), int(g * 255), int(r * 255)) + + +# ═══════════════════════════════════════════════════════════════════════════════ + +class ReIDNode(Node): + """KeyRe-ID re-identification node with integrated MMPose 2-D detection.""" + + def __init__(self): + super().__init__('reid_node') + + # ── Parameters ────────────────────────────────────────────────────── + self.declare_parameter('weights_path', + os.path.join( + get_package_share_directory('tracking_re_id'), + 'weights', 'iLIDSVIDbest_CMC.pth')) + self.declare_parameter('keyreID_path', + os.path.expanduser('~/KeyRe-ID')) + 
self.declare_parameter('num_classes', 150) + self.declare_parameter('camera_num', 2) + self.declare_parameter('device', 'cuda:0') + self.declare_parameter('seq_len', 4) + self.declare_parameter('kp_threshold', 0.3) + self.declare_parameter('match_threshold', 0.65) + self.declare_parameter('track_dist_px', 120.0) + self.declare_parameter('track_timeout', 3.0) + self.declare_parameter('headless', False) + + weights_path = self.get_parameter('weights_path').value + keyreID_path = self.get_parameter('keyreID_path').value + num_classes = self.get_parameter('num_classes').value + camera_num = self.get_parameter('camera_num').value + device_str = self.get_parameter('device').value + self._seq_len = self.get_parameter('seq_len').value + self._kp_thresh = self.get_parameter('kp_threshold').value + self._match_thresh = self.get_parameter('match_threshold').value + self._track_dist = self.get_parameter('track_dist_px').value + self._track_timeout = self.get_parameter('track_timeout').value + self._headless = self.get_parameter('headless').value + + self._device = torch.device( + device_str if torch.cuda.is_available() else 'cpu') + self.get_logger().info(f'Using device: {self._device}') + + # ── MMPose ─────────────────────────────────────────────────────────── + from mmpose.apis import MMPoseInferencer # noqa: PLC0415 + self.get_logger().info(f'Loading MMPose on {device_str} …') + self._inferencer = MMPoseInferencer(pose2d='human', device=device_str) + self.get_logger().info('MMPose loaded.') + + # ── KeyRe-ID ───────────────────────────────────────────────────────── + if keyreID_path not in sys.path: + sys.path.insert(0, keyreID_path) + try: + from KeyRe_ID_model import KeyRe_ID # noqa: PLC0415 + except ImportError as exc: + self.get_logger().fatal( + f'Cannot import KeyRe_ID_model from {keyreID_path}: {exc}') + raise + + self.get_logger().info(f'Loading KeyRe-ID weights from {weights_path} …') + self._model = KeyRe_ID( + num_classes=num_classes, + camera_num=camera_num, + 
pretrainpath=None, + ) + self._model.load_param(weights_path, load=False) + self._model.to(self._device) + self._model.eval() + self.get_logger().info('KeyRe-ID model ready.') + + # ── ROS infrastructure ─────────────────────────────────────────────── + self._bridge = CvBridge() + + self.create_subscription( + Image, '/stereo/left/image_raw', self._image_cb, 10) + + self._vis_pub = self.create_publisher(Image, '/reid/annotated', 10) + self._marker_pub = self.create_publisher(MarkerArray, '/reid/track_markers', 10) + + # ── State ──────────────────────────────────────────────────────────── + self._tracks: dict[int, Tracklet] = {} + self._next_track_id: int = 0 + self._gallery = Gallery(threshold=self._match_thresh) + + self._display_frame = None + if not self._headless: + self.create_timer(1.0 / 30.0, self._display_timer_cb) + + self.get_logger().info( + 'reid_node ready. Waiting for /stereo/left/image_raw …') + + # ── MMPose helper ───────────────────────────────────────────────────────── + + def _run_mmpose(self, frame: np.ndarray) -> list: + """Return list of dicts {keypoints: (17,2), scores: (17,)}.""" + result = next(self._inferencer(frame, show=False, return_datasamples=False)) + people = [] + for pred in result.get('predictions', [[]])[0]: + kps = pred.get('keypoints', []) + scores = pred.get('keypoint_scores', []) + if len(kps) > 0: + people.append({ + 'keypoints': np.array(kps, dtype=np.float32), + 'scores': np.array(scores, dtype=np.float32), + }) + return people + + # ── Main image callback ─────────────────────────────────────────────────── + + def _image_cb(self, img_msg: Image): + now = time.time() + frame = self._bridge.imgmsg_to_cv2(img_msg, desired_encoding='bgr8') + frame_h, frame_w = frame.shape[:2] + + # Detect people and compute centroids / bboxes + raw_people = self._run_mmpose(frame) + detections = [] + for person in raw_people: + kps, scores = person['keypoints'], person['scores'] + bbox = keypoints_to_bbox(kps, scores, 
threshold=self._kp_thresh)
+            if bbox is None:
+                continue
+            visible = scores > self._kp_thresh
+            centroid = (float(kps[visible, 0].mean()),
+                        float(kps[visible, 1].mean())) if np.any(visible) else None
+            if centroid is None:
+                continue
+            detections.append({
+                'keypoints': kps,
+                'scores': scores,
+                'bbox': bbox,
+                'centroid': centroid,
+            })
+
+        # Associate detections to existing tracks (Hungarian on centroid distance)
+        matched, unmatched_tracks, unmatched_dets = _hungarian_match(
+            self._tracks, detections, self._track_dist)
+
+        for tid, det_idx in matched:
+            self._update_track(
+                self._tracks[tid], detections[det_idx],
+                frame, frame_w, frame_h, now)
+
+        # Unmatched detections spawn fresh tracks
+        for det_idx in unmatched_dets:
+            tid = self._next_track_id
+            self._next_track_id += 1
+            self._tracks[tid] = Tracklet(tid, seq_len=self._seq_len)
+            self._update_track(
+                self._tracks[tid], detections[det_idx],
+                frame, frame_w, frame_h, now)
+
+        # Re-ID: only for tracks whose buffer just became full (person_id still None).
+        # The person_id guard ensures the (expensive) KeyRe-ID forward pass runs
+        # once per track, instead of on every frame while the buffer stays full.
+        with torch.no_grad():
+            for track in self._tracks.values():
+                if track.is_ready() and track.person_id is None:
+                    self._run_reid(track)
+
+        # Drop stale tracks (unseen longer than track_timeout seconds)
+        for tid in [tid for tid, t in self._tracks.items()
+                    if now - t.last_seen > self._track_timeout]:
+            del self._tracks[tid]
+
+        # Publish annotated frame (header copied so stamps line up with input)
+        vis = self._build_visualisation(frame.copy())
+        out = self._bridge.cv2_to_imgmsg(vis, encoding='bgr8')
+        out.header = img_msg.header
+        self._vis_pub.publish(out)
+        self._publish_markers(img_msg.header.stamp)
+
+        if not self._headless:
+            self._display_frame = vis
+
+    # ── Track update ──────────────────────────────────────────────────────────
+
+    def _update_track(self, track: Tracklet, det: dict,
+                      frame: np.ndarray, frame_w: int, frame_h: int,
+                      timestamp: float):
+        x1, y1, x2, y2 = clamp_bbox(*det['bbox'], frame_w, frame_h)
+        # Reject degenerate crops: too small to resize/embed meaningfully.
+        if x2 - x1 < 10 or y2 - y1 < 10:
+            return
+
+        crop_bgr = frame[y1:y2, x1:x2]
+        kp_xyc, crop_w, crop_h = transform_keypoints_to_crop(
+            det['keypoints'], det['scores'], x1, y1, x2, y2)
+        heatmap_np = keypoints_to_heatmap(
+            kp_xyc, crop_w,
crop_h, vis_thresh=self._kp_thresh) + + try: + crop_t = preprocess_crop(crop_bgr) + heatmap_t = preprocess_heatmap(heatmap_np) + except Exception as exc: + self.get_logger().warn( + f'Preprocess failed for track {track.track_id}: {exc}') + return + + track.add_frame(crop_t, heatmap_t, + centroid=det['centroid'], + bbox=(x1, y1, x2, y2), + timestamp=timestamp) + + # ── KeyRe-ID inference ──────────────────────────────────────────────────── + + def _run_reid(self, track: Tracklet): + imgs, hmaps = track.get_model_inputs() + imgs = imgs.to(self._device) + hmaps = hmaps.to(self._device) + # cam_label=0: left stereo camera → iLIDS-VID cam1 index + feat = self._model(imgs, hmaps, None, cam_label=0) + feature = feat[0].cpu() + track.feature = feature + track.person_id, track.match_sim = self._gallery.match_or_register(feature) + + # ── Visualisation ───────────────────────────────────────────────────────── + + def _build_visualisation(self, frame: np.ndarray) -> np.ndarray: + for track in self._tracks.values(): + if track.bbox is None: + continue + x1, y1, x2, y2 = track.bbox + if track.person_id is not None: + colour = _id_colour(track.person_id) + label = f'P{track.person_id} ({track.match_sim:.2f})' + else: + colour = (160, 160, 160) + label = f'T{track.track_id} ({len(track.crops)}/{self._seq_len})' + + cv2.rectangle(frame, (x1, y1), (x2, y2), colour, 2) + (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.55, 1) + cv2.rectangle(frame, (x1, y1 - th - 6), (x1 + tw + 4, y1), colour, -1) + cv2.putText(frame, label, (x1 + 2, y1 - 4), + cv2.FONT_HERSHEY_SIMPLEX, 0.55, (0, 0, 0), 1, cv2.LINE_AA) + + cv2.putText(frame, + f'Known: {len(self._gallery)} Tracks: {len(self._tracks)}', + (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.6, + (255, 255, 255), 2, cv2.LINE_AA) + return frame + + # ── RViz markers ───────────────────────────────────────────────────────── + + def _publish_markers(self, stamp): + ma = MarkerArray() + delete = Marker() + delete.action = 
Marker.DELETEALL + delete.header.frame_id = 'left' + delete.header.stamp = stamp + ma.markers.append(delete) + + mid = 0 + for track in self._tracks.values(): + if track.centroid is None or track.person_id is None: + continue + colour = _id_colour(track.person_id) + m = Marker() + m.header.frame_id = 'left' + m.header.stamp = stamp + m.ns = 'reid_labels' + m.id = mid; mid += 1 + m.type = Marker.TEXT_VIEW_FACING + m.action = Marker.ADD + m.pose.position.x = float(track.centroid[0]) / 100.0 + m.pose.position.y = float(track.centroid[1]) / 100.0 + m.pose.position.z = 2.0 + m.pose.orientation.w = 1.0 + m.scale.z = 0.15 + m.color.r = colour[2] / 255.0 + m.color.g = colour[1] / 255.0 + m.color.b = colour[0] / 255.0 + m.color.a = 1.0 + m.text = f'P{track.person_id}' + m.lifetime.nanosec = 500_000_000 + ma.markers.append(m) + + self._marker_pub.publish(ma) + + # ── Display timer ───────────────────────────────────────────────────────── + + def _display_timer_cb(self): + if self._display_frame is not None: + cv2.imshow('KeyRe-ID', self._display_frame) + if cv2.waitKey(1) & 0xFF == ord('q'): + self.get_logger().info('Quit requested.') + self.destroy_node() + rclpy.shutdown() + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(args=None): + rclpy.init(args=args) + node = ReIDNode() + try: + rclpy.spin(node) + except KeyboardInterrupt: + pass + finally: + if not node._headless: + cv2.destroyAllWindows() + node.destroy_node() + rclpy.try_shutdown() + + +if __name__ == '__main__': + main() diff --git a/tracking_re_id/tracking_re_id/reid_utils.py b/tracking_re_id/tracking_re_id/reid_utils.py new file mode 100644 index 0000000..0d0b8dc --- /dev/null +++ b/tracking_re_id/tracking_re_id/reid_utils.py @@ -0,0 +1,326 @@ +""" +reid_utils.py + +Shared utilities for the KeyRe-ID ROS 2 node: + - keypoints_to_heatmap() -- matches the generate_heatmaps_ilids.py training pipeline + - preprocess_crop() -- matches val_transforms from 
heatmap_loader.py + - preprocess_heatmap() -- matches CustomHeatmapTransform from heatmap_loader.py + - keypoints_to_bbox() -- derive a bounding box from visible keypoints + - transform_keypoints_to_crop() -- remap full-frame kps to crop-relative coords + - Tracklet -- per-track frame buffer + - Gallery -- known-person feature store with cosine re-ID +""" + +from collections import deque + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +import torchvision.transforms as T +from scipy.ndimage import gaussian_filter +from torchvision.transforms import InterpolationMode + +# ── COCO keypoint indices ───────────────────────────────────────────────────── +NOSE, LEYE, REYE, LEAR, REAR = 0, 1, 2, 3, 4 +LS, RS, LE, RE, LW, RW = 5, 6, 7, 8, 9, 10 +LH, RH, LK, RK, LA, RA = 11, 12, 13, 14, 15, 16 + + +# ── Heatmap generation (identical to generate_heatmaps_ilids.py) ────────────── + +def keypoints_to_heatmap(kp_array, img_w, img_h, vis_thresh=0.1): + """ + Convert 17 COCO keypoints to 6-channel body-part heatmaps. + + This exactly reproduces the pipeline used to generate the training data so + the heatmap distribution seen at inference matches what the model trained on. 
+ + Args: + kp_array : np.ndarray (17, 3) [x, y, confidence] + img_w : int width of the source image (crop) + img_h : int height of the source image (crop) + vis_thresh : float confidence threshold + + Returns: + np.ndarray (6, img_h, img_w) channels: head, torso, l-arm, r-arm, l-leg, r-leg + """ + heatmaps = np.zeros((6, img_h, img_w), dtype=np.float32) + blur_sigma = max(1.0, img_w / 16.0) + line_thickness = max(1, int(img_w / 8)) + + def vis(i): + return kp_array[i, 2] > vis_thresh + + def pt(i): + return (int(kp_array[i, 0]), int(kp_array[i, 1])) + + def in_bounds(i): + x, y = int(kp_array[i, 0]), int(kp_array[i, 1]) + return 0 <= x < img_w and 0 <= y < img_h + + def usable(i): + return vis(i) and in_bounds(i) + + # Channel 0: Head + head_pts = [pt(i) for i in [NOSE, LEYE, REYE, LEAR, REAR] if usable(i)] + if len(head_pts) >= 3: + hull = cv2.convexHull(np.array(head_pts, dtype=np.int32)) + cv2.fillConvexPoly(heatmaps[0], hull, 1.0) + elif head_pts: + for p in head_pts: + cv2.circle(heatmaps[0], p, line_thickness, 1.0, -1) + + # Channel 1: Torso + torso_order = [LS, RS, RH, LH] + torso_usable = [i for i in torso_order if usable(i)] + if len(torso_usable) == 4: + pts = np.array([pt(i) for i in torso_order], dtype=np.int32) + cv2.fillPoly(heatmaps[1], [pts], 1.0) + elif len(torso_usable) >= 3: + hull = cv2.convexHull(np.array([pt(i) for i in torso_usable], dtype=np.int32)) + cv2.fillConvexPoly(heatmaps[1], hull, 1.0) + elif len(torso_usable) == 2: + pts_list = [pt(i) for i in torso_usable] + cv2.line(heatmaps[1], pts_list[0], pts_list[1], 1.0, line_thickness) + elif len(torso_usable) == 1: + cv2.circle(heatmaps[1], pt(torso_usable[0]), line_thickness, 1.0, -1) + + # Channels 2-5: Limbs + limbs = [ + (2, LE, LW), # left arm + (3, RE, RW), # right arm + (4, LK, LA), # left leg + (5, RK, RA), # right leg + ] + for ch, j1, j2 in limbs: + u1, u2 = usable(j1), usable(j2) + if u1 and u2: + cv2.line(heatmaps[ch], pt(j1), pt(j2), 1.0, line_thickness) + elif u1: + 
cv2.circle(heatmaps[ch], pt(j1), line_thickness, 1.0, -1) + elif u2: + cv2.circle(heatmaps[ch], pt(j2), line_thickness, 1.0, -1) + + # Gaussian blur for soft edges + for i in range(6): + if heatmaps[i].max() > 0: + heatmaps[i] = gaussian_filter(heatmaps[i], sigma=blur_sigma) + + # L/R sanity check based on torso orientation + if usable(LS) and usable(RS): + shoulder_gap = abs(kp_array[LS, 0] - kp_array[RS, 0]) + if shoulder_gap > img_w * 0.05: + facing_camera = kp_array[LS, 0] > kp_array[RS, 0] + if usable(LE) and usable(RE): + if facing_camera != (kp_array[LE, 0] > kp_array[RE, 0]): + heatmaps[2], heatmaps[3] = heatmaps[3].copy(), heatmaps[2].copy() + if usable(LK) and usable(RK): + if facing_camera != (kp_array[LK, 0] > kp_array[RK, 0]): + heatmaps[4], heatmaps[5] = heatmaps[5].copy(), heatmaps[4].copy() + + # Normalize each channel to [0, 1] + for i in range(6): + mx = heatmaps[i].max() + if mx > 0: + heatmaps[i] /= mx + + return heatmaps + + +# ── Bounding box utilities ──────────────────────────────────────────────────── + +def keypoints_to_bbox(keypoints, scores, threshold=0.3, margin=0.15): + """ + Derive a bounding box (x1, y1, x2, y2) from visible COCO keypoints. + + Returns None if no keypoints are visible above threshold. 
+ """ + visible = scores > threshold + if not np.any(visible): + return None + kps = keypoints[visible] + x1, y1 = float(kps[:, 0].min()), float(kps[:, 1].min()) + x2, y2 = float(kps[:, 0].max()), float(kps[:, 1].max()) + w, h = x2 - x1, y2 - y1 + x1 -= w * margin + y1 -= h * margin * 1.5 # extra headroom + x2 += w * margin + y2 += h * margin * 0.5 + return x1, y1, x2, y2 + + +def clamp_bbox(x1, y1, x2, y2, frame_w, frame_h): + """Clamp bbox to image boundaries, return as ints.""" + x1 = max(0, int(x1)) + y1 = max(0, int(y1)) + x2 = min(frame_w, int(x2)) + y2 = min(frame_h, int(y2)) + return x1, y1, x2, y2 + + +def transform_keypoints_to_crop(keypoints, scores, x1, y1, x2, y2): + """ + Re-map full-frame keypoint coordinates to crop-local coordinates so that + the heatmap is generated in the same coordinate space as the training data + (where MMPose was run on individual person crops, not the full frame). + + Returns: + kp_xyc : np.ndarray (17, 3) crop-relative [x, y, score] + crop_w : int + crop_h : int + """ + kp_crop = keypoints.copy().astype(np.float32) + kp_crop[:, 0] -= x1 + kp_crop[:, 1] -= y1 + kp_xyc = np.column_stack([kp_crop, scores.astype(np.float32)]) + return kp_xyc, int(x2 - x1), int(y2 - y1) + + +# ── Preprocessing (must match heatmap_loader.py transforms) ────────────────── + +_IMG_TRANSFORM = T.Compose([ + T.Resize([256, 128], interpolation=InterpolationMode.BILINEAR), + T.ToTensor(), + T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), +]) + +_HM_NORMALIZE = T.Normalize(mean=[0.5] * 6, std=[0.5] * 6) + + +def preprocess_crop(crop_bgr): + """ + Preprocess a BGR person crop (numpy HxWx3) to a model-ready tensor. 
+ + Pipeline matches val_transforms in heatmap_loader.py: + Resize(256, 128) → ToTensor → Normalize(0.5, 0.5) + + Returns: + torch.Tensor (3, 256, 128) + """ + from PIL import Image as PILImage + rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB) + pil_img = PILImage.fromarray(rgb) + return _IMG_TRANSFORM(pil_img) + + +def preprocess_heatmap(heatmap_np): + """ + Preprocess a raw (6, H, W) numpy heatmap to a model-ready tensor. + + Pipeline matches CustomHeatmapTransform([256, 128]) in heatmap_loader.py: + Resize(256, 128) bilinear → MinMaxScale per channel → Normalize(0.5, 0.5) + + Returns: + torch.Tensor (6, 256, 128) + """ + t = torch.tensor(heatmap_np, dtype=torch.float32).unsqueeze(0) # (1, 6, H, W) + t = F.interpolate(t, size=(256, 128), mode='bilinear', align_corners=False) + t = t.squeeze(0) # (6, 256, 128) + # Min-max scale per channel + min_val = t.amin(dim=(1, 2), keepdim=True) + max_val = t.amax(dim=(1, 2), keepdim=True) + t = (t - min_val) / (max_val - min_val + 1e-6) + t = _HM_NORMALIZE(t) + return t + + +# ── Tracklet ────────────────────────────────────────────────────────────────── + +class Tracklet: + """ + Stores a rolling buffer of preprocessed (crop, heatmap) tensors for one + tracked person across time, plus the most recently assigned re-ID result. 
+ """ + + def __init__(self, track_id: int, seq_len: int = 4): + self.track_id = track_id + self.seq_len = seq_len + + # Rolling buffers – when full a new frame displaces the oldest + self.crops: deque = deque(maxlen=seq_len) # Tensor (3, 256, 128) + self.heatmaps: deque = deque(maxlen=seq_len) # Tensor (6, 256, 128) + + self.centroid = None # (cx, cy) pixel coords in latest frame + self.bbox = None # (x1, y1, x2, y2) in latest frame + self.last_seen: float = 0.0 # time.time() of latest update + + # Re-ID assignment (filled after gallery matching) + self.person_id: int | None = None + self.match_sim: float = 0.0 + + # Latest computed feature embedding (feat_dim,) + self.feature: torch.Tensor | None = None + + def add_frame(self, crop_t: torch.Tensor, heatmap_t: torch.Tensor, + centroid, bbox, timestamp: float): + self.crops.append(crop_t) + self.heatmaps.append(heatmap_t) + self.centroid = centroid + self.bbox = bbox + self.last_seen = timestamp + + def is_ready(self) -> bool: + """True once the rolling buffer has been filled at least once.""" + return len(self.crops) >= self.seq_len + + def get_model_inputs(self): + """ + Returns (imgs, hmaps) tensors shaped for KeyRe_ID: + imgs : (1, seq_len, 3, 256, 128) + hmaps : (1, seq_len, 6, 256, 128) + """ + imgs = torch.stack(list(self.crops)).unsqueeze(0) + hmaps = torch.stack(list(self.heatmaps)).unsqueeze(0) + return imgs, hmaps + + +# ── Gallery ─────────────────────────────────────────────────────────────────── + +class Gallery: + """ + Manages a set of known-person feature embeddings and performs cosine- + similarity-based re-identification. + + New persons are registered when their best match falls below `threshold`. + Known persons' embeddings are updated with an exponential moving average. 
+ """ + + def __init__(self, threshold: float = 0.65, ema_alpha: float = 0.9): + self.threshold = threshold + self.ema_alpha = ema_alpha + self._embeddings: dict[int, torch.Tensor] = {} # person_id → (feat_dim,) + self._next_id: int = 1 + + def match_or_register(self, feature: torch.Tensor) -> tuple[int, float]: + """ + Compare *feature* against all gallery entries. + + Returns: + (person_id, cosine_similarity) + If similarity < threshold the person is registered as new. + """ + feat_norm = F.normalize(feature.unsqueeze(0), dim=1) # (1, D) + + best_pid, best_sim = None, -1.0 + for pid, emb in self._embeddings.items(): + sim = float((feat_norm @ F.normalize(emb.unsqueeze(0), dim=1).T).item()) + if sim > best_sim: + best_sim, best_pid = sim, pid + + if best_pid is not None and best_sim >= self.threshold: + # Update existing entry with EMA + self._embeddings[best_pid] = ( + self.ema_alpha * self._embeddings[best_pid] + + (1.0 - self.ema_alpha) * feature.detach() + ) + return best_pid, best_sim + + # Register new person + pid = self._next_id + self._next_id += 1 + self._embeddings[pid] = feature.detach().clone() + return pid, best_sim if best_pid is not None else 1.0 + + def __len__(self) -> int: + return len(self._embeddings) diff --git a/tracking_re_id/weights/iLIDSVIDbest_CMC.pth b/tracking_re_id/weights/iLIDSVIDbest_CMC.pth new file mode 100644 index 0000000..ea1f9db --- /dev/null +++ b/tracking_re_id/weights/iLIDSVIDbest_CMC.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d539c7fc324323ef494ce713edc820d8cc4e2fb631c16b936cd1b9ae1c6424b +size 840297706