Update: I am having slightly better luck with the following Python:
import cv2
import numpy as np

target_image = cv2.imread("target.png", cv2.IMREAD_GRAYSCALE)
if target_image is None:
    raise FileNotFoundError("Target image not found. Make sure 'target.png' exists in the script directory.")

sift = cv2.SIFT_create()
keypoints_target, descriptors_target = sift.detectAndCompute(target_image, None)

FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)
matcher = cv2.FlannBasedMatcher(index_params, search_params)

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Could not open webcam.")

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        break

    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    keypoints_frame, descriptors_frame = sift.detectAndCompute(gray_frame, None)
    if descriptors_frame is None or len(keypoints_frame) < 10:
        continue  # Skip frame if not enough keypoints are detected

    matches = matcher.knnMatch(descriptors_target, descriptors_frame, k=2)

    # Lowe's ratio test: keep a match only if it's clearly better than the runner-up
    good_matches = []
    for m, n in matches:
        if m.distance < 0.75 * n.distance:
            good_matches.append(m)

    if len(good_matches) > 10:
        src_pts = np.float32([keypoints_target[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
        dst_pts = np.float32([keypoints_frame[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)

        # Compute homography
        H, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        if H is not None:
            # Get dimensions of target image
            h, w = target_image.shape
            corners = np.float32([[0, 0], [w, 0], [w, h], [0, h]]).reshape(-1, 1, 2)
            # Warp corners to the video frame
            transformed_corners = cv2.perspectiveTransform(corners, H)
            # Draw bounding box around detected object
            frame = cv2.polylines(frame, [np.int32(transformed_corners)], True, (0, 255, 0), 3, cv2.LINE_AA)

    cv2.imshow("SIFT Feature Matching & Homography", frame)

    # Exit on 'q' key press
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release resources
cap.release()
cv2.destroyAllWindows()
Differences from the previous version:
- Using SIFT instead of ORB
- FlannBasedMatcher (one caveat worth noting; see the sketch after this list)
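The caveat: the KD-tree index above only works with float descriptors like SIFT's. If I end up swapping ORB back in for speed, FLANN would need an LSH index instead. A minimal sketch, assuming ORB's binary descriptors (these parameter values are the commonly suggested starting points, not something I've tuned):

FLANN_INDEX_LSH = 6
index_params = dict(
    algorithm=FLANN_INDEX_LSH,  # LSH handles binary descriptors (ORB, BRIEF)
    table_number=6,             # number of hash tables
    key_size=12,                # bits per hash key
    multi_probe_level=1,        # also probe neighboring hash buckets
)
matcher = cv2.FlannBasedMatcher(index_params, dict(checks=50))

One wrinkle with LSH: knnMatch can return fewer than k candidates per query, so the ratio-test loop needs a length guard before unpacking m, n.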
The script performs fairly well when run just in Python. Honestly, it's still not as good as I remember SIFT tracking being, but it's within striking distance, close enough that I'm confident I can tweak it to get better results.
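For reference, the knobs I plan to reach for first are the detector thresholds and the two match-filtering constants. A sketch of where they live (the SIFT_create values below are OpenCV's defaults; the two thresholds are directions to experiment in, not tested recommendations):

sift = cv2.SIFT_create(
    nfeatures=0,             # 0 keeps every keypoint; capping this trades quality for speed
    contrastThreshold=0.04,  # lowering accepts more (weaker) keypoints
    edgeThreshold=10,        # raising keeps more edge-like keypoints
)
RATIO = 0.7        # stricter than the 0.75 above: fewer but more reliable matches
MIN_MATCHES = 15   # a higher floor than 10 cuts false positives before the homography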
However, when I copy this code into TD, it doesn't work at all. Here's what I have in a Script TOP:
# me - this DAT
# scriptOp - the OP which is cooking

import cv2
import numpy as np

target_image = cv2.imread("target.png", cv2.IMREAD_GRAYSCALE)
if target_image is None:
    raise FileNotFoundError("Target image not found. Make sure 'target.png' exists in the script directory.")

sift = cv2.SIFT_create()
keypoints_target, descriptors_target = sift.detectAndCompute(target_image, None)

FLANN_INDEX_KDTREE = 1
index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
search_params = dict(checks=50)
matcher = cv2.FlannBasedMatcher(index_params, search_params)


# press 'Setup Parameters' in the OP to call this function to re-create the parameters.
def onSetupParameters(scriptOp):
    return


# called whenever custom pulse parameter is pushed
def onPulse(par):
    return


def onCook(scriptOp):
    frame = scriptOp.inputs[0].numpyArray(delayed=False)
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2GRAY)
    gray_frame = (gray_frame * 255).astype(np.uint8)

    keypoints_frame, descriptors_frame = sift.detectAndCompute(gray_frame, None)
    if descriptors_frame is None or len(keypoints_frame) < 10:
        return  # Skip frame if not enough keypoints are detected

    matches = matcher.knnMatch(descriptors_target, descriptors_frame, k=2)

    # Lowe's ratio test: keep a match only if it's clearly better than the runner-up
    good_matches = []
    for m, n in matches:
        if m.distance < 0.75 * n.distance:
            good_matches.append(m)

    if len(good_matches) > 10:
        src_pts = np.float32([keypoints_target[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
        dst_pts = np.float32([keypoints_frame[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)

        # Compute homography
        H, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
        if H is not None:
            # Get dimensions of target image
            h, w = target_image.shape
            corners = np.float32([[0, 0], [w, 0], [w, h], [0, h]]).reshape(-1, 1, 2)
            # Warp corners to the video frame
            transformed_corners = cv2.perspectiveTransform(corners, H)
            # Draw bounding box around detected object
            frame = cv2.polylines(frame, [np.int32(transformed_corners)], True, (0, 255, 0), 3, cv2.LINE_AA)

    gray_frame = np.expand_dims(gray_frame, axis=-1)
    scriptOp.copyNumpyArray(frame)
    return
This leads me to believe that I'm doing something bad with the input frame in TD, something that cv2.VideoCapture evidently handles for me.
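My current suspicion, which I haven't fully verified, is the array format: cv2.VideoCapture hands back uint8 BGR with row 0 at the top, while a Script TOP's numpyArray() appears to return float32 RGBA in the 0-1 range with row 0 at the bottom (TD's origin is bottom-left). A sketch of the conversion I'd try, under those assumptions:

def td_frame_to_cv(frame_rgba_f32):
    # Assumption: float32 RGBA in [0, 1], rows ordered bottom-up
    frame8 = (frame_rgba_f32 * 255).astype(np.uint8)  # match VideoCapture's uint8 range
    frame8 = cv2.flip(frame8, 0)                      # reorder rows to OpenCV's top-down convention
    gray = cv2.cvtColor(frame8, cv2.COLOR_RGBA2GRAY)  # SIFT wants single-channel uint8
    return frame8, gray

If that's right, the drawing should also happen on the uint8 copy (flipped back before copyNumpyArray). One more hedge: on a four-channel image OpenCV pads a three-value color like (0, 255, 0) with zeros, so the box gets drawn with alpha 0, which TD may composite as fully transparent; (0, 255, 0, 255) avoids that.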
Once I solve this problem, I'm going to move on to CUDA-accelerated methods, because this is still sloooooow.
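One thing I've noticed while reading ahead: OpenCV's CUDA feature module has no SIFT implementation, so the GPU path probably means going back to ORB. A sketch of what I expect the hot loop to look like, untested, assuming an OpenCV build compiled with CUDA support (descriptors_target_gpu is a placeholder for the target's ORB descriptors, uploaded once at startup):

orb = cv2.cuda.ORB_create(nfeatures=2000)
matcher = cv2.cuda.DescriptorMatcher_createBFMatcher(cv2.NORM_HAMMING)

gpu_frame = cv2.cuda_GpuMat()
gpu_frame.upload(gray_frame)  # gray_frame: single-channel uint8, as before

kps_gpu, desc_gpu = orb.detectAndComputeAsync(gpu_frame, None)
keypoints_frame = orb.convert(kps_gpu)  # bring keypoints back to the CPU for .pt lookups
matches = matcher.knnMatch(descriptors_target_gpu, desc_gpu, k=2)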