Hello everyone,
I am currently working on a project on a Jetson Nano where I am trying to run TensorRT inference (YOLOv5 object detection) inside a ROS Melodic node. I am encountering a Bus error (core dumped) / Segmentation fault (core dumped), which seems to arise from a memory conflict between ROS and TensorRT. I am seeking advice from anyone who might have faced and resolved a similar issue.
I have provided the code for my inference node below. I would greatly appreciate any insights, suggestions, or solutions.
Environment Details:
JetPack: 4.6
ROS: Melodic
TensorRT: 8.2.1.8
PyCUDA: 2019.1.2
PyTorch: 1.10.0
CUDA: 10.2
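For context, this is roughly how the detector is driven from ROS. It is a simplified sketch, not my exact node; the node name, topic name, and message type are placeholders:

```python
import rospy
from sensor_msgs.msg import Image
from cv_bridge import CvBridge
from yoloDet import YoloTRT

model = YoloTRT(library="yolov5/build/libmyplugins.so",
                engine="yolov5/build/yolov5s_1504.engine",
                conf=0.5, yolo_ver="v5")
bridge = CvBridge()

def image_cb(msg):
    # Convert the ROS image to an OpenCV BGR frame and run detection
    frame = bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8")
    detections, t = model.Inference(frame)  # the crash appears once this runs

if __name__ == "__main__":
    rospy.init_node("yolo_trt_node")
    rospy.Subscriber("/camera/image_raw", Image, image_cb, queue_size=1)
    rospy.spin()
```

The full inference code (yoloDet.py) follows: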
```python
import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import random
import ctypes
import pycuda.driver as cuda
import time

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

# Global host (page-locked) and device buffers shared by the class below
host_inputs = []
cuda_inputs = []
host_outputs = []
cuda_outputs = []
bindings = []
class YoloTRT():
    def __init__(self, library, engine, conf, yolo_ver):
        self.CONF_THRESH = conf
        self.IOU_THRESHOLD = 0.4
        self.LEN_ALL_RESULT = 38001
        self.LEN_ONE_RESULT = 38
        self.yolo_version = yolo_ver
        self.categories = ["pedestrian", "people", "bicycle", "car", "van", "truck", "tricycle", "awning-tricycle", "bus", "motor"]
        '''
        self.categories = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
                           "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
                           "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
                           "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
                           "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                           "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
                           "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
                           "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
                           "hair drier", "toothbrush"]
        '''
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        # Load the custom plugin library before deserializing the engine
        ctypes.CDLL(library)
        with open(engine, 'rb') as f:
            serialized_engine = f.read()
        runtime = trt.Runtime(TRT_LOGGER)
        self.engine = runtime.deserialize_cuda_engine(serialized_engine)
        self.batch_size = self.engine.max_batch_size
        # Allocate page-locked host buffers and device buffers for every binding
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding)) * self.batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(cuda_mem))
            if self.engine.binding_is_input(binding):
                self.input_w = self.engine.get_binding_shape(binding)[-1]
                self.input_h = self.engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)
    def PreProcessImg(self, img):
        image_raw = img
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Letterbox: scale to the engine input size and pad the short side with grey
        r_w = self.input_w / w
        r_h = self.input_h / h
        if r_h > r_w:
            tw = self.input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.input_h - th) / 2)
            ty2 = self.input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = self.input_h
            tx1 = int((self.input_w - tw) / 2)
            tx2 = self.input_w - tw - tx1
            ty1 = ty2 = 0
        image = cv2.resize(image, (tw, th))
        image = cv2.copyMakeBorder(image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128))
        image = image.astype(np.float32)
        image /= 255.0
        image = np.transpose(image, [2, 0, 1])
        image = np.expand_dims(image, axis=0)
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w
    def Inference(self, img):
        input_image, image_raw, origin_h, origin_w = self.PreProcessImg(img)
        np.copyto(host_inputs[0], input_image.ravel())
        stream = cuda.Stream()
        self.context = self.engine.create_execution_context()
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        t1 = time.time()
        self.context.execute_async(self.batch_size, bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        stream.synchronize()
        t2 = time.time()
        output = host_outputs[0]
        for i in range(self.batch_size):
            result_boxes, result_scores, result_classid = self.PostProcess(output[i * self.LEN_ALL_RESULT: (i + 1) * self.LEN_ALL_RESULT], origin_h, origin_w)
        det_res = []
        for j in range(len(result_boxes)):
            box = result_boxes[j]
            det = dict()
            det["class"] = self.categories[int(result_classid[j])]
            det["conf"] = result_scores[j]
            det["box"] = box
            det_res.append(det)
            self.PlotBbox(box, img, label="{}:{:.2f}".format(self.categories[int(result_classid[j])], result_scores[j]))
        return det_res, t2 - t1
    def PostProcess(self, output, origin_h, origin_w):
        num = int(output[0])
        if self.yolo_version == "v5":
            pred = np.reshape(output[1:], (-1, self.LEN_ONE_RESULT))[:num, :]
            pred = pred[:, :6]
        elif self.yolo_version == "v7":
            pred = np.reshape(output[1:], (-1, 6))[:num, :]
        boxes = self.NonMaxSuppression(pred, origin_h, origin_w, conf_thres=self.CONF_THRESH, nms_thres=self.IOU_THRESHOLD)
        result_boxes = boxes[:, :4] if len(boxes) else np.array([])
        result_scores = boxes[:, 4] if len(boxes) else np.array([])
        result_classid = boxes[:, 5] if len(boxes) else np.array([])
        return result_boxes, result_scores, result_classid
    def NonMaxSuppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        boxes = prediction[prediction[:, 4] >= conf_thres]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
        confs = boxes[:, 4]
        boxes = boxes[np.argsort(-confs)]
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes
    def xywh2xyxy(self, origin_h, origin_w, x):
        # Map center-format boxes from the letterboxed input back to original image coordinates
        y = np.zeros_like(x)
        r_w = self.input_w / origin_w
        r_h = self.input_h / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.input_h - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.input_w - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h
        return y
    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)
        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)
        return iou
    def PlotBbox(self, x, img, color=None, label=None, line_thickness=None):
        tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
        color = color or [random.randint(0, 255) for _ in range(3)]
        c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
        cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
        if label:
            tf = max(tl - 1, 1)  # font thickness
            t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
            c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
            cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
            cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)
```
I am using the following script (camera.py) to provide the input; both the file above (yoloDet.py) and this script are in the same ROS package:
```python
import sys
import cv2
import imutils
import time
from yoloDet import YoloTRT

# use path for library and engine file
model = YoloTRT(library="yolov5/build/libmyplugins.so", engine="yolov5/build/yolov5s_1504.engine", conf=0.5, yolo_ver="v5")

cap = cv2.VideoCapture("./DJI_trimmed.mp4")
frame_process_time = []

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to read the frame or end of video reached")
        break
    frame = imutils.resize(frame, width=1504)
    detections, t = model.Inference(frame)
    for obj in detections:
        print(obj['class'], obj['conf'], obj['box'])
    print("FPS: {:.2f}".format(1 / t))
    frame_process_time.append(t)  # Store the time taken for this frame
    cv2.imshow("Output", frame)
    key = cv2.waitKey(1)
    if key == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

# Calculate average FPS
if frame_process_time:
    average_time_per_frame = sum(frame_process_time) / len(frame_process_time)
    average_fps = 1.0 / average_time_per_frame
    print(f"Average FPS: {average_fps}")
else:
    print("No frames were processed.")
```