Python MediaPipe, как создавать точки 3D-модели на основе точек 2D-изображения

Python MediaPipe, как создавать точки 3D-модели на основе точек 2D-изображения ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Python MediaPipe, как создавать точки 3D-модели на основе точек 2D-изображения

Цитата

Сообщение Anonymous » 07 янв 2026, 02:00

Ну, у меня возникли проблемы при попытке реализовать код, который проверяет, куда смотрит человек на видео. Цель состоит в том, чтобы увидеть, смотрит ли человек в 6/8 положений (вверх, влево, вправо, вниз и их комбинации).
Самая хаотичная проблема заключается в том, что Rotate_vector не дает хорошего приближения того, куда смотрит человек. Я думаю, что эта проблема должна быть связана с model_points (я нашел их в руководстве). При попытке персонализировать model_points (lm.{axis:x,y,z} * {width/height/width}) ситуация ухудшилась.
Вторая проблема, с которой я столкнулся, заключается в том, что при использовании видео формата .MOV (снятого на iPhone), независимо от того, включена или выключена блокировка портретной ориентации, вертикальное видео (1080x1920) поворачивается на 90 градусов против часовой стрелки, и при попытке повторного поворота возникают проблемы с попыткой найди, куда смотрит лицо. А этого не происходит при съемке в формате MP4 (снимается на Android).
Третья проблема в том, что какой бы широкий параметр углов я ни вводил (для рыскания и тангажа), он никогда не показывает, когда человек смотрит вперед.

import cv2
import mediapipe as mp
import numpy as np

root_file_path = 'E:\\Path\\to\\file\\'

def get_euler_angles(rotation_matrix):
sy_ = np.sqrt(rotation_matrix[0,0]**2 + rotation_matrix[1,0]**2)
singular = sy_ < 1e-6
if not singular:
x = np.arctan2(rotation_matrix[2,1], rotation_matrix[2,2]) # pitch
y = np.arctan2(rotation_matrix[2,0], sy_) # yaw
z = np.arctan2(rotation_matrix[1,0], rotation_matrix[0,0]) # roll
else:
x = np.arctan2(-rotation_matrix[1,2], rotation_matrix[1,1])
y = np.arctan2(-rotation_matrix[2,0], sy_)
z = 0
return np.degrees([x,y,z])

mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

cap = cv2.VideoCapture(f'{root_file_path}video_1.MOV')

rotation_of_face = {"Up":False,"LeftUp":False,"RightUp":False,"Left":False,"Right":False,"LeftDown":False,"RightDown":False,"Down":False}
forward_facing_frames = []

with mp_face_mesh.FaceMesh(
static_image_mode=False,
max_num_faces=1,
min_detection_confidence=0.5) as face_mesh:

while cap.isOpened():
ret, frame = cap.read()
if not ret:
break

height, width, img_c = frame.shape
if width/height > 1:
frame___ = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = face_mesh.process(frame_rgb)

if results.multi_face_landmarks is not None:
for face_landmarks in results.multi_face_landmarks:
mp_drawing.draw_landmarks(frame, face_landmarks, mp_face_mesh.FACEMESH_CONTOURS)
landmarks_2d = np.array([(lm.x * width, lm.y * height) for lm in face_landmarks.landmark])
image_points = np.array([
landmarks_2d[1], # Nose tip
landmarks_2d[152], # Chin
landmarks_2d[33],# Left eye outer corner
landmarks_2d[133], # Left eye inner corner
landmarks_2d[263],# Right eye right corner
landmarks_2d[362], # Right eye inner corner
landmarks_2d[61], # Left mouth corner
landmarks_2d[291] # Right mouth corner
], dtype="double")
model_points = np.array([
(0.0, 0.0, 0.0), # Nose tip
(0.0, -330.0, -65.0), # Chin
(-225.0, 170.0, -135.0), # Left eye outer corner
(-75.0, 170.0, -135.0), # Lft eye inner corner
(225.0, 170.0, -135.0), # Right eye outer corner
(75.0, 170.0, -135.0), # Right eye inner corner
(-150.0, -150.0, -125.0),# Left mouth corner
(150.0, -150.0, -125.0) # Right mouth corner
])
focal_length = width
center = (width / 2, height / 2)
camera_matrix = np.array(
[[focal_length, 0, center[0]],
[0, focal_length, center[1]],
[0, 0, 1]], dtype="double"
)
dist_coeffs = np.zeros((4, 1))

success, rotation_vector, translation_vector = cv2.solvePnP(model_points, image_points, camera_matrix, dist_coeffs, flags = cv2.SOLVEPNP_ITERATIVE)

rotation_matrix, _ = cv2.Rodrigues(rotation_vector)

pitch, yaw, roll = get_euler_angles(rotation_matrix)

direction = ""
if yaw < -50:
direction += "Left"
elif yaw > 50:
direction += "Right"

if pitch < -50:
direction += "Down"
elif pitch > 50:
direction += "Up"

if direction == "":
direction = "Forward"

for key in rotation_of_face.keys():
if key == direction:
rotation_of_face[key] = True

cv2.putText(frame, f"Head Direction: {direction}", (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

nose_end_point3D = np.array([(0.0, 0.0, 1000.0)])
nose_end_point2D, _ = cv2.projectPoints(nose_end_point3D, rotation_vector, translation_vector, camera_matrix, dist_coeffs)

p1 = (int(image_points[0][0]), int(image_points[0][1]))
p2 = (int(nose_end_point2D[0][0][0]), int(nose_end_point2D[0][0][1]))
cv2.line(frame, p1, p2, (255, 0, 0), 2)

cv2.imshow('Face Matching', frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break

cap.release()
cv2.destroyAllWindows()

Подробнее здесь: https://stackoverflow.com/questions/796 ... age-points

1767740403

Anonymous

Ну, у меня возникли проблемы при попытке реализовать код, который проверяет, куда смотрит человек на видео. Цель состоит в том, чтобы увидеть, смотрит ли человек в 6/8 положений (вверх, влево, вправо, вниз и их комбинации).
Самая хаотичная проблема заключается в том, что Rotate_vector не дает хорошего приближения того, куда смотрит человек. Я думаю, что эта проблема должна быть связана с model_points (я нашел их в руководстве). При попытке персонализировать model_points (lm.{axis:x,y,z} * {width/height/width}) ситуация ухудшилась.
Вторая проблема, с которой я столкнулся, заключается в том, что при использовании видео формата .MOV (снятого на iPhone), независимо от того, включена или выключена блокировка портретной ориентации, вертикальное видео (1080x1920) поворачивается на 90 градусов против часовой стрелки, и при попытке повторного поворота возникают проблемы с попыткой найди, куда смотрит лицо.  А этого не происходит при съемке в формате MP4 (снимается на Android).
Третья проблема в том, что какой бы широкий параметр углов я ни вводил (для рыскания и тангажа), он никогда не показывает, когда человек смотрит вперед.

import cv2
import mediapipe as mp
import numpy as np

root_file_path = 'E:\\Path\\to\\file\\'

def get_euler_angles(rotation_matrix):
sy_ = np.sqrt(rotation_matrix[0,0]**2 + rotation_matrix[1,0]**2)
singular = sy_ < 1e-6
if not singular:
x = np.arctan2(rotation_matrix[2,1], rotation_matrix[2,2]) # pitch
y = np.arctan2(rotation_matrix[2,0], sy_) # yaw
z = np.arctan2(rotation_matrix[1,0], rotation_matrix[0,0]) # roll
else:
x = np.arctan2(-rotation_matrix[1,2], rotation_matrix[1,1])
y = np.arctan2(-rotation_matrix[2,0], sy_)
z = 0
return np.degrees([x,y,z])

mp_face_mesh = mp.solutions.face_mesh
mp_drawing = mp.solutions.drawing_utils

cap = cv2.VideoCapture(f'{root_file_path}video_1.MOV')

rotation_of_face = {"Up":False,"LeftUp":False,"RightUp":False,"Left":False,"Right":False,"LeftDown":False,"RightDown":False,"Down":False}
forward_facing_frames = []

with mp_face_mesh.FaceMesh(
static_image_mode=False,
max_num_faces=1,
min_detection_confidence=0.5) as face_mesh:

while cap.isOpened():
ret, frame = cap.read()
if not ret:
break

height, width, img_c = frame.shape
if width/height > 1:
frame___ = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
results = face_mesh.process(frame_rgb)

if results.multi_face_landmarks is not None:
for face_landmarks in results.multi_face_landmarks:
mp_drawing.draw_landmarks(frame, face_landmarks, mp_face_mesh.FACEMESH_CONTOURS)
landmarks_2d = np.array([(lm.x * width, lm.y * height) for lm in face_landmarks.landmark])
image_points = np.array([
landmarks_2d[1],  # Nose tip
landmarks_2d[152], # Chin
landmarks_2d[33],# Left eye outer corner
landmarks_2d[133], # Left eye inner corner
landmarks_2d[263],# Right eye right corner
landmarks_2d[362], # Right eye inner corner
landmarks_2d[61], # Left mouth corner
landmarks_2d[291] # Right mouth corner
], dtype="double")
model_points = np.array([
(0.0, 0.0, 0.0),        # Nose tip
(0.0, -330.0, -65.0),    # Chin
(-225.0, 170.0, -135.0), # Left eye outer corner
(-75.0, 170.0, -135.0), # Lft eye inner corner
(225.0, 170.0, -135.0),  # Right eye outer corner
(75.0, 170.0, -135.0), # Right eye inner corner
(-150.0, -150.0, -125.0),# Left mouth corner
(150.0, -150.0, -125.0)  # Right mouth corner
])
focal_length = width
center = (width / 2, height / 2)
camera_matrix = np.array(
[[focal_length, 0, center[0]],
[0, focal_length, center[1]],
[0, 0, 1]], dtype="double"
)
dist_coeffs = np.zeros((4, 1))

success, rotation_vector, translation_vector = cv2.solvePnP(model_points, image_points, camera_matrix, dist_coeffs, flags = cv2.SOLVEPNP_ITERATIVE)

rotation_matrix, _ = cv2.Rodrigues(rotation_vector)

pitch, yaw, roll = get_euler_angles(rotation_matrix)

direction = ""
if yaw < -50:
direction += "Left"
elif yaw > 50:
direction += "Right"

if pitch < -50:
direction += "Down"
elif pitch >  50:
direction += "Up"

if direction == "":
direction = "Forward"

for key in rotation_of_face.keys():
if key == direction:
rotation_of_face[key] = True

cv2.putText(frame, f"Head Direction: {direction}", (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

nose_end_point3D = np.array([(0.0, 0.0, 1000.0)])
nose_end_point2D, _ = cv2.projectPoints(nose_end_point3D, rotation_vector, translation_vector, camera_matrix, dist_coeffs)

p1 = (int(image_points[0][0]), int(image_points[0][1]))
p2 = (int(nose_end_point2D[0][0][0]), int(nose_end_point2D[0][0][1]))
cv2.line(frame, p1, p2, (255, 0, 0), 2)

cv2.imshow('Face Matching', frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break

cap.release()
cv2.destroyAllWindows()
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79641047/python-mediapipe-how-to-create-3d-model-points-based-on-the-2d-image-points[/url]