Implementing a Virtual Streaming Avatar (VTuber)

Introduction

A relatively novel form of livestreaming has emerged: virtual streaming, usually associated with VTubers. Instead of appearing on camera, the streamer uses a virtual avatar, such as an anime character, whose facial expressions are still driven by a real person. I recently came across this and found it quite interesting. In my previous blog post I used Google's mediapipe library and its hand gesture recognition module to build a simple "motion capture" program, and mentioned that the same module can power many other fun projects: virtual keyboards, virtual painting, gesture-controlled games, and so on. The library also ships many other modules, including face landmark detection, which is exactly what we need to build a simple "face capture" program and, with it, our own little virtual streaming setup.
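Only three packages are needed; assuming the standard PyPI package names, they can be installed with:

pip install mediapipe opencv-python numpy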

Complete Code

import cv2
import mediapipe as mp
import numpy as np


# Math and head-pose estimation utilities
class FaceGeometry:
    def __init__(self):
        self.INDEX_NOSE = 1
        self.INDEX_CHIN = 152
        self.INDEX_LEFT_EYE_CORNER = 33
        self.INDEX_RIGHT_EYE_CORNER = 263
        self.INDEX_LEFT_MOUTH_CORNER = 61
        self.INDEX_RIGHT_MOUTH_CORNER = 291

        # 3D reference points (relative coordinates of a generic face model)
        self.face_3d = np.array([
            [0.0, 0.0, 0.0],            # nose tip
            [0.0, -330.0, -65.0],       # chin
            [-225.0, 170.0, -135.0],    # left eye corner
            [225.0, 170.0, -135.0],     # right eye corner
            [-150.0, -150.0, -125.0],   # left mouth corner
            [150.0, -150.0, -125.0]     # right mouth corner
        ], dtype=np.float64)
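        # Note: the row order above must match points_idx in get_head_pose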

    def get_head_pose(self, landmarks, img_w, img_h):
        face_2d = []
        points_idx = [self.INDEX_NOSE, self.INDEX_CHIN, self.INDEX_LEFT_EYE_CORNER, 
                      self.INDEX_RIGHT_EYE_CORNER, self.INDEX_LEFT_MOUTH_CORNER, self.INDEX_RIGHT_MOUTH_CORNER]

        for idx in points_idx:
            lm = landmarks.landmark[idx]
            x, y = int(lm.x * img_w), int(lm.y * img_h)
            face_2d.append([x, y])

        face_2d = np.array(face_2d, dtype=np.float64)

        # Approximate pinhole camera: focal length ~ image width,
        # principal point at the image center
        focal_length = img_w
        cam_matrix = np.array([
            [focal_length, 0, img_w / 2],
            [0, focal_length, img_h / 2],
            [0, 0, 1]
        ])
        # Assume no lens distortion
        dist_matrix = np.zeros((4, 1), dtype=np.float64)

        success, rot_vec, trans_vec = cv2.solvePnP(self.face_3d, face_2d, cam_matrix, dist_matrix)

        if not success:
            return 0, 0, 0

        rmat, _ = cv2.Rodrigues(rot_vec)
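        # The first return value of RQDecomp3x3 is the three Euler angles, in degrees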
        angles, _, _, _, _, _ = cv2.RQDecomp3x3(rmat)

        x_angle = angles[0]  # Pitch
        y_angle = angles[1]  # Yaw
        z_angle = angles[2]  # Roll

        # Clamp the angles to a sane range
        x_angle = np.clip(x_angle, -50, 50)
        y_angle = np.clip(y_angle, -50, 50)
        z_angle = np.clip(z_angle, -50, 50)

        return x_angle, y_angle, z_angle

    @staticmethod
    def calculate_mar(landmarks):
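        # This is the scaled vertical gap between the inner-lip landmarks
        # (13/14) rather than a true aspect ratio; the *100 scaling keeps it
        # in the range of the mar > 3.0 threshold used in _draw_mouth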
        top = np.array([landmarks[13].x, landmarks[13].y])
        bottom = np.array([landmarks[14].x, landmarks[14].y])
        return np.linalg.norm(top - bottom) * 100 


# Avatar rendering module
class AvatarRenderer:
    def __init__(self, width=800, height=600):
        self.W = width
        self.H = height
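        # Colors are BGR tuples (OpenCV's channel order)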
        self.C_BG = (255, 240, 230)      
        self.C_SKIN = (220, 235, 255)    
        self.C_HAIR = (80, 60, 50)       
        self.C_EYE_WHITE = (250, 250, 250)
        self.C_EYE_IRIS = (200, 100, 50) 
        self.C_BLUSH = (180, 180, 255)   

        self.pose_buffer = []

    def draw(self, pitch, yaw, roll, ear_left, ear_right, mar):
        # Smooth the head pose with a 5-frame moving average to reduce jitter
        self.pose_buffer.append([pitch, yaw, roll])
        if len(self.pose_buffer) > 5:
            self.pose_buffer.pop(0)
        pitch, yaw, roll = np.mean(self.pose_buffer, axis=0)

        canvas = np.zeros((self.H, self.W, 3), dtype=np.uint8)
        canvas[:] = self.C_BG

        cx, cy = self.W // 2, self.H // 2 + 50
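        # Shift the facial features with yaw/pitch for a simple parallax effect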
        offset_x = int(yaw * 3.0)  
        offset_y = int(pitch * 3.0)
        offset_x = np.clip(offset_x, -80, 80)
        offset_y = np.clip(offset_y, -60, 60)

        # Rotation matrix for roll; applied to the finished canvas below via warpAffine
        M = cv2.getRotationMatrix2D((cx, cy), roll, 1.0)

        # 1. Back hair
        cv2.ellipse(canvas, (cx, cy), (160, 170), 0, 0, 360, self.C_HAIR, -1, cv2.LINE_AA)

        # 2. Face outline
        cv2.ellipse(canvas, (cx, cy), (140, 160), 0, 0, 360, self.C_SKIN, -1, cv2.LINE_AA)

        face_cx = cx - offset_x
        face_cy = cy + offset_y

        # 3. Eyes
        eye_spacing = 60
        self._draw_eye(canvas, face_cx - eye_spacing, face_cy - 10, 35, 45, ear_left)
        self._draw_eye(canvas, face_cx + eye_spacing, face_cy - 10, 35, 45, ear_right)

        # 4. Blush
        cv2.ellipse(canvas, (face_cx - 70, face_cy + 40), (20, 15), 0, 0, 360, self.C_BLUSH, -1, cv2.LINE_AA)
        cv2.ellipse(canvas, (face_cx + 70, face_cy + 40), (20, 15), 0, 0, 360, self.C_BLUSH, -1, cv2.LINE_AA)

        # 5. Mouth
        self._draw_mouth(canvas, face_cx, face_cy + 60, mar)

        # 6. Nose
        cv2.circle(canvas, (face_cx, face_cy + 20), 3, (160, 180, 210), -1, cv2.LINE_AA)

        # 7. Bangs (front hair); they track the head at 0.6x speed to add depth
        hair_cx = cx - int(offset_x * 0.6)
        hair_cy = cy + int(offset_y * 0.6) - 140
        pts = np.array([[hair_cx-140, hair_cy+80], [hair_cx, hair_cy], [hair_cx+140, hair_cy+80], 
                        [hair_cx+150, hair_cy-60], [hair_cx-150, hair_cy-60]], np.int32)
        cv2.fillPoly(canvas, [pts], self.C_HAIR, cv2.LINE_AA)

        canvas = cv2.warpAffine(canvas, M, (self.W, self.H), borderValue=self.C_BG)
        return canvas

    def _draw_eye(self, canvas, x, y, w, h, ear):
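        # An EAR below ~0.15 is treated as a closed eye (blink)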
        if ear < 0.15: 
            cv2.ellipse(canvas, (x, y), (w, h//4), 0, 180, 360, (50, 40, 40), 3, cv2.LINE_AA)
            cv2.line(canvas, (x-w, y), (x-w-5, y+5), (50,40,40), 2, cv2.LINE_AA)
        else:
            cv2.ellipse(canvas, (x, y), (w, h), 0, 0, 360, self.C_EYE_WHITE, -1, cv2.LINE_AA)
            cv2.ellipse(canvas, (x, y), (w, h), 0, 0, 360, (0, 0, 0), 2, cv2.LINE_AA)
            cv2.circle(canvas, (x, y+5), int(w*0.5), self.C_EYE_IRIS, -1, cv2.LINE_AA)
            cv2.circle(canvas, (x - 10, y - 5), 8, (255, 255, 255), -1, cv2.LINE_AA)
            cv2.ellipse(canvas, (x, y - h - 15), (w, 5), 0, 180, 360, self.C_HAIR, 3, cv2.LINE_AA)

    def _draw_mouth(self, canvas, x, y, mar):
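        # A scaled lip gap above 3.0 is treated as an open mouth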
        if mar > 3.0: 
            h = min(int(mar * 5), 50)
            cv2.ellipse(canvas, (x, y), (30, h), 0, 0, 360, (100, 100, 200), -1, cv2.LINE_AA)
            cv2.ellipse(canvas, (x, y), (30, h), 0, 0, 360, (80, 80, 150), 2, cv2.LINE_AA)
        else:
            cv2.ellipse(canvas, (x, y - 5), (20, 10), 0, 0, 180, (100, 80, 80), 3, cv2.LINE_AA)


def main():
    cap = cv2.VideoCapture(0)

    mp_face_mesh = mp.solutions.face_mesh
    mp_drawing = mp.solutions.drawing_utils 
    mp_drawing_styles = mp.solutions.drawing_styles

    # refine_landmarks=True enables the attention variant of the face model,
    # which adds iris landmarks and refines the eye and lip regions
    face_mesh = mp_face_mesh.FaceMesh(
        max_num_faces=1,
        refine_landmarks=True,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )

    renderer = AvatarRenderer()
    geometry = FaceGeometry()

    # True: show the camera feed; False: show the mesh on a black background
    show_real_face = True

    print("Starting...")
    print("Press 'e' to toggle privacy mode / real-face mode")
    print("Press 'q' to quit")

    while cap.isOpened():
        success, image = cap.read()
        if not success:
            continue

        image = cv2.flip(image, 1)
        h, w, _ = image.shape
        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Always run detection on the real camera image so tracking works in both modes
        results = face_mesh.process(rgb_image)

        # Prepare the preview image for display
        if show_real_face:
            # Mode A: copy the camera frame
            preview_img = image.copy()
        else:
            # Mode B: an all-black canvas the same size as the camera frame
            preview_img = np.zeros((h, w, 3), dtype=np.uint8)

        # Neutral defaults for frames with no detected face
        # (head level, eyes open, mouth closed)
        pitch, yaw, roll = 0, 0, 0
        ear_left, ear_right = 0.3, 0.3
        mar = 0.0

        if results.multi_face_landmarks:
            for face_landmarks in results.multi_face_landmarks:
                # Draw the face mesh onto preview_img
                mp_drawing.draw_landmarks(
                    image=preview_img,
                    landmark_list=face_landmarks,
                    connections=mp_face_mesh.FACEMESH_TESSELATION,
                    landmark_drawing_spec=None,
                    connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_tesselation_style())

                mp_drawing.draw_landmarks(
                    image=preview_img,
                    landmark_list=face_landmarks,
                    connections=mp_face_mesh.FACEMESH_CONTOURS,
                    landmark_drawing_spec=None,
                    connection_drawing_spec=mp_drawing_styles.get_default_face_mesh_contours_style())

                # Compute head pose and expression metrics
                pitch, yaw, roll = geometry.get_head_pose(face_landmarks, w, h)
                lm = face_landmarks.landmark
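                # Eye aspect ratio (EAR): vertical eyelid gap over horizontal
                # eye width; the ratio drops sharply when an eye closes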

                left_v = np.linalg.norm(np.array([lm[159].x, lm[159].y]) - np.array([lm[145].x, lm[145].y]))
                left_h = np.linalg.norm(np.array([lm[33].x, lm[33].y]) - np.array([lm[133].x, lm[133].y]))
                ear_left = left_v / (left_h + 1e-6)

                right_v = np.linalg.norm(np.array([lm[386].x, lm[386].y]) - np.array([lm[374].x, lm[374].y]))
                right_h = np.linalg.norm(np.array([lm[362].x, lm[362].y]) - np.array([lm[263].x, lm[263].y]))
                ear_right = right_v / (right_h + 1e-6)

                mar = geometry.calculate_mar(lm)

        # Draw the avatar from the captured pose and expression values
        avatar_img = renderer.draw(pitch, yaw, roll, ear_left, ear_right, mar)

        # Composite: place preview_img in the top-left corner
        cam_h, cam_w = 225, 300
        cam_preview = cv2.resize(preview_img, (cam_w, cam_h))
        avatar_img[0:cam_h, 0:cam_w] = cam_preview

        # Status text overlay
        mode_text = "REAL" if show_real_face else "PRIVACY (Mesh Only)"
        color_text = (0, 255, 0) if show_real_face else (0, 0, 255)  # green / red
        cv2.putText(avatar_img, f"Mode: {mode_text} (Press 'e')", (400, 30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color_text, 2)

        cv2.putText(avatar_img, f"Pitch:{int(pitch)} Yaw:{int(yaw)} Roll:{int(roll)}", 
                    (400, 50), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (50, 50, 50), 2)

        cv2.imshow('Virtual Avatar VTuber', avatar_img)

        # Key handling
        key = cv2.waitKey(5) & 0xFF
        if key == ord('q'):
            break
        elif key == ord('e'):
            show_real_face = not show_real_face  # toggle modes

    cap.release()
    cv2.destroyAllWindows()


if __name__ == "__main__":
    main()

The code is straightforward and only depends on three libraries: mediapipe, numpy, and opencv. One thing worth noting: the algorithm that estimates head pose (the Pitch, Yaw, and Roll angles) from the facial landmarks is called PnP (Perspective-n-Point). Given a set of known 3D model points and their corresponding 2D projections in the image, it solves for the rotation and translation of the object relative to the camera; it is well worth reading up on if you are interested.
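If you want to convince yourself that the PnP solution is sensible, a handy trick is to project the model's 3D axes back onto the camera frame with cv2.projectPoints. Below is a minimal sketch, assuming get_head_pose is changed to also return rot_vec, trans_vec, and cam_matrix, and that nose_2d is the nose tip's pixel coordinates; draw_head_axes and these parameter names are illustrative, not part of the program above.

import cv2
import numpy as np

def draw_head_axes(image, rot_vec, trans_vec, cam_matrix, nose_2d):
    # Project 100-unit model axes (x, y, z from the nose tip, which is the
    # model origin) back into the image plane
    axes_3d = np.float64([[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    dist = np.zeros((4, 1), dtype=np.float64)
    pts_2d, _ = cv2.projectPoints(axes_3d, rot_vec, trans_vec, cam_matrix, dist)
    origin = (int(nose_2d[0]), int(nose_2d[1]))
    # BGR: x axis in red, y in green, z in blue
    for (px, py), color in zip(pts_2d.reshape(-1, 2),
                               [(0, 0, 255), (0, 255, 0), (255, 0, 0)]):
        cv2.line(image, origin, (int(px), int(py)), color, 2, cv2.LINE_AA)
    return image

The z-axis line points roughly where the head is facing, which makes it an easy visual sanity check on the solvePnP output.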

Results

Two modes are implemented: a privacy mode and a real-face mode. The screenshot above shows privacy mode, where only the facial landmarks are drawn on a black background. Pressing the 'e' key switches to real-face mode, which shows the real person against the real background with the mesh overlaid.