update

xuan3986 · xuan3986 · commit 49c2d9cd50df · 2025-12-20T21:53:41.000+08:00
diff --git a/FunCineForge/speaker_diarization/local/vision_tools/api.py b/FunCineForge/speaker_diarization/local/vision_tools/api.py
@@ -0,0 +1,308 @@
+"""
+Modified from face-alignment v1.4.1 api.py
+Original source: https://github.com/1adrianb/face-alignment
+License: BSD-3-Clause License
+"""
+import torch
+import warnings
+from enum import IntEnum
+from skimage import io
+import numpy as np
+from packaging import version
+from tqdm import tqdm
+
+from face_alignment.utils import *
+from face_alignment.folder_data import FolderData
+from face_alignment.detection import sfd
+
+class LandmarksType(IntEnum):
+    """Enum class defining the type of landmarks to detect.
+
+    ``TWO_D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
+    ``TWO_HALF_D`` - these points represent the projection of the 3D points into 2D
+    ``THREE_D`` - detect the points ``(x,y,z)`` in a 3D space
+
+    """
+    TWO_D = 1
+    TWO_HALF_D = 2
+    THREE_D = 3
+
+
+class NetworkSize(IntEnum):
+    """Enum class defining the size of the landmark network.
+
+    Only ``LARGE`` is defined. ``FaceAlignment`` appends the integer value to the
+    network name (``'2DFAN-4'`` / ``'3DFAN-4'``) when it looks up the model URL.
+    """
+    # The smaller variants are kept for reference only and are disabled.
+    # TINY = 1
+    # SMALL = 2
+    # MEDIUM = 3
+    LARGE = 4
+
+
+# Fallback download URLs, used by ``FaceAlignment`` when the running PyTorch
+# version has no dedicated entry in ``models_urls`` below.
+default_model_urls = {
+    '2DFAN-4': 'https://www.adrianbulat.com/downloads/python-fan/2DFAN4-cd938726ad.zip',
+    '3DFAN-4': 'https://www.adrianbulat.com/downloads/python-fan/3DFAN4-4a694010b9.zip',
+    'depth': 'https://www.adrianbulat.com/downloads/python-fan/depth-6c4283c0e0.zip',
+}
+
+# Download URLs for specific PyTorch releases. The outer key is the
+# "major.minor" part of ``torch.__version__``; the inner key is the network name.
+models_urls = {
+    '1.6': {
+        '2DFAN-4': 'https://www.adrianbulat.com/downloads/python-fan/2DFAN4_1.6-c827573f02.zip',
+        '3DFAN-4': 'https://www.adrianbulat.com/downloads/python-fan/3DFAN4_1.6-ec5cf40a1d.zip',
+        'depth': 'https://www.adrianbulat.com/downloads/python-fan/depth_1.6-2aa3f18772.zip',
+    },
+    '1.5': {
+        '2DFAN-4': 'https://www.adrianbulat.com/downloads/python-fan/2DFAN4_1.5-a60332318a.zip',
+        '3DFAN-4': 'https://www.adrianbulat.com/downloads/python-fan/3DFAN4_1.5-176570af4d.zip',
+        'depth': 'https://www.adrianbulat.com/downloads/python-fan/depth_1.5-bc10f98e39.zip',
+    },
+}
+
+
+class FaceAlignment:
+    def __init__(self, landmarks_type, network_size=NetworkSize.LARGE, net_path=None,
+                 device='cuda', dtype=torch.float32, flip_input=False, face_detector_kwargs=None, verbose=False):
+        """Create the SFD face detector and load the landmark (and optional depth) network.
+
+        Arguments:
+            landmarks_type {LandmarksType} -- ``TWO_D`` selects the 2DFAN network, any other value
+            selects 3DFAN; ``THREE_D`` additionally loads the depth prediction network.
+
+        Keyword Arguments:
+            network_size {NetworkSize or int} -- suffix of the network name (default: {NetworkSize.LARGE})
+            net_path {str} -- path of a TorchScript file for the landmark network. When None the file
+            is obtained through ``load_file_from_url``. The depth network is always obtained
+            through ``load_file_from_url`` regardless of this argument. (default: {None})
+            device {str} -- device the networks are moved to (default: {'cuda'})
+            dtype {torch.dtype} -- dtype the networks are converted to (default: {torch.float32})
+            flip_input {boolean} -- If True, the network is also run on the flipped crop and both
+            outputs are summed (default: {False})
+            face_detector_kwargs {dict} -- extra keyword arguments for ``sfd.FaceDetector`` (default: {None})
+            verbose {boolean} -- passed to the face detector and the folder dataset (default: {False})
+
+        Raises:
+            ImportError -- if the installed PyTorch is older than 1.5.0.
+        """
+        self.device = device
+        self.flip_input = flip_input
+        self.landmarks_type = landmarks_type
+        self.verbose = verbose
+        self.dtype = dtype
+
+        if version.parse(torch.__version__) < version.parse('1.5.0'):
+            # Adjacent literals instead of a backslash continuation, so the
+            # source indentation does not leak into the message.
+            raise ImportError(
+                'Unsupported pytorch version detected. Minimum supported version of pytorch: 1.5.0. '
+                'Either upgrade (recommended) your pytorch setup, or downgrade to face-alignment 1.2.0')
+
+        network_size = int(network_size)
+
+        # Reduce the version string to "major.minor", the key format of models_urls.
+        # Dev builds carry one extra dot-separated component that must be dropped too.
+        pytorch_version = torch.__version__
+        if 'dev' in pytorch_version:
+            pytorch_version = pytorch_version.rsplit('.', 2)[0]
+        else:
+            pytorch_version = pytorch_version.rsplit('.', 1)[0]
+
+        # str() keeps the substring test working if a torch.device is passed.
+        # NOTE(review): whether sfd.FaceDetector also accepts a torch.device is not verified here.
+        if 'cuda' in str(device):
+            torch.backends.cudnn.benchmark = True
+
+        # The face detector is fixed to SFD in this modified version.
+        face_detector_kwargs = face_detector_kwargs or {}
+        self.face_detector = sfd.FaceDetector(device=device, verbose=verbose, **face_detector_kwargs)
+
+        # URL table for this PyTorch version, falling back to the defaults.
+        urls = models_urls.get(pytorch_version, default_model_urls)
+
+        # Initialise the face alignment network
+        if landmarks_type == LandmarksType.TWO_D:
+            network_name = '2DFAN-' + str(network_size)
+        else:
+            network_name = '3DFAN-' + str(network_size)
+        if net_path is None:
+            net_path = load_file_from_url(urls[network_name])
+        self.face_alignment_net = torch.jit.load(net_path)
+
+        self.face_alignment_net.to(device, dtype=dtype)
+        self.face_alignment_net.eval()
+
+        # Initialise the depth prediction network (only needed for full 3D landmarks).
+        # The attribute keeps its historical spelling because other methods read it.
+        if landmarks_type == LandmarksType.THREE_D:
+            self.depth_prediciton_net = torch.jit.load(load_file_from_url(urls['depth']))
+
+            self.depth_prediciton_net.to(device, dtype=dtype)
+            self.depth_prediciton_net.eval()
+
+    def get_landmarks(self, image_or_path, detected_faces=None, return_bboxes=False, return_landmark_score=False):
+        """Deprecated alias of get_landmarks_from_image; all arguments are forwarded unchanged.
+
+        Arguments:
+            image_or_path {string or numpy.array or torch.tensor} -- The input image or path to it
+
+        Keyword Arguments:
+            detected_faces {list of numpy.array} -- list of bounding boxes, one for each face found
+            in the image (default: {None})
+            return_bboxes {boolean} -- If True, return the face bounding boxes in addition to the keypoints.
+            return_landmark_score {boolean} -- If True, return the keypoint scores along with the keypoints.
+
+        Return:
+            Whatever get_landmarks_from_image returns for the same arguments.
+        """
+        result = self.get_landmarks_from_image(
+            image_or_path,
+            detected_faces,
+            return_bboxes,
+            return_landmark_score,
+        )
+        return result
+
+    @torch.no_grad()
+    def get_landmarks_from_image(self, image_or_path, detected_faces=None, return_bboxes=False,
+                                 return_landmark_score=False):
+        """Predict the landmarks for each face present in the image.
+
+        This function predicts a set of 68 2D or 3D landmarks, one set for each face present.
+        If detected_faces is None the method will also run the face detector.
+
+         Arguments:
+            image_or_path {string or numpy.array or torch.tensor} -- The input image or path to it.
+
+        Keyword Arguments:
+            detected_faces {list of numpy.array} -- list of bounding boxes, one for each face found
+            in the image (default: {None})
+            return_bboxes {boolean} -- If True, return the face bounding boxes in addition to the keypoints.
+            return_landmark_score {boolean} -- If True, return the keypoint scores along with the keypoints.
+
+        Return:
+            result:
+                1. if both return_bboxes and return_landmark_score are False, result will be:
+                    landmark
+                2. Otherwise, result will be one of the following, depending on the actual value of return_* arguments.
+                    (landmark, landmark_score, detected_face)
+                    (landmark, None,           detected_face)
+                    (landmark, landmark_score, None         )
+            ``landmark`` is a list with one numpy array per face, of shape (68, 2), or (68, 3)
+            when the landmarks type is ``THREE_D``. When no face is found a warning is issued
+            and every returned element is None.
+        """
+        image = get_image(image_or_path)
+
+        if detected_faces is None:
+            # The detector is given a copy of the image rather than the image itself.
+            detected_faces = self.face_detector.detect_from_image(image.copy())
+
+        if len(detected_faces) == 0:
+            warnings.warn("No faces were detected.")
+            if return_bboxes or return_landmark_score:
+                return None, None, None
+            else:
+                return None
+
+        landmarks = []
+        landmarks_scores = []
+        for d in detected_faces:
+            # d is read as (x1, y1, x2, y2, ...) -- presumably corner coordinates; confirm against the detector.
+            # Start from the box midpoint, then move the centre towards smaller y
+            # by 12% of the box height.
+            center = torch.tensor(
+                [d[2] - (d[2] - d[0]) / 2.0, d[3] - (d[3] - d[1]) / 2.0])
+            center[1] = center[1] - (d[3] - d[1]) * 0.12
+            # Box width plus height, relative to the detector's reference scale.
+            scale = (d[2] - d[0] + d[3] - d[1]) / self.face_detector.reference_scale
+
+            # Crop around the face and convert from HWC to CHW.
+            inp = crop(image, center, scale)
+            inp = torch.from_numpy(inp.transpose(
+                (2, 0, 1))).float()
+
+            # Move to the target device, rescale by 1/255 and add the batch axis.
+            inp = inp.to(self.device, dtype=self.dtype)
+            inp.div_(255.0).unsqueeze_(0)
+
+            out = self.face_alignment_net(inp).detach()
+            if self.flip_input:
+                # Add (not average) the prediction made on the flipped crop, flipped back.
+                out += flip(self.face_alignment_net(flip(inp)).detach(), is_label=True)
+            out = out.to(device='cpu', dtype=torch.float32).numpy()
+
+            pts, pts_img, scores = get_preds_fromhm(out, center.numpy(), scale)
+            pts, pts_img = torch.from_numpy(pts), torch.from_numpy(pts_img)
+            # pts is multiplied by 4 and then used to index the 256x256 maps below
+            # (presumably heatmap -> crop coordinates); pts_img is what gets returned.
+            pts, pts_img = pts.view(68, 2) * 4, pts_img.view(68, 2)
+            scores = scores.squeeze(0)
+
+            if self.landmarks_type == LandmarksType.THREE_D:
+                # One 256x256 map per landmark with a gaussian drawn at its position;
+                # landmarks with a non-positive coordinate keep an all-zero map.
+                heatmaps = np.zeros((68, 256, 256), dtype=np.float32)
+                for k in range(68):
+                    if pts[k, 0] > 0 and pts[k, 1] > 0:
+                        heatmaps[k] = draw_gaussian(
+                            heatmaps[k], pts[k], 2)
+                heatmaps = torch.from_numpy(
+                    heatmaps).unsqueeze_(0)
+
+                heatmaps = heatmaps.to(self.device, dtype=self.dtype)
+                # The depth network receives the crop and the maps concatenated on axis 1.
+                depth_pred = self.depth_prediciton_net(
+                    torch.cat((inp, heatmaps), 1)).data.cpu().view(68, 1).to(dtype=torch.float32)
+                # Rescale the predicted depth by the crop scale and append it as a third column.
+                pts_img = torch.cat(
+                    (pts_img, depth_pred * (1.0 / (256.0 / (200.0 * scale)))), 1)
+
+            landmarks.append(pts_img.numpy())
+            landmarks_scores.append(scores)
+
+        # Blank out the parts of the result tuple the caller did not ask for.
+        if not return_bboxes:
+            detected_faces = None
+        if not return_landmark_score:
+            landmarks_scores = None
+        if return_bboxes or return_landmark_score:
+            return landmarks, landmarks_scores, detected_faces
+        else:
+            return landmarks
+
+    @torch.no_grad()
+    def get_landmarks_from_batch(self, image_batch, detected_faces=None, return_bboxes=False,
+                                 return_landmark_score=False):
+        """Predict the landmarks for every face in every image of a batch.
+
+        Each image of the batch is handed to get_landmarks_from_image together with the
+        faces found for it. If detected_faces is None the face detector is run on the
+        whole batch first.
+
+         Arguments:
+            image_batch {torch.tensor} -- The input images batch
+
+        Keyword Arguments:
+            detected_faces {list of numpy.array} -- list of bounding boxes, one entry for each
+            image of the batch (default: {None})
+            return_bboxes {boolean} -- If True, return the face bounding boxes in addition to the keypoints.
+            return_landmark_score {boolean} -- If True, return the keypoint scores along with the keypoints.
+
+        Return:
+            result:
+                1. if both return_bboxes and return_landmark_score are False, result will be:
+                    landmarks
+                2. Otherwise, result will be one of the following, depending on the actual value of return_* arguments.
+                    (landmark, landmark_score, detected_face)
+                    (landmark, None,           detected_face)
+                    (landmark, landmark_score, None         )
+            ``landmarks`` holds one entry per image: the landmarks of all its faces
+            concatenated along axis 0, or an empty list when the image has no face.
+        """
+        wants_tuple = return_bboxes or return_landmark_score
+
+        if detected_faces is None:
+            detected_faces = self.face_detector.detect_from_batch(image_batch)
+
+        if len(detected_faces) == 0:
+            warnings.warn("No faces were detected.")
+            return (None, None, None) if wants_tuple else None
+
+        per_frame_landmarks = []
+        per_frame_scores = []
+        # One single-image call per frame of the batch
+        for frame_idx, frame_faces in enumerate(detected_faces):
+            # The single-image API is given a channel-last numpy array.
+            frame = image_batch[frame_idx].cpu().numpy().transpose(1, 2, 0)
+            result = self.get_landmarks_from_image(
+                frame,
+                detected_faces=frame_faces,
+                return_landmark_score=return_landmark_score,
+            )
+            if return_landmark_score:
+                frame_landmarks, frame_scores, _ = result
+                per_frame_scores.append(frame_scores)
+            else:
+                frame_landmarks = result
+            # Backward compatibility: one stacked array per frame, [] when nothing was found
+            per_frame_landmarks.append(
+                [] if frame_landmarks is None else np.concatenate(frame_landmarks, axis=0))
+
+        if not wants_tuple:
+            return per_frame_landmarks
+        return (
+            per_frame_landmarks,
+            per_frame_scores if return_landmark_score else None,
+            detected_faces if return_bboxes else None,
+        )
+
+    def get_landmarks_from_directory(self, path, extensions=None, recursive=True, show_progress_bar=True,
+                                     return_bboxes=False, return_landmark_score=False):
+        """Scan a directory for images with a given extension type(s) and predict the landmarks for each
+            face present in the images found.
+
+         Arguments:
+            path {str} -- path to the target directory containing the images
+
+        Keyword Arguments:
+            extensions {list of str} -- list containing the image extensions considered; None means
+            ['.jpg', '.png'] (default: {None})
+            recursive {boolean} -- If True, scans for images recursively (default: True)
+            show_progress_bar {boolean} -- If True displays a progress bar (default: True)
+            return_bboxes {boolean} -- If True, return the face bounding boxes in addition to the keypoints.
+            return_landmark_score {boolean} -- If True, return the keypoint scores along with the keypoints.
+
+        Return:
+            dict -- maps each image path to the result of get_landmarks_from_image for that image:
+            the landmarks alone when both return_* flags are False, otherwise the tuple
+            (landmarks, landmark_scores, bounding_boxes) with the unrequested entries set to None.
+        """
+        # A fresh list per call instead of a shared mutable default argument.
+        if extensions is None:
+            extensions = ['.jpg', '.png']
+
+        dataset = FolderData(path, self.face_detector.tensor_or_path_to_ndarray, extensions, recursive, self.verbose)
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=2, prefetch_factor=4)
+
+        predictions = {}
+        for (image_path, image) in tqdm(dataloader, disable=not show_progress_bar):
+            # batch_size is 1, so unwrap the single item of each batch.
+            image_path, image = image_path[0], image[0]
+            bounding_boxes = self.face_detector.detect_from_image(image)
+            if return_bboxes or return_landmark_score:
+                # Names follow the callee's return order: landmarks, scores, boxes.
+                landmarks, scores, bboxes = self.get_landmarks_from_image(
+                    image, bounding_boxes, return_bboxes=return_bboxes, return_landmark_score=return_landmark_score)
+                predictions[image_path] = (landmarks, scores, bboxes)
+            else:
+                predictions[image_path] = self.get_landmarks_from_image(image, bounding_boxes)
+
+        return predictions
diff --git a/FunCineForge/speaker_diarization/local/vision_tools/lip_detection.py b/FunCineForge/speaker_diarization/local/vision_tools/lip_detection.py
@@ -1,12 +1,12 @@
 import os
 import cv2
-from face_alignment import FaceAlignment, LandmarksType
+from .api import FaceAlignment, LandmarksType
 import numpy as np
 
 class LipDetector:
     """
-    在 face crop 上检测嘴巴位置。
-    使用 face_alignment 包的 FAN 模型实现
+    在 face crop 上检测唇部位置，
+    基于修改的 face_alignment api 调用 FAN 模型实现。
     """
 
     def __init__(
@@ -23,7 +23,7 @@ def __init__(
         
         if model_dir is not None:
             model_path = os.path.join(model_dir, 'fun_2d.pth')
-            net_path = os.path.join(model_dir, 'fun_2d.zip')
+            net_path = os.path.join(model_dir, 'fun_2d.zip') # 使用预下载模型避免长时间下载
             print(f"Loading FAN model from {model_path} on {device_str}...")
         else:
             model_path = None
diff --git a/FunCineForge/speaker_diarization/local/vision_tools/lip_encoder.py b/FunCineForge/speaker_diarization/local/vision_tools/lip_encoder.py