Optimize facial detection

lldacing · Feb 19, 2025 · 5f1e91d · 5f1e91d
1 parent 0d3c85b
commit 5f1e91d
Show file tree

Hide file tree

Showing 6 changed files with 135 additions and 49 deletions.
diff --git a/README.md b/README.md
@@ -10,6 +10,8 @@ Must uninstall or disable `ComfyUI-PuLID-Flux` and other PuLID-Flux nodes before
 Need upgrade ComfyUI Version>=0.3.7
 
 ## Update logs
+### 2025.02.19
+- Fix: when selecting one face from multiple faces as a reference, the embeddings and the alignment features might not come from the same face.
 ### 2025.02.18
 - Supported selecting a face from multiple faces as a reference. [Example workflow](examples/PuLID_select_ref_face.png).
 ### 2025.01.27
@@ -85,9 +87,15 @@ Failed to build insightface
     - `small-large`: Sort the area of bbox from small to large.
     - `large-small`: Sort the area of bbox from large to small.
   - `input_faces_index` - The target index of the sorted bboxes.
+  - `input_faces_align_mode` - Choose the detection method for aligning facial features. 
+    - `0`: Legacy method. When an image contains multiple faces, the selected face's embedding and alignment features may not be consistent (they may come from different faces).
+    - `1`: Keeps the selected face's embedding and alignment features consistent (both come from the same face).
+    - The two modes produce slightly different results: the `align_face` image in mode `1` covers a slightly smaller area than the `embed_face` image in mode `0`.
 - PulidFluxFaceDetector
   - Can check the facial features applied in `ApplyPulidFlux`.
-  - The `embed_face` and `align_face` should be the same face, but they are generated by different detectors, and the number detected may be not consistent, so they may be not the same face.
+  - When `input_faces_align_mode = 0`, `embed_face` and `align_face` should be the same face, but they are produced by different detectors that may detect different numbers of faces, so they may not be the same face.
+  - When `input_faces_align_mode = 1`, `embed_face` and `align_face` are always the same face, because they are produced by the same detector.
+  - `face_bbox_image` - The input image with the detected face bounding boxes drawn on it (as found by the detector used for `embed_face`).
 
 ## Thanks
 

diff --git a/README_CN.md b/README_CN.md
@@ -10,6 +10,8 @@
 ComfyUI主体版本需要>=0.3.7
 
 ## 更新日志
+### 2025.02.19
+- 解决多张人脸时选择的人脸嵌入量和对齐特征不是同一个人脸的问题。
 ### 2025.02.18
 - 支持从含有多张脸的图片中选择一张脸作为参考。[示例工作流](examples/PuLID_select_ref_face.png).
 ### 2025.01.27
@@ -81,9 +83,15 @@ Failed to build insightface
     - `small-large`: 按bbox的面积从小到大排序。
     - `large-small`: 按bbox的面积从大到小排序。
   - `input_faces_index` - 从排序后的bbox选取的索引号。
+  - `input_faces_align_mode` - 选择对齐脸部特征的检测方式。
+    - `0`: 旧版本方式，一张图片中有多张脸时选择的脸部嵌入量和对齐特征可能不一致。
+    - `1`: 保持选择的脸部嵌入量和对齐特征一致。
+    - 两种出图有细微差别，值`1`的`align_face`结果图比`0`的`embed_face`范围小一点。
 - PulidFluxFaceDetector
   - 用来检查在`ApplyPulidFlux`实际使用的面部特征。
-  - `embed_face` 和 `align_face` 理论上应该是同一张脸，但它们由不同的检测器产生，可能检测到的数量不一致，因此两张图可能不是同一张脸。
+  - `input_faces_align_mode = 0`时，`embed_face` 和 `align_face` 理论上应该是同一张脸，但它们由不同的检测器产生，可能检测到的数量不一致，因此两张图可能不是同一张脸。
+  - `input_faces_align_mode = 1`时，`embed_face` 和 `align_face` 由相同的检测器产生，两张图始终是同一张脸。
+  - `face_bbox_image` - 画出检测到的脸部边界框（`embed_face`的检测器结果）。
 
 ## 感谢
 

diff --git a/examples/PuLID_select_ref_face.png b/examples/PuLID_select_ref_face.png
diff --git a/face_restoration_helper.py b/face_restoration_helper.py
@@ -37,8 +37,8 @@ def get_face_by_index(det_faces, face_sort_rule, face_index=0):
     if not 0 <= face_index < len(sorted_faces):
         # 返回第一个
         face_index = 0
-    # 返回选择的脸部、原始索引值和排序后的bbox列表
-    return sorted_faces[face_index][1], sorted_faces[face_index][0], [face[1].bbox if has_bbox_attr else face[1] for face in sorted_faces]
+    # 返回选择的脸部、原始索引值和排序后的列表
+    return sorted_faces[face_index][1], sorted_faces[face_index][0], [face[1] for face in sorted_faces]
 
 
 def get_largest_face(det_faces, h, w):
@@ -172,7 +172,8 @@ def get_face_landmarks_5(self,
             input_img = cv2.resize(self.input_img, (w, h), interpolation=cv2.INTER_LANCZOS4)
 
         with torch.no_grad():
-            bboxes = self.face_det.detect_faces(input_img, 0.97) * scale
+            # use 0.5 (old value is 0.97), keep consistent with Insightface, but still cannot ensure consistent quantity of bboxes.
+            bboxes = self.face_det.detect_faces(input_img, 0.5) * scale
         for bbox in bboxes:
             # remove faces with too small eye distance: side faces or too small faces
             eye_dist = np.linalg.norm([bbox[5] - bbox[7], bbox[6] - bbox[8]])
@@ -420,3 +421,36 @@ def clean_all(self):
         self.inverse_affine_matrices = []
         self.det_faces = []
         self.pad_input_imgs = []
+
+def draw_on(img, faces):
+    dimg = img.copy()
+    for i in range(len(faces)):
+        face = faces[i]
+        box = face.bbox.astype(np.int32)
+        color = (0, 0, 255)
+        cv2.rectangle(dimg, (box[0], box[1]), (box[2], box[3]), color, 2)
+        if face.kps is not None:
+            kps = face.kps.astype(np.int32)
+            #print(landmark.shape)
+            for l in range(kps.shape[0]):
+                color = (0, 0, 255)
+                if l == 0 or l == 3:
+                    color = (0, 255, 0)
+                cv2.circle(dimg, (kps[l][0], kps[l][1]), 1, color,
+                           2)
+
+        cv2.putText(dimg,'index: %d'%i, (box[0]-1, box[1]-4),cv2.FONT_HERSHEY_COMPLEX,0.7,(0,255,0),1)
+
+        # if face.gender is not None and face.age is not None:
+        #     cv2.putText(dimg,'%s,%d'%(face.sex,face.age), (box[0]-1, box[1]-4),cv2.FONT_HERSHEY_COMPLEX,0.7,(0,255,0),1)
+
+        #for key, value in face.items():
+        #    if key.startswith('landmark_3d'):
+        #        print(key, value.shape)
+        #        print(value[0:10,:])
+        #        lmk = np.round(value).astype(np.int)
+        #        for l in range(lmk.shape[0]):
+        #            color = (255, 0, 0)
+        #            cv2.circle(dimg, (lmk[l][0], lmk[l][1]), 1, color,
+        #                       2)
+    return dimg
diff --git a/pulidflux.py b/pulidflux.py
@@ -1,9 +1,11 @@
 import types
 import zipfile
 
+import cv2
 import torch
 from insightface.utils.download import download_file
 from insightface.utils.storage import BASE_REPO_URL
+from insightface.utils import face_align
 from torch import nn
 from torchvision import transforms
 from torchvision.transforms import functional
@@ -12,7 +14,7 @@
 import folder_paths
 import comfy
 from insightface.app import FaceAnalysis
-from .face_restoration_helper import FaceRestoreHelper, get_face_by_index
+from .face_restoration_helper import FaceRestoreHelper, get_face_by_index, draw_on
 
 from comfy import model_management
 from .eva_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
@@ -280,6 +282,7 @@ def apply_pulid_flux(self, model, pulid_flux, eva_clip, face_analysis, image, we
 
         input_face_sort = options.get('input_faces_order', "large-small")
         input_face_index = options.get('input_faces_index', 0)
+        input_face_align_mode = options.get('input_faces_align_mode', 1)
         # Analyse multiple images at multiple sizes and combine largest area embeddings
         for i in range(image.shape[0]):
             # get insightface embeddings
@@ -289,26 +292,35 @@ def apply_pulid_flux(self, model, pulid_flux, eva_clip, face_analysis, image, we
                 face_analysis.det_model.input_size = size
                 face_info = face_analysis.get(image[i])
                 if face_info:
-                    face_info, index, bboxes = get_face_by_index(face_info, face_sort_rule=input_face_sort, face_index=input_face_index)
+                    face_info, index, sorted_faces = get_face_by_index(face_info, face_sort_rule=input_face_sort, face_index=input_face_index)
+                    bboxes = [face.bbox for face in sorted_faces]
                     iface_embeds = torch.from_numpy(face_info.embedding).unsqueeze(0).to(device, dtype=dtype)
                     break
             else:
                 # No face detected, skip this image
                 logging.warning(f'Warning: No face detected in image {str(i)}')
                 continue
 
-            # get eva_clip embeddings
-            face_helper.clean_all()
-            face_helper.read_image(image[i])
-            face_helper.get_face_landmarks_5(ref_sort_bboxes=bboxes, face_index=input_face_index)
-            face_helper.align_warp_face()
-
-            if len(face_helper.cropped_faces) == 0:
-                # No face detected, skip this image
-                continue
-
-            # Get aligned face image
-            align_face = face_helper.cropped_faces[0]
+            if input_face_align_mode == 1:
+                image_size = 512
+                M = face_align.estimate_norm(face_info.kps, image_size=image_size)
+                align_face = cv2.warpAffine(image[i], M, (image_size, image_size), borderMode=cv2.BORDER_CONSTANT,
+                                            borderValue=(135, 133, 132))
+                # align_face = face_align.norm_crop(image[i], landmark=face_info.kps, image_size=image_size)
+                del M
+            else:
+                # get eva_clip embeddings
+                face_helper.clean_all()
+                face_helper.read_image(image[i])
+                face_helper.get_face_landmarks_5(ref_sort_bboxes=bboxes, face_index=input_face_index)
+                face_helper.align_warp_face()
+
+                if len(face_helper.cropped_faces) == 0:
+                    # No face detected, skip this image
+                    continue
+
+                # Get aligned face image
+                align_face = face_helper.cropped_faces[0]
             # Convert bgr face image to tensor
             align_face = image_to_tensor(align_face).unsqueeze(0).permute(0, 3, 1, 2).to(device)
             parsing_out = face_helper.face_parse(functional.normalize(align_face, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))[0]
@@ -341,6 +353,7 @@ def apply_pulid_flux(self, model, pulid_flux, eva_clip, face_analysis, image, we
         if not cond:
             # No faces detected, return the original model
             logging.warning("PuLID warning: No faces detected in any of the given images, returning unmodified model.")
+            del eva_clip, face_analysis, pulid_flux, face_helper, attn_mask
             return (model,)
 
         # average embeddings
@@ -437,17 +450,25 @@ def INPUT_TYPES(s):
                                           "default": 0, "min": 0, "max": 1000, "step": 1,
                                           "tooltip": "If the value is greater than the size of bboxes, will set value to 0."
                                       }),
+                "input_faces_align_mode": ("INT",
+                                      {
+                                          "default": 1, "min": 0, "max": 1, "step": 1,
+                                          "tooltip": "Align face mode.\n"
+                                                     "0: align_face and embed_face use different detectors. The results maybe different.\n"
+                                                     "1: align_face and embed_face use the same detector."
+                                      }),
             }
         }
 
     RETURN_TYPES = ("OPTIONS",)
     FUNCTION = "execute"
     CATEGORY = "pulid"
 
-    def execute(self,input_faces_order, input_faces_index):
+    def execute(self,input_faces_order, input_faces_index, input_faces_align_mode=1):
         options: dict = {
             "input_faces_order": input_faces_order,
             "input_faces_index": input_faces_index,
+            "input_faces_align_mode": input_faces_align_mode,
         }
         return (options, )
 
@@ -463,68 +484,83 @@ def INPUT_TYPES(s):
             }
         }
 
-    RETURN_TYPES = ("IMAGE", "IMAGE",)
-    RETURN_NAMES = ("embed_face", "align_face")
+    RETURN_TYPES = ("IMAGE", "IMAGE", "IMAGE",)
+    RETURN_NAMES = ("embed_face", "align_face", "face_bbox_image",)
     FUNCTION = "execute"
     CATEGORY = "pulid"
-    OUTPUT_IS_LIST = (True, True,)
+    OUTPUT_IS_LIST = (True, True, True,)
 
     def execute(self, face_analysis, image, options):
 
         device = comfy.model_management.get_torch_device()
-        face_helper = FaceRestoreHelper(
-            upscale_factor=1,
-            face_size=512,
-            crop_ratio=(1, 1),
-            det_model='retinaface_resnet50',
-            parsing_model='bisenet',
-            save_ext='png',
-            device=device,
-            model_rootpath=FACEXLIB_DIR
-        )
 
         input_face_sort = options.get('input_faces_order', "large-small")
         input_face_index = options.get('input_faces_index', 0)
+        input_face_align_mode = options.get('input_faces_align_mode', 1)
+
+        if input_face_align_mode == 0:
+            face_helper = FaceRestoreHelper(
+                upscale_factor=1,
+                face_size=512,
+                crop_ratio=(1, 1),
+                det_model='retinaface_resnet50',
+                parsing_model='bisenet',
+                save_ext='png',
+                device=device,
+                model_rootpath=FACEXLIB_DIR
+            )
+
         # Analyse multiple images at multiple sizes and combine largest area embeddings
         embed_faces=[]
         align_faces=[]
+        draw_embed_face_bbox=[]
         image = tensor_to_image(image)
         for i in range(image.shape[0]):
             bboxes = []
             for size in [(size, size) for size in range(640, 256, -64)]:
                 face_analysis.det_model.input_size = size
                 face_info = face_analysis.get(image[i])
                 if face_info:
-                    face_info, index, bboxes = get_face_by_index(face_info, face_sort_rule=input_face_sort,
+                    face_info, index, sorted_faces = get_face_by_index(face_info, face_sort_rule=input_face_sort,
                                                          face_index=input_face_index)
+                    bboxes = [face.bbox for face in sorted_faces]
                     embed_faces.append(crop_image(image[i], face_info.bbox, margin=10))
+                    draw_embed_face_bbox.append(image_to_tensor(draw_on(image[i], sorted_faces)).unsqueeze(0))
                     break
             else:
                 # No face detected, skip this image
                 logging.warning(f'Warning: No face detected in image {str(i)}')
                 continue
 
-            # get eva_clip embeddings
-            face_helper.clean_all()
-            face_helper.read_image(image[i])
-            face_helper.get_face_landmarks_5(ref_sort_bboxes=bboxes, face_index=input_face_index)
-            face_helper.align_warp_face()
-
-            if len(face_helper.cropped_faces) == 0:
-                # No face detected, skip this image
-                continue
-
-            # Get aligned face image
-            align_face = face_helper.cropped_faces[0]
+            if input_face_align_mode == 1:
+                image_size = 512
+                M = face_align.estimate_norm(face_info.kps, image_size=image_size)
+                align_face = cv2.warpAffine(image[i], M, (image_size, image_size), borderMode=cv2.BORDER_CONSTANT, borderValue=(135, 133, 132))
+                # align_face = face_align.norm_crop(image[i], landmark=face_info.kps, image_size=image_size)
+                del M
+            else:
+                # get eva_clip embeddings
+                face_helper.clean_all()
+                face_helper.read_image(image[i])
+                face_helper.get_face_landmarks_5(ref_sort_bboxes=bboxes, face_index=input_face_index)
+                face_helper.align_warp_face()
+
+                if len(face_helper.cropped_faces) == 0:
+                    # No face detected, skip this image
+                    continue
+
+                # Get aligned face image
+                align_face = face_helper.cropped_faces[0]
+                del face_helper
             align_faces.append(image_to_tensor(align_face).unsqueeze(0))
             del bboxes, align_face
-        del face_helper, image
+        del image
         if len(embed_faces) == 0:
             # No face detected, skip this image
             logging.warning(f'Warning: No embed face detected in image')
         if  len(align_faces) == 0:
             logging.warning(f'Warning: No align face detected in image')
-        return embed_faces, align_faces,
+        return embed_faces, align_faces, draw_embed_face_bbox,
 
 
 def crop_image(image, bbox, margin=0):

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "comfyui_pulid_flux_ll"
 description = "The implementation for PuLID-Flux, support use with TeaCache and WaveSpeed, no model pollution."
-version = "1.1.3"
+version = "1.1.4"
 license = {file = "LICENSE"}
 dependencies = ['cython', 'facexlib', 'insightface', 'onnxruntime', 'onnxruntime-gpu; sys_platform != "darwin" and (platform_machine == "x86_64" or platform_machine == "AMD64")', 'ftfy', 'timm']