Commit

GMT model architecture uploaded

fengchen025 authored Jan 29, 2025
1 parent 8d71453 commit 3f430c6
Showing 88 changed files with 12,138 additions and 0 deletions.
27 changes: 27 additions & 0 deletions mask2former/__init__.py
@@ -0,0 +1,27 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from . import data # register all new datasets
from . import modeling

# config
from .config import add_maskformer2_config

# dataset loading
from .data.dataset_mappers.coco_instance_new_baseline_dataset_mapper import COCOInstanceNewBaselineDatasetMapper
from .data.dataset_mappers.coco_panoptic_new_baseline_dataset_mapper import COCOPanopticNewBaselineDatasetMapper
from .data.dataset_mappers.mask_former_instance_dataset_mapper import (
    MaskFormerInstanceDatasetMapper,
)
from .data.dataset_mappers.mask_former_panoptic_dataset_mapper import (
    MaskFormerPanopticDatasetMapper,
)
from .data.dataset_mappers.mask_former_semantic_dataset_mapper import (
    MaskFormerSemanticDatasetMapper,
)

# models
from .maskformer_model import MaskFormer
from .guide_maskformer_model import GuideMaskFormer
from .test_time_augmentation import SemanticSegmentorWithTTA

# evaluation
from .evaluation.instance_evaluation import InstanceSegEvaluator
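
Importing the package is side-effectful: the `data` import registers the new datasets, and the remaining imports expose the public classes. A quick sketch of the resulting surface (illustrative; `GuideMaskFormer` is presumably the GMT model this commit uploads):

import mask2former

mask2former.add_maskformer2_config                # config helper, defined in config.py below
mask2former.COCOInstanceNewBaselineDatasetMapper  # LSJ-style COCO instance mapper
mask2former.MaskFormer                            # baseline meta-architecture
mask2former.GuideMaskFormer                       # guided variant added alongside it
mask2former.InstanceSegEvaluator                  # instance segmentation evaluator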
Binary file added mask2former/__pycache__/__init__.cpython-39.pyc
Binary file added mask2former/__pycache__/config.cpython-39.pyc
130 changes: 130 additions & 0 deletions mask2former/config.py
@@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
from detectron2.config import CfgNode as CN


def add_maskformer2_config(cfg):
    """
    Add config for MASK_FORMER.
    """
    # NOTE: configs from original maskformer
    # data config
    # select the dataset mapper
    cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic"
    # Color augmentation
    cfg.INPUT.COLOR_AUG_SSD = False
    # We retry random cropping until no single category in semantic segmentation GT occupies more
    # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
    # Pad image and segmentation GT in dataset mapper.
    cfg.INPUT.SIZE_DIVISIBILITY = -1

    # solver config
    # weight decay on embedding
    cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
    # optimizer
    cfg.SOLVER.OPTIMIZER = "ADAMW"
    cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1

    # for gradient accumulation
    cfg.SOLVER.GRAD_ACCU_STEPS = 1

    # mask_former model config
    cfg.MODEL.MASK_FORMER = CN()

    # loss
    cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True
    cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1
    cfg.MODEL.MASK_FORMER.CLASS_WEIGHT = 1.0
    cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0
    cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0

    # transformer config
    cfg.MODEL.MASK_FORMER.NHEADS = 8
    cfg.MODEL.MASK_FORMER.DROPOUT = 0.1
    cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048
    cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0
    cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6
    cfg.MODEL.MASK_FORMER.PRE_NORM = False

    cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256
    cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100

    cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5"
    cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False

    # mask_former inference config
    cfg.MODEL.MASK_FORMER.TEST = CN()
    cfg.MODEL.MASK_FORMER.TEST.SEMANTIC_ON = True
    cfg.MODEL.MASK_FORMER.TEST.INSTANCE_ON = False
    cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False
    cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0
    cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0
    cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False

    # Sometimes `backbone.size_divisibility` is set to 0 for some backbones (e.g. ResNet);
    # you can use this config to override it.
    cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32

    # for harmonic embedding
    cfg.MODEL.MASK_FORMER.LOGGER = 'wandb'
    cfg.MODEL.MASK_FORMER.USE_EDGE = False
    cfg.MODEL.MASK_FORMER.DILATE_KERNEL_SIZE = 5
    cfg.MODEL.MASK_FORMER.GUIDE_FUNCTION_PATH = None
    cfg.MODEL.MASK_FORMER.EMB_WEIGHT = 5.0
    cfg.MODEL.MASK_FORMER.USE_HPE = True  # fixed harmonic embedding as positional encoding
    cfg.MODEL.MASK_FORMER.USE_DPE = True  # dynamic positional-encoding generation
    cfg.MODEL.MASK_FORMER.USE_AUX_SUP = True
    # cfg.MODEL.MASK_FORMER.USE_EMB = False
    # cfg.MODEL.MASK_FORMER.USE_QUERY_INIT = False
    # cfg.MODEL.MASK_FORMER.PE_TYPE = "cart"
    # cfg.MODEL.MASK_FORMER.OUT_MOD_TYPE = None

    # pixel decoder config
    cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
    # adding transformer in pixel decoder
    cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
    # pixel decoder
    cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"

    # swin transformer backbone
    cfg.MODEL.SWIN = CN()
    cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
    cfg.MODEL.SWIN.PATCH_SIZE = 4
    cfg.MODEL.SWIN.EMBED_DIM = 96
    cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
    cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
    cfg.MODEL.SWIN.WINDOW_SIZE = 7
    cfg.MODEL.SWIN.MLP_RATIO = 4.0
    cfg.MODEL.SWIN.QKV_BIAS = True
    cfg.MODEL.SWIN.QK_SCALE = None
    cfg.MODEL.SWIN.DROP_RATE = 0.0
    cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
    cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
    cfg.MODEL.SWIN.APE = False
    cfg.MODEL.SWIN.PATCH_NORM = True
    cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
    cfg.MODEL.SWIN.USE_CHECKPOINT = False

    # NOTE: maskformer2 extra configs
    # transformer module
    cfg.MODEL.MASK_FORMER.TRANSFORMER_DECODER_NAME = "MultiScaleMaskedTransformerDecoder"

    # LSJ aug
    cfg.INPUT.IMAGE_SIZE = 1024
    cfg.INPUT.MIN_SCALE = 0.1
    cfg.INPUT.MAX_SCALE = 2.0

    # MSDeformAttn encoder configs
    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8

    # point loss configs
    # Number of points sampled during training for a mask point head.
    cfg.MODEL.MASK_FORMER.TRAIN_NUM_POINTS = 112 * 112
    # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
    # original paper.
    cfg.MODEL.MASK_FORMER.OVERSAMPLE_RATIO = 3.0
    # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
    # the original paper.
    cfg.MODEL.MASK_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75
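
A minimal bootstrap sketch of how this helper is meant to be called, assuming detectron2's stock `get_cfg()`; the YAML path is hypothetical, and (as in upstream Mask2Former) `add_deeplab_config` may need to run first so the base keys exist:

from detectron2.config import get_cfg
from mask2former import add_maskformer2_config

cfg = get_cfg()
add_maskformer2_config(cfg)                      # register every key defined above
cfg.merge_from_file("configs/gmt_example.yaml")  # hypothetical project config
cfg.freeze()
print(cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES)  # 100 unless the YAML overrides it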
2 changes: 2 additions & 0 deletions mask2former/data/__init__.py
@@ -0,0 +1,2 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from . import datasets
Binary file added mask2former/data/__pycache__/__init__.cpython-39.pyc
1 change: 1 addition & 0 deletions mask2former/data/dataset_mappers/__init__.py
@@ -0,0 +1 @@
# Copyright (c) Facebook, Inc. and its affiliates.
189 changes: 189 additions & 0 deletions mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
@@ -0,0 +1,189 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/d2/detr/dataset_mapper.py
import copy
import logging

import numpy as np
import torch

from detectron2.config import configurable
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.transforms import TransformGen
from detectron2.structures import BitMasks, Instances

from pycocotools import mask as coco_mask

__all__ = ["COCOInstanceNewBaselineDatasetMapper"]


def convert_coco_poly_to_mask(segmentations, height, width):
    masks = []
    for polygons in segmentations:
        rles = coco_mask.frPyObjects(polygons, height, width)
        mask = coco_mask.decode(rles)
        if len(mask.shape) < 3:
            mask = mask[..., None]
        mask = torch.as_tensor(mask, dtype=torch.uint8)
        mask = mask.any(dim=2)
        masks.append(mask)
    if masks:
        masks = torch.stack(masks, dim=0)
    else:
        masks = torch.zeros((0, height, width), dtype=torch.uint8)
    return masks
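

# A toy usage sketch (illustrative, with assumed values): a single instance whose
# segmentation is one triangle polygon in COCO's flat [x0, y0, x1, y1, ...] format.
#
#   triangle = [[0.0, 0.0, 4.0, 0.0, 0.0, 4.0]]
#   masks = convert_coco_poly_to_mask([triangle], height=8, width=8)
#   masks.shape  # torch.Size([1, 8, 8]) -- one binary mask per instance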


def build_transform_gen(cfg, is_train):
    """
    Create a list of default :class:`Augmentation` from config.
    Now it includes resizing and flipping.
    Returns:
        list[Augmentation]
    """
    assert is_train, "Only support training augmentation"
    image_size = cfg.INPUT.IMAGE_SIZE
    min_scale = cfg.INPUT.MIN_SCALE
    max_scale = cfg.INPUT.MAX_SCALE

    augmentation = []

    if cfg.INPUT.RANDOM_FLIP != "none":
        augmentation.append(
            T.RandomFlip(
                horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
                vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
            )
        )

    augmentation.extend([
        T.ResizeScale(
            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
        ),
        T.FixedSizeCrop(crop_size=(image_size, image_size)),
    ])

    return augmentation
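

# With the defaults from config.py this implements large-scale jitter (LSJ):
# random flip, random rescale into [0.1, 2.0] of the 1024x1024 target, then a
# pad-or-crop to exactly 1024x1024. A sketch (the dummy input size is arbitrary):
#
#   augs = build_transform_gen(cfg, is_train=True)
#   image, tfms = T.apply_transform_gens(augs, np.zeros((480, 640, 3), np.uint8))
#   image.shape  # (1024, 1024, 3) regardless of the input resolution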


# This is specifically designed for the COCO dataset.
class COCOInstanceNewBaselineDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format
    and maps it into a format used by MaskFormer.
    This dataset mapper applies the same transformations as DETR for COCO panoptic segmentation.
    The callable currently does the following:
    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotations
    3. Finds and applies suitable cropping to the image and annotations
    4. Converts the image and annotations to Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        tfm_gens,
        image_format,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            is_train: for training or inference
            tfm_gens: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
        """
        self.tfm_gens = tfm_gens
        logging.getLogger(__name__).info(
            "[COCOInstanceNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(str(self.tfm_gens))
        )

        self.img_format = image_format
        self.is_train = is_train

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentation
        tfm_gens = build_transform_gen(cfg, is_train)

        ret = {
            "is_train": is_train,
            "tfm_gens": tfm_gens,
            "image_format": cfg.INPUT.FORMAT,
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        # TODO: get padding mask
        # by feeding a "segmentation mask" to the same transforms
        padding_mask = np.ones(image.shape[:2])

        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
        # the crop transformation has default padding value 0 for segmentation
        padding_mask = transforms.apply_segmentation(padding_mask)
        padding_mask = ~padding_mask.astype(bool)

        image_shape = image.shape[:2]  # h, w

        # Pytorch's dataloader is efficient on torch.Tensor due to shared memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "annotations" in dataset_dict:
            # USER: Modify this if you want to keep them for some reason.
            for anno in dataset_dict["annotations"]:
                # Let's always keep mask
                # if not self.mask_on:
                #     anno.pop("segmentation", None)
                anno.pop("keypoints", None)

            # USER: Implement additional transformations if you have other types of data
            annos = [
                utils.transform_instance_annotations(obj, transforms, image_shape)
                for obj in dataset_dict.pop("annotations")
                if obj.get("iscrowd", 0) == 0
            ]
            # NOTE: does not support BitMask due to augmentation
            # Current BitMask cannot handle empty objects
            instances = utils.annotations_to_instances(annos, image_shape)
            # After transforms such as cropping are applied, the bounding box may no longer
            # tightly bound the object. As an example, imagine a triangle object
            # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
            # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
            # the intersection of the original bounding box and the cropping box.
            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
            # Need to filter empty instances first (due to augmentation)
            instances = utils.filter_empty_instances(instances)
            # Generate masks from polygons
            h, w = instances.image_size
            # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
            if hasattr(instances, 'gt_masks'):
                gt_masks = instances.gt_masks
                gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
                instances.gt_masks = gt_masks
            dataset_dict["instances"] = instances

        return dataset_dict
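
A sketch of plugging the mapper into detectron2's standard training loader; nothing below is specific to this commit, and the `cfg` is assumed to be built as in the config.py example above:

from detectron2.data import build_detection_train_loader
from mask2former import COCOInstanceNewBaselineDatasetMapper

mapper = COCOInstanceNewBaselineDatasetMapper(cfg, is_train=True)  # @configurable accepts a cfg
train_loader = build_detection_train_loader(cfg, mapper=mapper)
# each yielded item carries "image", "padding_mask", and (in training) "instances"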