Changes from all commits · 31 commits
c58f32e
fix:Update NCCL environment variable and improve AMP usage in trainin…
Geo99pro Mar 24, 2025
fbffec7
fix: Add conditional batch normalization to dilated convolution in Di…
Geo99pro Mar 24, 2025
ebc4c45
fix: Update autocast usage in check_train_batch_size function for imp…
Geo99pro Mar 24, 2025
f24bc62
fix: Update autocast usage in check_amp function for improved compati…
Geo99pro Mar 24, 2025
48106f5
fix: Update autocast usage in VarifocalLoss for improved compatibilit…
Geo99pro Mar 24, 2025
1ec3dae
fix: Add FP16 conversion function for model optimization in torch_utils
Geo99pro Mar 24, 2025
e13be8b
fix: Update environment variable for NCCL blocking wait and adjust au…
Geo99pro Mar 24, 2025
0d6735c
fix: Simplify dilated convolution by always applying batch normalization
Geo99pro Mar 24, 2025
899d4f0
fix: Update autocast usage in check_train_batch_size for improved com…
Geo99pro Mar 24, 2025
e98471a
fix: Update autocast usage in check_amp function for improved compati…
Geo99pro Mar 24, 2025
01334b6
fix: Update autocast usage in VarifocalLoss to improve compatibility …
Geo99pro Mar 24, 2025
fcf431f
refactor: Remove convert_to_fp16 function and streamline model handli…
Geo99pro Mar 24, 2025
5bcbb15
fix: Update environment variable for NCCL blocking wait and improve a…
Geo99pro Mar 24, 2025
2d49145
fix: Enhance dilated_conv method to conditionally apply batch normali…
Geo99pro Mar 24, 2025
615db39
fix: Update autocast usage in check_train_batch_size for improved com…
Geo99pro Mar 24, 2025
e7e1479
fix: Update autocast usage in check_amp function for improved compati…
Geo99pro Mar 24, 2025
909ef27
fix: Update autocast usage in VarifocalLoss to align with PyTorch bes…
Geo99pro Mar 24, 2025
88eaaa6
fix: Add convert_to_fp16 function for model half precision conversion
Geo99pro Mar 24, 2025
015f105
feat: Add Dockerfile for environment setup with CUDA 11.8 and Python 3.9
Geo99pro Mar 24, 2025
d0ad89f
fix: Update Dockerfile to authorize reading and writing to the home d…
Geo99pro Mar 27, 2025
916c7b2
fix: Refactor autocast usage in BaseTrainer for improved compatibilit…
Geo99pro Mar 27, 2025
7314f9a
fix: Update autocast usage in check_train_batch_size for improved cla…
Geo99pro Mar 27, 2025
6b95b99
fix: Update autocast usage in check_amp for improved clarity and comp…
Geo99pro Mar 27, 2025
74ba0b2
fix: Update autocast usage in VarifocalLoss for improved clarity and …
Geo99pro Mar 27, 2025
7ba2a65
fix: Update Dockerfile to copy entire project directory for improved …
Geo99pro Mar 27, 2025
caa205a
fix: Update torch.load to include weights_only parameter for improved…
Geo99pro Mar 28, 2025
6e7d25a
fix: Update Dockerfile to change HOME environment variable and stream…
Geo99pro Mar 28, 2025
b40901a
fix: Remove the HOME environment variable and streamline the Docker f…
Geo99pro Apr 11, 2025
dcf77f9
feat: Add requirements.txt for dependency management
Geo99pro Apr 11, 2025
7e35688
feat: Add requirements.txt for dependency management
Geo99pro Apr 11, 2025
1f4dc4c
Merge branch 'feature/final-validation-after-finetuning' of https://g…
Geo99pro Apr 11, 2025
41 changes: 41 additions & 0 deletions Dockerfile
@@ -0,0 +1,41 @@
# Change the CUDA and cuDNN versions below to match your system's CUDA and cuDNN versions
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive

# Install libraries and dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
software-properties-common \
ffmpeg \
libsm6 \
libxext6 \
git \
curl \
unzip \
wget \
tar \
build-essential \
libopenmpi-dev \
libcairo2-dev \
pkg-config \
cmake \
libpoppler-cpp-dev \
poppler-utils \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update \
&& apt-get install -y --no-install-recommends python3.9 python3.9-distutils python3.9-dev python3-pip \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

RUN pip install --upgrade setuptools

# Update Python symlink to point to Python 3.9
RUN ln -sf /usr/bin/python3.9 /usr/bin/python \
&& ln -sf /usr/bin/python3.9 /usr/bin/python3 \
&& pip install huggingface-hub

COPY requirements.txt .

RUN pip install -r requirements.txt

CMD [ "bash" ]
5 changes: 3 additions & 2 deletions doclayout_yolo/engine/trainer.py
@@ -19,6 +19,7 @@
import torch
from torch import distributed as dist
from torch import nn, optim
import torch.amp

from doclayout_yolo.cfg import get_cfg, get_save_dir
from doclayout_yolo.data.utils import check_cls_dataset, check_det_dataset
@@ -226,7 +227,7 @@ def _setup_ddp(self, world_size):
torch.cuda.set_device(RANK)
self.device = torch.device("cuda", RANK)
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ["NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
dist.init_process_group(
backend="nccl" if dist.is_nccl_available() else "gloo",
timeout=timedelta(seconds=32400),  # 9 hours
@@ -377,7 +378,7 @@ def _do_train(self, world_size=1):
x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])

# Forward
with torch.cuda.amp.autocast(self.amp):
with torch.amp.autocast(device_type="cuda", enabled=self.amp): #https://pytorch.org/docs/stable/amp.html#
batch = self.preprocess_batch(batch)
self.loss, self.loss_items = self.model(batch)
if RANK != -1:
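The change above swaps the deprecated `torch.cuda.amp.autocast(self.amp)` for the device-agnostic `torch.amp.autocast(device_type=..., enabled=...)`. A minimal sketch of the new API on CPU (bfloat16, since fp16 autocast is CUDA-only; the model and tensors here are illustrative, not from the trainer):

```python
import torch

def forward_with_amp(model, x, enabled=True):
    # torch.amp.autocast replaces the deprecated torch.cuda.amp.autocast;
    # device_type selects the backend, enabled toggles mixed precision.
    with torch.amp.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=enabled):
        return model(x)

model = torch.nn.Linear(4, 2)
x = torch.randn(3, 4)
y_amp = forward_with_amp(model, x, enabled=True)    # bfloat16 output
y_fp32 = forward_with_amp(model, x, enabled=False)  # float32 output
```

With `enabled=False` the context manager is a no-op, which is why the diff can thread `self.amp` straight through as the `enabled` flag.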
9 changes: 7 additions & 2 deletions doclayout_yolo/nn/modules/g2l_crm.py
@@ -33,10 +33,15 @@ def __init__(self, c, dilation, k, fuse="sum", shortcut=True):

def dilated_conv(self, x, dilation):
act = self.dcv.act
bn = self.dcv.bn
weight = self.dcv.conv.weight
padding = dilation * (self.k//2)
return act(bn(F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation)))
if hasattr(self.dcv, "bn") and self.dcv.bn is not None:
bn = self.dcv.bn
x = bn(F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation))
else:
x = F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation)

return act(x) # Apply activation function
Comment on lines +38 to +44 (Author):
Sir @JulioZhao97, I did this because while debugging I noticed that in https://github.com/opendatalab/DocLayout-YOLO/blob/main/doclayout_yolo/nn/tasks.py, specifically in the fuse function (line 180), you merge the Conv2d() and BatchNorm2d() layers to speed up inference. Since the bn attribute is deleted after fusion, the model no longer has it, hence this guard. I could be wrong, but it would be a pleasure to learn from you.


def forward(self, x):
"""'forward()' applies the YOLO FPN to input data."""
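For context on the author's comment: Ultralytics-style `fuse()` folds each BatchNorm's statistics into the preceding Conv's weights and then deletes the `bn` attribute, which is why the `hasattr(self.dcv, "bn")` guard above is needed after fusion. A sketch of the standard folding math (a hypothetical standalone helper, not the project's exact implementation):

```python
import torch
import torch.nn as nn

def fuse_conv_and_bn(conv, bn):
    """Fold BatchNorm2d running statistics into Conv2d weights (standard fusion)."""
    fused = nn.Conv2d(conv.in_channels, conv.out_channels, conv.kernel_size,
                      stride=conv.stride, padding=conv.padding,
                      dilation=conv.dilation, groups=conv.groups, bias=True)
    # scale = gamma / sqrt(running_var + eps), applied per output channel
    scale = bn.weight / torch.sqrt(bn.running_var + bn.eps)
    fused.weight.data = conv.weight.data * scale.reshape(-1, 1, 1, 1)
    conv_bias = conv.bias.data if conv.bias is not None else torch.zeros(conv.out_channels)
    fused.bias.data = (conv_bias - bn.running_mean) * scale + bn.bias.data
    return fused

conv = nn.Conv2d(3, 8, 3, padding=1, bias=False)
bn = nn.BatchNorm2d(8).eval()  # eval mode: forward uses running stats
x = torch.randn(1, 3, 16, 16)
with torch.no_grad():
    y_ref = bn(conv(x))                    # unfused reference
    y_fused = fuse_conv_and_bn(conv, bn)(x)  # single fused conv
```

After fusion the `bn` module is redundant and can be dropped, so any code that still reaches for `self.dcv.bn` must check for its presence first.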
4 changes: 2 additions & 2 deletions doclayout_yolo/utils/autobatch.py
@@ -5,6 +5,7 @@

import numpy as np
import torch
import torch.amp

from doclayout_yolo.utils import DEFAULT_CFG, LOGGER, colorstr
from doclayout_yolo.utils.torch_utils import profile
@@ -22,8 +23,7 @@ def check_train_batch_size(model, imgsz=640, amp=True):
Returns:
(int): Optimal batch size computed using the autobatch() function.
"""

with torch.cuda.amp.autocast(amp):
with torch.amp.autocast(device_type="cuda", enabled=amp):
return autobatch(deepcopy(model).train(), imgsz) # compute optimal batch size


4 changes: 2 additions & 2 deletions doclayout_yolo/utils/checks.py
@@ -13,11 +13,11 @@
from importlib import metadata
from pathlib import Path
from typing import Optional

import cv2
import numpy as np
import requests
import torch
import torch.amp
from matplotlib import font_manager

from doclayout_yolo.utils import (
@@ -638,7 +638,7 @@ def check_amp(model):
def amp_allclose(m, im):
"""All close FP32 vs AMP results."""
a = m(im, device=device, verbose=False)[0].boxes.data # FP32 inference
with torch.cuda.amp.autocast(True):
with torch.amp.autocast(device_type="cuda", enabled=True):
b = m(im, device=device, verbose=False)[0].boxes.data # AMP inference
del m
return a.shape == b.shape and torch.allclose(a, b.float(), atol=0.5) # close to 0.5 absolute tolerance
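`check_amp` accepts AMP only if half-precision inference stays within a loose absolute tolerance of the FP32 result. The same comparison pattern can be sketched on CPU (bfloat16 standing in for CUDA fp16; the model and tolerance here are illustrative):

```python
import torch

@torch.no_grad()
def amp_allclose(model, x, atol=0.5):
    """Compare FP32 and autocast outputs, mirroring check_amp's sanity test."""
    a = model(x)  # FP32 reference
    with torch.amp.autocast(device_type="cpu", dtype=torch.bfloat16, enabled=True):
        b = model(x)  # mixed-precision result
    # Same shape and close values => AMP is safe to enable
    return a.shape == b.shape and torch.allclose(a, b.float(), atol=atol)

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 4))
ok = amp_allclose(model, torch.randn(2, 8))
```

The generous `atol=0.5` matches the diff's comment: the check guards against NaN/garbage outputs, not against ordinary half-precision rounding.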
3 changes: 2 additions & 1 deletion doclayout_yolo/utils/loss.py
@@ -1,6 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license

import torch
import torch.amp
import torch.nn as nn
import torch.nn.functional as F

@@ -30,7 +31,7 @@ def __init__(self):
def forward(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
"""Computes varfocal loss."""
weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label) + gt_score * label
with torch.cuda.amp.autocast(enabled=False):
with torch.amp.autocast(device_type="cuda", enabled=False):
loss = (
(F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(), reduction="none") * weight)
.mean(1)
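`VarifocalLoss` disables autocast around the loss because `binary_cross_entropy_with_logits` on half-precision logits can lose precision or overflow, so the computation is forced back to FP32; the diff keeps that behavior under the new API. A minimal sketch of the force-FP32 pattern (CPU device type and tensors for illustration):

```python
import torch
import torch.nn.functional as F

def stable_bce(pred_score, gt_score, weight):
    # Disable autocast so the loss is computed in full precision even when
    # called from inside a surrounding mixed-precision region.
    with torch.amp.autocast(device_type="cpu", enabled=False):
        loss = (F.binary_cross_entropy_with_logits(
            pred_score.float(), gt_score.float(), reduction="none") * weight).mean()
    return loss

pred = torch.randn(4, 10)   # logits
gt = torch.rand(4, 10)      # soft targets in [0, 1]
w = torch.ones(4, 10)
loss = stable_bce(pred, gt, w)
```

The explicit `.float()` casts matter: inputs arriving as fp16/bf16 from an autocast region are promoted before the numerically sensitive log-sigmoid terms are evaluated.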
25 changes: 18 additions & 7 deletions doclayout_yolo/utils/torch_utils.py
@@ -462,6 +462,15 @@ def update_attr(self, model, include=(), exclude=("process_group", "reducer")):
if self.enabled:
copy_attr(self.ema, model, include, exclude)

def convert_to_fp16(model):
    """Convert the model's normalization layers to FP16 (half precision) and return the model."""
    for layer in model.modules():
        if isinstance(layer, (nn.BatchNorm2d, nn.BatchNorm1d, nn.SyncBatchNorm)):
            try:
                layer.half()
            except Exception:
                LOGGER.warning(f"layer {layer} not supported for FP16 conversion")
    return model

def strip_optimizer(f: Union[str, Path] = "best.pt", s: str = "") -> None:
"""
@@ -483,21 +492,23 @@ def strip_optimizer(f: Union[str, Path] = "best.pt", s: str = "") -> None:
strip_optimizer(f)
```
"""
x = torch.load(f, map_location=torch.device("cpu"))
x = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
if "model" not in x:
LOGGER.info(f"Skipping {f}, not a valid Ultralytics model.")
return

if hasattr(x["model"], "args"):
x["model"].args = dict(x["model"].args) # convert from IterableSimpleNamespace to dict
model = x["model"]
if hasattr(model, "args"):
model.args = dict(model.args) # convert from IterableSimpleNamespace to dict
args = {**DEFAULT_CFG_DICT, **x["train_args"]} if "train_args" in x else None # combine args
if x.get("ema"):
x["model"] = x["ema"] # replace model with ema
for k in "optimizer", "best_fitness", "ema", "updates": # keys
model = x["ema"] # replace model with ema
for k in ["optimizer", "best_fitness", "ema", "updates"]: # keys
x[k] = None
x["epoch"] = -1
x["model"].half() # to FP16
for p in x["model"].parameters():
model = convert_to_fp16(model) # to FP16
#x["model"].half() # to FP16
for p in model.parameters():
p.requires_grad = False
x["train_args"] = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS} # strip non-default keys
# x['model'].args = x['train_args']
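The `weights_only=False` pin above matters because PyTorch 2.6 changed the default of `torch.load` to `weights_only=True`, which rejects checkpoints that pickle whole model objects, as these Ultralytics-style checkpoints do. A small round-trip sketch (the path and module are illustrative, not from the repo):

```python
import os
import tempfile
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
path = os.path.join(tempfile.mkdtemp(), "ckpt.pt")

# A full-object checkpoint (what strip_optimizer reads) pickles the nn.Module
# itself, so loading it requires weights_only=False under PyTorch >= 2.6.
torch.save({"model": model, "epoch": 3}, path)
ckpt = torch.load(path, map_location="cpu", weights_only=False)

ckpt["model"].half()  # FP16 conversion, as strip_optimizer does before re-saving
```

Note that `weights_only=False` executes arbitrary pickled code, so it should only be used on checkpoints from trusted sources.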
83 changes: 83 additions & 0 deletions requirements.txt
Hello sir @JulioZhao97, I've added a requirements file to better manage the dependencies, which makes the Docker setup easier to use.

@@ -0,0 +1,83 @@
albucore==0.0.23
albumentations==2.0.5
annotated-types==0.7.0
blinker==1.4
certifi==2025.1.31
charset-normalizer==3.4.1
contourpy==1.3.0
cryptography==3.4.8
cycler==0.12.1
dbus-python==1.2.18
distro==1.7.0
eval_type_backport==0.2.2
filelock==3.18.0
fonttools==4.56.0
fsspec==2025.3.0
httplib2==0.20.2
huggingface-hub==0.29.3
idna==3.10
importlib-metadata==4.6.4
importlib_resources==6.5.2
jeepney==0.7.1
Jinja2==3.1.6
keyring==23.5.0
kiwisolver==1.4.7
launchpadlib==1.10.16
lazr.restfulclient==0.14.4
lazr.uri==1.0.6
MarkupSafe==3.0.2
matplotlib==3.9.4
more-itertools==8.10.0
mpmath==1.3.0
networkx==3.2.1
numpy==2.0.2
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
oauthlib==3.2.0
opencv-python==4.11.0.86
opencv-python-headless==4.11.0.86
packaging==24.2
pandas==2.2.3
pillow==11.1.0
psutil==7.0.0
py-cpuinfo==9.0.0
pydantic==2.11.0
pydantic_core==2.33.0
PyGObject==3.42.1
PyJWT==2.3.0
pyparsing==2.4.7
python-apt==2.4.0+ubuntu4
python-dateutil==2.9.0.post0
pytz==2025.2
PyYAML==6.0.2
requests==2.32.3
scipy==1.13.1
seaborn==0.13.2
SecretStorage==3.3.1
simsimd==6.2.1
six==1.16.0
stringzilla==3.12.3
sympy==1.13.1
thop==0.1.1.post2209072238
torch==2.6.0
torchvision==0.21.0
tqdm==4.67.1
triton==3.2.0
typing-inspection==0.4.0
typing_extensions==4.13.0
tzdata==2025.2
urllib3==2.3.0
#pycairo
wadllib==1.3.6
zipp==3.21.0