@@ -412,3 +412,232 @@ def compute_class_weights(
412412 weights = torch .clamp (weights , max = max_weight )
413413
414414 return weights
415+
416+
def sample_balanced_dataset(
    data: list[dict],
    sample_size: int | float | None = None,
    balance_classes: bool = True,
    balance_strategy: str = "undersample",
    min_samples_per_class: int = 1,
    seed: int = 42,
) -> list[dict]:
    """
    Sample a subset of the dataset with optional class balancing.

    This function is useful for:
    - Quick experimentation with smaller datasets
    - Reducing training time while maintaining class representation
    - Handling class imbalance by undersampling majority classes
    - Creating balanced mini-datasets for debugging or prototyping

    Args:
        data: Full dataset list with 'score' key for each item.
        sample_size: Target sample size.
            - If float in (0.0, 1.0]: fraction of total data (e.g., 0.1 = 10%)
            - If int >= 1: absolute number of samples
            - If None: use all data (only balance if balance_classes=True)
        balance_classes: If True, attempts to balance class distribution.
            Each class will have roughly equal representation, limited by
            the smallest class size or the balance_strategy.
        balance_strategy: Strategy for balancing classes.
            - "undersample": Cap each class to target_per_class samples.
              Ensures balanced classes but may lose data from majority classes.
            - "sqrt": Use square root of original counts as weights. Reduces
              imbalance while preserving more majority class data.
            - "proportional": Maintain original distribution ratios but with
              guaranteed minimum representation per class.
        min_samples_per_class: Minimum samples to keep per class when possible
            (always capped by the actual class size, so very small classes are
            kept whole rather than causing a sampling error).
        seed: Random seed for reproducibility. Sampling uses a private
            random.Random(seed) instance, so the module-level random state is
            left untouched.

    Returns:
        Sampled dataset list (shuffled).

    Raises:
        ValueError: If sample_size is out of range or balance_strategy is
            not one of 'undersample', 'sqrt', 'proportional'.

    Example:
        >>> # Sample 10% of data with balanced classes
        >>> sampled = sample_balanced_dataset(data, sample_size=0.1, balance_classes=True)

        >>> # Sample exactly 1000 items with balanced classes
        >>> sampled = sample_balanced_dataset(data, sample_size=1000, balance_classes=True)

        >>> # Sample 20% maintaining original class distribution
        >>> sampled = sample_balanced_dataset(data, sample_size=0.2, balance_classes=False)

        >>> # Just balance classes without reducing total size
        >>> sampled = sample_balanced_dataset(data, sample_size=None, balance_classes=True)

        >>> # Use sqrt balancing for gentler rebalancing
        >>> sampled = sample_balanced_dataset(
        ...     data, sample_size=0.5, balance_classes=True, balance_strategy="sqrt"
        ... )
    """
    if not data:
        return []

    # Dedicated RNG: random.seed(seed) would clobber the global random state
    # for every other consumer of the `random` module in this process.
    rng = random.Random(seed)

    # Group data by class (score).
    class_groups: dict[int, list[dict]] = {}
    for item in data:
        class_groups.setdefault(item["score"], []).append(item)

    num_classes = len(class_groups)
    total_data = len(data)

    # Resolve sample_size into an absolute target count.
    if sample_size is None:
        target_total = total_data
    elif isinstance(sample_size, float) and 0 < sample_size <= 1.0:
        target_total = max(1, int(total_data * sample_size))
    elif isinstance(sample_size, (int, float)) and sample_size >= 1:
        target_total = max(1, min(int(sample_size), total_data))
    else:
        raise ValueError(
            f"sample_size must be float in (0, 1], int >= 1, or None. Got: {sample_size}"
        )

    # Log class distribution before sampling.
    class_counts = {cls: len(items) for cls, items in sorted(class_groups.items())}
    logger.debug(f"Original class distribution: {class_counts}")

    if not balance_classes:
        # Simple random sampling without balancing.
        if target_total >= total_data:
            sampled = data.copy()
        else:
            sampled = rng.sample(data, target_total)
        rng.shuffle(sampled)
        logger.info(
            f"Sampled {len(sampled)} items without balancing "
            f"({len(sampled) / total_data * 100:.1f}% of {total_data})"
        )
        return sampled

    # Balanced sampling.
    if balance_strategy == "undersample":
        sampled = _sample_undersampled(
            class_groups,
            target_total,
            balance_only=sample_size is None,
            min_per_class=min_samples_per_class,
            rng=rng,
        )
    elif balance_strategy == "sqrt":
        # sqrt of counts reduces imbalance while preserving more majority data.
        sqrt_counts = {cls: np.sqrt(len(items)) for cls, items in class_groups.items()}
        total_sqrt = sum(sqrt_counts.values())
        weights = {cls: count / total_sqrt for cls, count in sqrt_counts.items()}
        sampled = _sample_by_weights(
            class_groups, weights, target_total, min_samples_per_class, rng
        )
    elif balance_strategy == "proportional":
        # Keep the original distribution but guarantee minimum representation.
        weights = {cls: len(items) / total_data for cls, items in class_groups.items()}
        sampled = _sample_by_weights(
            class_groups, weights, target_total, min_samples_per_class, rng
        )
    else:
        raise ValueError(
            f"Unknown balance_strategy: {balance_strategy}. "
            f"Choose from: 'undersample', 'sqrt', 'proportional'"
        )

    rng.shuffle(sampled)

    # Log resulting distribution.
    result_counts: dict[int, int] = {}
    for item in sampled:
        score = item["score"]
        result_counts[score] = result_counts.get(score, 0) + 1
    result_counts = dict(sorted(result_counts.items()))

    logger.info(
        f"Sampled {len(sampled)} items with '{balance_strategy}' balancing "
        f"({len(sampled) / total_data * 100:.1f}% of {total_data})"
    )
    logger.debug(f"Balanced class distribution: {result_counts}")

    return sampled


def _sample_undersampled(
    class_groups: dict[int, list[dict]],
    target_total: int,
    *,
    balance_only: bool,
    min_per_class: int,
    rng: random.Random,
) -> list[dict]:
    """Cap every class at a common per-class quota; top up to target_total if budgeted.

    balance_only=True corresponds to sample_size=None: undersample every class
    to the smallest class size (true balancing, no top-up). Otherwise the
    target_total budget is split evenly across classes and any unused quota is
    refilled from the remaining (majority-class) items.
    """
    num_classes = len(class_groups)
    min_class_size = min(len(items) for items in class_groups.values())

    if balance_only:
        # Balance-only mode: undersample all classes to match the smallest.
        per_class = max(min_per_class, min_class_size)
        target_total = per_class * num_classes
    else:
        # Size-limited mode: distribute the budget evenly across classes.
        per_class = max(min_per_class, min(target_total // num_classes, min_class_size))

    sampled: list[dict] = []
    remaining_quota = target_total

    for cls in sorted(class_groups):
        items = class_groups[cls]
        n_available = len(items)
        # Take min of quota and availability, but never below the per-class
        # minimum (capped by what actually exists in the class).
        n_samples = min(per_class, n_available, remaining_quota)
        n_samples = max(n_samples, min(min_per_class, n_available))
        if n_samples > 0:
            sampled.extend(rng.sample(items, n_samples))
        remaining_quota -= n_samples

    # Second pass: if budget remains, fill from not-yet-chosen items. Dedupe by
    # object identity so no particular dict key (e.g. 'id') is required —
    # the function's contract only promises a 'score' key.
    if remaining_quota > 0 and not balance_only:
        chosen_ids = {id(item) for item in sampled}
        leftovers = [
            item
            for items in class_groups.values()
            for item in items
            if id(item) not in chosen_ids
        ]
        if leftovers:
            sampled.extend(rng.sample(leftovers, min(remaining_quota, len(leftovers))))

    return sampled


def _sample_by_weights(
    class_groups: dict[int, list[dict]],
    weights: dict[int, float],
    target_total: int,
    min_per_class: int,
    rng: random.Random,
) -> list[dict]:
    """Sample each class in proportion to its weight, then trim to target_total.

    Shared by the 'sqrt' and 'proportional' strategies, which differ only in
    how the per-class weights are computed.
    """
    sampled: list[dict] = []
    for cls in sorted(class_groups):
        items = class_groups[cls]
        n_samples = max(min_per_class, min(int(target_total * weights[cls]), len(items)))
        # Clamp to the class size: a min_per_class larger than the class would
        # otherwise make rng.sample raise ValueError.
        n_samples = min(n_samples, len(items))
        sampled.extend(rng.sample(items, n_samples))

    # The per-class minimums may overshoot the budget; trim back down.
    if len(sampled) > target_total:
        sampled = rng.sample(sampled, target_total)
    return sampled
627+
628+
def get_class_distribution(data: list[dict]) -> dict[int, int]:
    """
    Get the class distribution of a dataset.

    Args:
        data: Dataset list with 'score' key.

    Returns:
        Dictionary mapping score to count, keyed in ascending score order.
    """
    tally: dict[int, int] = {}
    for record in data:
        label = record["score"]
        tally[label] = tally.get(label, 0) + 1
    # Rebuild in sorted-key order so iteration over the result is ordered.
    return {label: tally[label] for label in sorted(tally)}
0 commit comments