Skip to content

Commit

Permalink
Merge branch 'main' into text_tower_refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
mitchellnw authored Nov 3, 2022
2 parents 90a890f + cdb5e20 commit 9093d5e
Show file tree
Hide file tree
Showing 27 changed files with 2,670 additions and 33 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ Below are checkpoints of models trained on YFCC-15M, along with their zero-shot

We offer a simple model interface to instantiate both pre-trained and untrained models.

NOTE: Many existing checkpoints use the QuickGELU activation from the original OpenAI models. This activation is actually less efficient that native torch.nn.GELU in recent versions of PyTorch. The model defaults are now nn.GELU, so one should use model definitions with `-quickgelu` postfix for the OpenCLIP pretrained weights. All OpenAI pretrained weights will always default to QuickGELU. One can also use the non `-quickgelu` model definitions with pretrained weights using QuickGELU but there will be an accuracy drop, for fine-tune that will likely vanish for longer runs.
NOTE: Many existing checkpoints use the QuickGELU activation from the original OpenAI models. This activation is actually less efficient than native torch.nn.GELU in recent versions of PyTorch. The model defaults are now nn.GELU, so one should use model definitions with `-quickgelu` postfix for the OpenCLIP pretrained weights. All OpenAI pretrained weights will always default to QuickGELU. One can also use the non `-quickgelu` model definitions with pretrained weights using QuickGELU but there will be an accuracy drop, for fine-tune that will likely vanish for longer runs.

Future trained models will use nn.GELU.

Expand Down
20 changes: 20 additions & 0 deletions scripts.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Scratch commands for launching OpenCLIP training runs on the FAIR cluster.

# 2-GPU single-node debug run: ViT-B/32 on the LAION-400M webdataset shards.
/private/home/mitchellw/miniconda3/envs/open_clip/bin/torchrun --nproc_per_node 2 -m training.main \
--train-data '/datasets01/laion400m/laion400m-met-release/laion400m-dataset/{00000..41627}.tar' \
--train-num-samples 10968539 \
--dataset-type webdataset \
--batch-size 10 \
--precision amp \
--workers 4 --model ViT-B/32 \
--imagenet-val /datasets01/imagenet_full_size/061417/val

# SECURITY FIX: a wandb API key was previously committed here in plaintext.
# That key must be revoked; the key is now read from the environment instead
# of being stored in the repository.
wandb login --relogin --host https://api.wandb.ai "${WANDB_API_KEY:?set WANDB_API_KEY in the environment}"

# Allocate an interactive 8x 32GB-Volta node, then launch an 8-GPU ViT-H/14 run.
srun --gpus-per-node=8 --nodes=1 --partition=devlab --time=72:00:00 -C volta32gb --cpus-per-task 48 --pty /bin/bash -l
/private/home/mitchellw/miniconda3/envs/open_clip/bin/torchrun --nproc_per_node 8 -m training.main --ddp-static-graph --local-loss --dataset-resampled --gather-with-grad --grad-checkpointing --train-data '/datasets01/laion400m/laion400m-met-release/laion400m-dataset/{00000..41627}.tar' --train-num-samples 10968539 --dataset-type webdataset --precision amp --workers 4 --model ViT-H/14 --imagenet-val /datasets01/imagenet_full_size/061417/val --batch-size 16

# Small ViT-B/32 debug run (25600 samples) that reports to wandb.
/private/home/mitchellw/miniconda3/envs/open_clip/bin/torchrun --nproc_per_node 8 -m training.main --ddp-static-graph --local-loss --dataset-resampled --gather-with-grad --grad-checkpointing --train-data '/datasets01/laion400m/laion400m-met-release/laion400m-dataset/{00000..41627}.tar' --train-num-samples 25600 --dataset-type webdataset --precision amp --workers 4 --model ViT-B/32 --batch-size 64 --report-to wandb --name wdsdebug2

# Per-GPU batch sizes observed to fit (32GB / 16GB cards):
# ViT-L: 512 / 256
# ViT-H: 128 / unknown
# ViT-g: 32  / unknown
74 changes: 74 additions & 0 deletions src/flows/current/b32-4096.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import argparse
import os
from training.params import get_default_params

from run_with_submitit import main_with_args, parse_args


if __name__ == "__main__":
    # Flow: ViT-B/32 on LAION-400M, global batch 4096 (128 x 8 GPUs x 4 nodes),
    # submitted through run_with_submitit.

    args = parse_args()
    args.model = 'ViT-B/32'

    # Backfill model-dependent optimizer defaults for anything not passed on
    # the command line.
    for key, value in get_default_params(args.model).items():
        if getattr(args, key) is None:
            setattr(args, key, value)
            print('setting default', key, value)

    # All flow-specific settings, applied in one pass below.
    overrides = dict(
        ngpus=8,
        batch_size=128,
        nodes=4,
        lr=1e-3,
        partition='devlab',
        use_volta32=False,
        imagenet_val='/datasets01/imagenet_full_size/061417/val',
        train_data='/datasets01/laion400m/laion400m-met-release/laion400m-dataset/{00000..41627}.tar',
        train_num_samples=5000000,
        dataset_type='webdataset',
        precision='amp',
        workers=6,
        report_to='wandb',
        seed=1,
        ddp_static_graph=True,
        local_loss=True,
        dataset_resampled=True,
        gather_with_grad=True,
        grad_checkpointing=True,
        save_frequency=1,
        zeroshot_frequency=2,
        warmup=10000,
    )
    for key, value in overrides.items():
        setattr(args, key, value)

    # 16 passes over the 400M-sample dataset, counted in 5M-sample "epochs".
    args.epochs = int(16 * 400000000 / args.train_num_samples)

    run_name = f'b32-400m-opt-{args.lr}-{args.beta1}-{args.beta2}-{args.eps}-bs-{args.batch_size * args.ngpus * args.nodes}-{args.precision}-v{args.seed}'
    # Log under the shared checkpoint tree when it is available on this host.
    if os.path.exists('/checkpoint/mitchellw/experiments/open_clip'):
        args.logs = '/checkpoint/mitchellw/experiments/open_clip'
    args.name = run_name
    args.job_dir = run_name
    main_with_args(args)

"""
srun --cpu_bind=none,v --accel-bind=gn python -u src/training/main.py \
--save-frequency 1 \
--zeroshot-frequency 1 \
--train-data="/p/fastdata/mmlaion/laion2B-en/{00000..23295}.tar" \
--train-num-samples=200000000 \
--warmup 10000 \
--lr "1e-3" \
--batch-size=208 \
--epochs=160 \
--workers=6 \
--model ViT-L-14 \
--name "L14-laion2B" \
--report-to "tensorboard" \
--seed 0 \
--ddp-static-graph \
--local-loss \
--dataset-resampled \
--gather-with-grad \
--grad-checkpointing
"""
74 changes: 74 additions & 0 deletions src/flows/current/b32-8192.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import argparse
import os
from training.params import get_default_params

from run_with_submitit import main_with_args, parse_args


if __name__ == "__main__":
    # Flow: ViT-B/32 on LAION-400M, global batch 8192 (256 x 8 GPUs x 4 nodes),
    # submitted through run_with_submitit.

    args = parse_args()
    args.model = 'ViT-B/32'

    # Fill in any model-dependent optimizer defaults left unset on the CLI.
    for param, default in get_default_params(args.model).items():
        if getattr(args, param) is None:
            setattr(args, param, default)
            print('setting default', param, default)

    # Compute / cluster layout.
    args.ngpus = 8
    args.nodes = 4
    args.batch_size = 256
    args.lr = 1e-3
    args.partition = 'devlab'
    args.use_volta32 = False

    # Data: LAION-400M shards, consumed as 5M-sample "epochs"; 16 full passes
    # over the 400M samples in total.
    args.imagenet_val = '/datasets01/imagenet_full_size/061417/val'
    args.train_data = '/datasets01/laion400m/laion400m-met-release/laion400m-dataset/{00000..41627}.tar'
    args.train_num_samples = 5000000
    args.dataset_type = 'webdataset'
    args.epochs = int(16 * 400000000 / args.train_num_samples)

    args.precision = 'amp'
    args.workers = 6
    args.report_to = 'wandb'
    args.seed = 1

    # Distributed-training switches.
    for flag in ('ddp_static_graph', 'local_loss', 'dataset_resampled',
                 'gather_with_grad', 'grad_checkpointing'):
        setattr(args, flag, True)

    args.save_frequency = 1
    args.zeroshot_frequency = 2
    args.warmup = 10000

    run_name = f'b32-400m-opt-{args.lr}-{args.beta1}-{args.beta2}-{args.eps}-bs-{args.batch_size * args.ngpus * args.nodes}-{args.precision}-v{args.seed}'
    checkpoint_root = '/checkpoint/mitchellw/experiments/open_clip'
    # Log under the shared checkpoint tree when it exists on this host.
    if os.path.exists(checkpoint_root):
        args.logs = checkpoint_root
    args.name = run_name
    args.job_dir = run_name
    main_with_args(args)

"""
srun --cpu_bind=none,v --accel-bind=gn python -u src/training/main.py \
--save-frequency 1 \
--zeroshot-frequency 1 \
--train-data="/p/fastdata/mmlaion/laion2B-en/{00000..23295}.tar" \
--train-num-samples=200000000 \
--warmup 10000 \
--lr "1e-3" \
--batch-size=208 \
--epochs=160 \
--workers=6 \
--model ViT-L-14 \
--name "L14-laion2B" \
--report-to "tensorboard" \
--seed 0 \
--ddp-static-graph \
--local-loss \
--dataset-resampled \
--gather-with-grad \
--grad-checkpointing
"""
74 changes: 74 additions & 0 deletions src/flows/current/h14-4096.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import argparse
import os
from training.params import get_default_params

from run_with_submitit import main_with_args, parse_args


if __name__ == "__main__":
    # Flow: ViT-H/14 on LAION-400M, global batch 8192 (32 x 8 GPUs x 32 nodes),
    # submitted through run_with_submitit.

    args = parse_args()

    args.model = 'ViT-H/14'

    # Backfill model-dependent optimizer defaults (e.g. beta1/beta2/eps) for
    # anything the user did not pass explicitly.
    default_params = get_default_params(args.model)
    for name, val in default_params.items():
        if getattr(args, name) is None:
            setattr(args, name, val)
            print('setting default', name, val)

    args.ngpus = 8
    args.batch_size = 32
    args.nodes = 32
    args.lr = 1e-3

    args.partition = 'learnlab'
    args.use_volta32 = False

    args.imagenet_val = '/datasets01/imagenet_full_size/061417/val'
    args.train_data = '/datasets01/laion400m/laion400m-met-release/laion400m-dataset/{00000..41627}.tar'
    args.train_num_samples = 5000000
    args.dataset_type = 'webdataset'

    args.precision = 'amp'
    args.workers = 6

    # 16 passes over the 400M-sample dataset, counted in 5M-sample "epochs".
    args.epochs = int(16 * 400000000 / args.train_num_samples)
    args.report_to = 'wandb'
    args.seed = 1
    args.ddp_static_graph = True
    args.local_loss = True
    args.dataset_resampled = True
    args.gather_with_grad = True
    args.grad_checkpointing = True
    args.save_frequency = 1
    args.zeroshot_frequency = 2
    args.warmup = 10000

    # BUG FIX: the run name previously began with 'l14-' (copy-pasted from the
    # ViT-L/14 flow) even though this flow trains ViT-H/14, which would make
    # H/14 runs indistinguishable from L/14 runs in wandb and on disk.
    name = f'h14-400m-opt-{args.lr}-{args.beta1}-{args.beta2}-{args.eps}-bs-{args.batch_size * args.ngpus * args.nodes}-{args.precision}-v{args.seed}'
    # Log under the shared checkpoint tree when it is available on this host.
    if os.path.exists('/checkpoint/mitchellw/experiments/open_clip'):
        args.logs = '/checkpoint/mitchellw/experiments/open_clip'
    args.name = name
    args.job_dir = name
    main_with_args(args)

"""
srun --cpu_bind=none,v --accel-bind=gn python -u src/training/main.py \
--save-frequency 1 \
--zeroshot-frequency 1 \
--train-data="/p/fastdata/mmlaion/laion2B-en/{00000..23295}.tar" \
--train-num-samples=200000000 \
--warmup 10000 \
--lr "1e-3" \
--batch-size=208 \
--epochs=160 \
--workers=6 \
--model ViT-L-14 \
--name "L14-laion2B" \
--report-to "tensorboard" \
--seed 0 \
--ddp-static-graph \
--local-loss \
--dataset-resampled \
--gather-with-grad \
--grad-checkpointing
"""
74 changes: 74 additions & 0 deletions src/flows/current/l14-4096.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import argparse
import os
from training.params import get_default_params

from run_with_submitit import main_with_args, parse_args


if __name__ == "__main__":
    # Flow: ViT-L/14 on LAION-400M, global batch 4096 (32 x 8 GPUs x 16 nodes),
    # submitted through run_with_submitit.

    args = parse_args()
    args.model = 'ViT-L/14'

    # Apply model-specific optimizer defaults for options the caller left unset.
    defaults = get_default_params(args.model)
    for key in defaults:
        if getattr(args, key) is None:
            setattr(args, key, defaults[key])
            print('setting default', key, defaults[key])

    # Flow-specific configuration, written straight into the args namespace.
    vars(args).update(
        ngpus=8,
        batch_size=32,
        nodes=16,
        lr=1e-3,
        partition='learnlab',
        use_volta32=False,
        imagenet_val='/datasets01/imagenet_full_size/061417/val',
        train_data='/datasets01/laion400m/laion400m-met-release/laion400m-dataset/{00000..41627}.tar',
        train_num_samples=5000000,
        dataset_type='webdataset',
        precision='amp',
        workers=6,
        report_to='wandb',
        seed=1,
        ddp_static_graph=True,
        local_loss=True,
        dataset_resampled=True,
        gather_with_grad=True,
        grad_checkpointing=True,
        save_frequency=1,
        zeroshot_frequency=2,
        warmup=10000,
    )
    # 16 passes over the 400M-sample dataset, counted in 5M-sample "epochs".
    args.epochs = int(16 * 400000000 / args.train_num_samples)

    name = f'l14-400m-opt-{args.lr}-{args.beta1}-{args.beta2}-{args.eps}-bs-{args.batch_size * args.ngpus * args.nodes}-{args.precision}-v{args.seed}'
    # Log under the shared checkpoint tree when it is available on this host.
    if os.path.exists('/checkpoint/mitchellw/experiments/open_clip'):
        args.logs = '/checkpoint/mitchellw/experiments/open_clip'
    args.name = name
    args.job_dir = name
    main_with_args(args)

"""
srun --cpu_bind=none,v --accel-bind=gn python -u src/training/main.py \
--save-frequency 1 \
--zeroshot-frequency 1 \
--train-data="/p/fastdata/mmlaion/laion2B-en/{00000..23295}.tar" \
--train-num-samples=200000000 \
--warmup 10000 \
--lr "1e-3" \
--batch-size=208 \
--epochs=160 \
--workers=6 \
--model ViT-L-14 \
--name "L14-laion2B" \
--report-to "tensorboard" \
--seed 0 \
--ddp-static-graph \
--local-loss \
--dataset-resampled \
--gather-with-grad \
--grad-checkpointing
"""
Loading

0 comments on commit 9093d5e

Please sign in to comment.