From 6e17b5c798cc2f71cc35b53e6f48a9ba439038e6 Mon Sep 17 00:00:00 2001 From: Alex Raistrick Date: Tue, 1 Aug 2023 17:53:31 -0400 Subject: [PATCH] Rendering improvements (princeton-vl/infinigen_internal/#39) * Tweak render_video_final * Remove random config choosing from core.py * Create tools/pipeline_configs/base.gin, move scenetype distribution configs into it * Create noshortrender config to test on IONIC * Implement slurm niceness override, add it to render_video_final.sh * Only include camera 0 in parse_video output * Read slurm partition from ENVVAR by default * Fix config postprocess * Fix slurm envvar * Typo fixes * Use roundrobin by default * Rendering tweaks * Change trailer.gin to video.gin with 720p res * Fix niceness * Set exclude_nodes list via envvar, move niceness configs into slurm.gin * Create render_video_720p.sh, start off experimental.gin but more needs adding * Add dryrun options * Fix --override vs --overrides * Move legacy task.Fine * Retool upload func * Add slurm_1h and stereo config * Rendering & typo fixes * Update render script and slurm.gin mem amounts * Fix excluded gpus * Add queues stats to wandb, add pandas to requirements.txt * Fix num_concurrent reset 24h later * Dont keep working on scenes which have had a fatal crash * Add new timeout message to error parsing * Fix overly nested upload dirs * Add thread limit to local jobs --- docs/CHANGELOG.md | 2 +- docs/ConfiguringInfinigen.md | 17 +- requirements.txt | 2 + worldgen/config/base.gin | 5 +- worldgen/config/experimental.gin | 3 + .../config/scene_types/snowy_mountain.gin | 2 +- worldgen/config/trailer.gin | 20 - worldgen/core.py | 70 ++- worldgen/generate.py | 4 +- .../{ => dev}/generate_terrain_assets.py | 0 .../tools/{ => dev}/kernelize_surfaces.py | 0 worldgen/tools/{ => dev}/palette/.gitignore | 0 worldgen/tools/{ => dev}/palette/demo1.png | Bin worldgen/tools/{ => dev}/palette/demo2.png | Bin worldgen/tools/{ => dev}/palette/demo3.png | Bin worldgen/tools/{ => 
dev}/palette/demo4.png | Bin worldgen/tools/{ => dev}/palette/palette.py | 0 worldgen/tools/{ => dev}/palette/readme.md | 0 worldgen/tools/manage_datagen_jobs.py | 431 ++++++++++++------ worldgen/tools/pipeline_configs/base.gin | 16 + .../compute_platform/local_256GB.gin | 5 +- .../compute_platform/slurm.gin | 24 +- .../compute_platform/slurm_1h.gin | 12 + .../{ => gt_options}/blender_gt.gin | 0 .../{ => gt_options}/gt_test.gin | 0 .../{ => gt_options}/opengl_gt.gin | 0 .../gt_options/opengl_gt_noshortrender.gin | 7 + .../opengl_gt_noshortrender.gin | 7 + worldgen/tools/render_video_final.sh | 4 - worldgen/tools/results/parse_videos.py | 2 +- worldgen/tools/{ => results}/summarize.py | 45 +- worldgen/tools/scripts/render_video_1080p.sh | 8 + worldgen/tools/scripts/render_video_720p.sh | 8 + worldgen/tools/scripts/render_video_stereo.sh | 7 + worldgen/tools/{ => util}/cancel_jobs.py | 0 worldgen/tools/util/show_gpu_table.py | 2 +- worldgen/tools/util/submitit_emulator.py | 2 +- worldgen/tools/util/upload_util.py | 118 +++-- worldgen/util/organization.py | 1 - 39 files changed, 514 insertions(+), 310 deletions(-) create mode 100644 worldgen/config/experimental.gin delete mode 100644 worldgen/config/trailer.gin rename worldgen/tools/{ => dev}/generate_terrain_assets.py (100%) rename worldgen/tools/{ => dev}/kernelize_surfaces.py (100%) rename worldgen/tools/{ => dev}/palette/.gitignore (100%) rename worldgen/tools/{ => dev}/palette/demo1.png (100%) rename worldgen/tools/{ => dev}/palette/demo2.png (100%) rename worldgen/tools/{ => dev}/palette/demo3.png (100%) rename worldgen/tools/{ => dev}/palette/demo4.png (100%) rename worldgen/tools/{ => dev}/palette/palette.py (100%) rename worldgen/tools/{ => dev}/palette/readme.md (100%) create mode 100644 worldgen/tools/pipeline_configs/base.gin create mode 100644 worldgen/tools/pipeline_configs/compute_platform/slurm_1h.gin rename worldgen/tools/pipeline_configs/{ => gt_options}/blender_gt.gin (100%) rename 
worldgen/tools/pipeline_configs/{ => gt_options}/gt_test.gin (100%) rename worldgen/tools/pipeline_configs/{ => gt_options}/opengl_gt.gin (100%) create mode 100644 worldgen/tools/pipeline_configs/gt_options/opengl_gt_noshortrender.gin create mode 100644 worldgen/tools/pipeline_configs/opengl_gt_noshortrender.gin delete mode 100644 worldgen/tools/render_video_final.sh rename worldgen/tools/{ => results}/summarize.py (89%) create mode 100644 worldgen/tools/scripts/render_video_1080p.sh create mode 100644 worldgen/tools/scripts/render_video_720p.sh create mode 100644 worldgen/tools/scripts/render_video_stereo.sh rename worldgen/tools/{ => util}/cancel_jobs.py (100%) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 70b336c91..7d4193631 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,3 +1,3 @@ v1.0.0 - Beta code release
-v1.0.1 - BSD-3 license, expanded ground-truth docs, show line-credits, miscellaneous fixes +v1.0.1 - BSD-3 license, expanded ground-truth docs, show line-credits, miscellaneous fixes
v1.0.2 - New documentation, plant improvements, disk and reproducibility improvements \ No newline at end of file diff --git a/docs/ConfiguringInfinigen.md b/docs/ConfiguringInfinigen.md index d1fa2124b..be9ea583b 100644 --- a/docs/ConfiguringInfinigen.md +++ b/docs/ConfiguringInfinigen.md @@ -39,7 +39,7 @@ If you find a useful and related combination of these commandline overrides, you Our `generate.py` driver always loads [`worldgen/configs/base.gin`][../worldgen/configs/base.gin], and you can inspect / modify this file to see many common and useful gin override options. -`generate.py` also expects that one file from (configs/scene_types/)[worldgen/config/scene_types] will be loaded, and if one is not specified on the commandline it will choose one randomly according to the keys and weights in `worldgen/core.py`. These scene_type configs contain gin overrides designed to encode the semantic constraints of real natural habitats (e.g. `worldgen/scene_types/desert.gin` causes sand to appear and cacti to be more likely). +`generate.py` also expects that one file from (configs/scene_types/)[worldgen/config/scene_types] will be loaded. These scene_type configs contain gin overrides designed to encode the semantic constraints of real natural habitats (e.g. `worldgen/scene_types/desert.gin` causes sand to appear and cacti to be more likely). ### Moving beyond "Hello World" @@ -58,7 +58,7 @@ Here is a breakdown of what every commandline argument does, and ideas for how y - `--num_scenes` decides how many unique scenes the program will attempt to generate before terminating. Once you have removed `--specific_seed`, you can increase this to generate many scenes in sequence or in paralell. - `--configs desert.gin simple.gin` forces the command to generate a desert scene, and to do so with relatively low mesh detail, low render resolution, low render samples, and some asset types disabled. 
- Do `--configs snowy_mountain.gin simple.gin` to try out a different scene type (`snowy_mountain.gin` can instead be any scene_type option from `worldgen/configs/scene_types/`) - - Remove the `desert.gin` and just specify `--configs simple.gin` to use random scene types according to the weighted list in `worldgen/core.py`. + - Remove the `desert.gin` and just specify `--configs simple.gin` to use random scene types according to the weighted list in `worldgen/tools/pipeline_configs/base.gin`. - You have the option of removing `simple.gin` and specify neither of the original configs. This turns off the many detail-reduction options included in `simple.gin`, and will create scenes closer to those in our intro video, albeit at significant compute costs. Removing `simple.gin` will likely cause crashes unless using a workstation/server with large amounts of RAM and VRAM. You can find more details on optimizing scene content for performance [here](#config-overrides-for-mesh-detail-and-performance). - `--pipeline_configs local_16GB.gin monocular.gin blender_gt.gin` - `local_16GB.gin` specifies to run only a single scene at a time, and to run each task as a local python process. See [here](#configuring-available-computing-resources) for more options @@ -126,7 +126,6 @@ You will also encounter configs using what we term a "registry pattern", e.g. `w - For example, in `base_surface_registry.gin`, `surface.registry.beach` specifies `("sand", 10)` to indicate that sand has high weight to be chosen to be assigned for the beach category. - Weights are normalized by their overall sum to obtain a probability distribution. - Name strings undergo lookup in the relevant source code folders, e.g. the name "sand" in a surface registry maps to `worldgen/surfaces/templates/sand.py`. - - The random choice among scene_type configs is itself a registry, although it is hardcoded in `core.py` currently, since the choice of what configs are loaded cannot depend on a config file. 
### Config Overrides for mesh detail and performance @@ -153,7 +152,7 @@ If you find yourself bottlenecked by GPU time, you should consider the following - Reduce `base.gin`'s `full/render_image.num_samples = 8192` or `compose_scene.generate_resolution = (1920, 1080)`. This proportionally reduces rendering FLOPS, with some diminishing returns due to BVH setup time. - If your GPU(s) are _underutilized_, try the reverse of these tips. -Some scene type configs are also generally more expensive than others. `forest.gin` and `coral.gin` are very expensive due to dense detailed fauna, wheras `artic` and `snowy_mountain` are very cheap. Low-resource compute settings (<64GB) of RAM may only be able to handle a subset of our `worldgen/config/scene_type/` options, and you may wish to tune the ratios of scene_types by editing `worldgen/core.py`. +Some scene type configs are also generally more expensive than others. `forest.gin` and `coral.gin` are very expensive due to dense detailed fauna, whereas `arctic` and `snowy_mountain` are very cheap. Low-resource compute settings (<64GB of RAM) may only be able to handle a subset of our `worldgen/config/scene_type/` options, and you may wish to tune the ratios of scene_types by editing `worldgen/tools/pipeline_configs/base.gin` or otherwise overriding `sample_scene_spec.config_distribution`. 
### Other `manage_datagen_jobs.py` commandline options @@ -174,7 +173,7 @@ Most videos in the "Introducing Infinigen" launch video were made using commands ```` python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scenes 500 \ --pipeline_config slurm monocular_video cuda_terrain opengl_gt \ - --cleanup big_files --warmup_sec 60000 --config trailer high_quality_terrain + --cleanup big_files --warmup_sec 60000 --config video high_quality_terrain ```` #### Creating large-scale stereo datasets @@ -182,7 +181,7 @@ python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scen ```` python -m tools.manage_datagen_jobs --output_folder outputs/stereo_data --num_scenes 10000 \ --pipeline_config slurm stereo cuda_terrain opengl_gt \ - --cleanup big_files --warmup_sec 60000 --config trailer high_quality_terrain + --cleanup big_files --warmup_sec 60000 --config high_quality_terrain ```` #### Creating a few low-resolution images to your test changes @@ -220,7 +219,7 @@ python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scen ``` python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scenes 500 \ --pipeline_config slurm monocular_video cuda_terrain opengl_gt \ - --cleanup big_files --warmup_sec 30000 --config trailer high_quality_terrain \ + --cleanup big_files --warmup_sec 30000 --config video high_quality_terrain \ --overrides camera.camera_pose_proposal.altitude=["uniform", 20, 30] ``` @@ -230,8 +229,8 @@ python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scen ``` python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scenes 500 \ --pipeline_config slurm monocular_video cuda_terrain opengl_gt \ - --cleanup big_files --warmup_sec 30000 --config trailer high_quality_terrain \ + --cleanup big_files --warmup_sec 30000 --config video high_quality_terrain \ --pipeline_overrides iterate_scene_tasks.frame_range=[1,25] ``` -:bulb: This command uses 
`--pipeline_overrides` rather than `--overrides` since it is providing a gin override to the `manage_datagen_jobs.py` process, not some part main `generate.py` driver. +:bulb: This command uses `--pipeline_overrides` rather than `--overrides` since it is providing a gin override to the `manage_datagen_jobs.py` process, not some part of the main `generate.py` driver. diff --git a/requirements.txt b/requirements.txt index 156ca419b..604885870 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,5 @@ landlab==2.4.1 scikit-learn psutil pyrender +pytest +pandas diff --git a/worldgen/config/base.gin b/worldgen/config/base.gin index 8daa82582..e828603f2 100644 --- a/worldgen/config/base.gin +++ b/worldgen/config/base.gin @@ -130,9 +130,9 @@ flat/render_image.passes_to_save = [ render_image.exposure = 1 -render_image.use_dof = 'IF_TARGET_SET' +render_image.use_dof = False render_image.dof_aperture_fstop = 3 -render_image.motion_blur = True +render_image.motion_blur = False render_image.motion_blur_shutter = 0.15 compositor_postprocessing.distort = False @@ -142,7 +142,6 @@ compose_scene.generate_resolution = (1920, 1080) get_sensor_coords.H = 720 get_sensor_coords.W = 1280 - min_terrain_distance = 2 keep_cam_pose_proposal.min_terrain_distance = %min_terrain_distance SphericalMesher.r_min = %min_terrain_distance diff --git a/worldgen/config/experimental.gin b/worldgen/config/experimental.gin new file mode 100644 index 000000000..cb54b452b --- /dev/null +++ b/worldgen/config/experimental.gin @@ -0,0 +1,3 @@ +# things that are not quite fully working correctly, but you can use if you please + +render_image.motion_blur = True # not fully supported in ground truth diff --git a/worldgen/config/scene_types/snowy_mountain.gin b/worldgen/config/scene_types/snowy_mountain.gin index 8c8884150..a8b6b74a7 100644 --- a/worldgen/config/scene_types/snowy_mountain.gin +++ b/worldgen/config/scene_types/snowy_mountain.gin @@ -27,7 +27,7 @@ compose_scene.flying_creature_registry 
= [ ] surfaces.templates.mountain.shader.layered_mountain = 0 -surfaces.templates.mountain.shader.snowy = 1 +surfaces.templates.mountain.shader.snowy = 0 # TODO: re-enable once terrain flickering resolved compose_scene.boulders_chance = 1 camera.camera_pose_proposal.pitch = ("clip_gaussian", 90, 30, 90, 100) diff --git a/worldgen/config/trailer.gin b/worldgen/config/trailer.gin deleted file mode 100644 index 3e2cb26e0..000000000 --- a/worldgen/config/trailer.gin +++ /dev/null @@ -1,20 +0,0 @@ -full/render_image.passes_to_save = [ - #['diffuse_direct', 'DiffDir'], - #['diffuse_color', 'DiffCol'], - #['diffuse_indirect', 'DiffInd'], - #['glossy_direct', 'GlossDir'], - #['glossy_color', 'GlossCol'], - #['glossy_indirect', 'GlossInd'], - #['transmission_direct', 'TransDir'], - #['transmission_color', 'TransCol'], - #['transmission_indirect', 'TransInd'], - #['emit', 'Emit'], - #['environment', 'Env'], - ['ambient_occlusion', 'AO'] -] -flat/render_image.passes_to_save = [ - #['z', 'Depth'], - #['normal', 'Normal'], - #['vector', 'Vector'], - ['object_index', 'IndexOB'] -] diff --git a/worldgen/core.py b/worldgen/core.py index c0facf730..29a5d9c47 100644 --- a/worldgen/core.py +++ b/worldgen/core.py @@ -231,6 +231,7 @@ def execute_tasks( generate_resolution=(1920,1080), reset_assets=True, focal_length=None, + dryrun=False, ): if input_folder != output_folder: if reset_assets: @@ -241,6 +242,10 @@ def execute_tasks( if (not os.path.islink(output_folder/"assets")) and (not (output_folder/"assets").exists()) and input_folder is not None and (input_folder/"assets").exists(): os.symlink(input_folder/"assets", output_folder/"assets") # in this way, even coarse task can have input_folder to have pregenerated on-the-fly assets (e.g., in last run) to speed up developing + + if dryrun: + return + if Task.Coarse not in task: with Timer('Reading input blendfile'): bpy.ops.wm.open_mainfile(filepath=str(input_folder / 'scene.blend')) @@ -287,9 +292,6 @@ def execute_tasks( if 
Task.Populate in task: populate_scene(output_folder, terrain, scene_seed) - if Task.Fine in task: - raise RuntimeError(f'{task=} contains deprecated {Task.Fine=}') - if Task.FineTerrain in task: terrain.fine_terrain(output_folder) @@ -305,17 +307,15 @@ def execute_tasks( for mesh in os.listdir(input_folder): if (mesh.endswith(".glb") or mesh.endswith(".b_displacement.npy")) and not os.path.islink(output_folder / mesh): os.symlink(input_folder / mesh, output_folder / mesh) - if Task.Coarse in task or Task.Populate in task or Task.FineTerrain in task: - bpy.context.preferences.system.scrollback = 100 - bpy.context.preferences.edit.undo_steps = 100 + if Task.Coarse in task or Task.Populate in task: + with Timer(f'Writing output blendfile'): logging.info(f'Writing output blendfile to {output_folder / output_blend_name}') bpy.ops.wm.save_mainfile(filepath=str(output_folder / output_blend_name)) tag_system.save_tag(path=str(output_folder / "MaskTag.json")) with (output_folder/ "version.txt").open('w') as f: - scene_version = get_scene_tag('VERSION') - f.write(f"{scene_version}\n") + f.write(f"{VERSION}\n") with (output_folder/'polycounts.txt').open('w') as f: save_polycounts(f) @@ -361,39 +361,25 @@ def apply_scene_seed(args): np.random.seed(scene_seed) return scene_seed -def apply_gin_configs(args, scene_seed, skip_unknown=False): - - scene_types = [p.stem for p in Path('config/scene_types').iterdir()] - scene_specified = any(s in scene_types or s.startswith("figure") for s in args.gin_config) - - weights = { - "kelp_forest": 0.3, - "coral_reef": 1, - "forest": 2, - "river": 2, - "desert": 1, - "coast": 1, - "cave": 1, - "mountain": 1, - "canyon": 1, - "plain": 1, - "cliff": 1, - "arctic": 1, - "snowy_mountain": 1, - } - assert all(k in scene_types for k in weights) - - scene_types = [s for s in scene_types if s in weights] - weights = np.array([weights[k] for k in scene_types], dtype=float) - weights /= weights.sum() +@gin.configurable +def apply_gin_configs( + args, + 
scene_seed, + skip_unknown=False, + mandatory_config_dir=Path('config/scene_types'), +): - if not scene_specified: - scene_type = np.random.RandomState(scene_seed).choice(scene_types, p=weights) - logging.warning(f'Randomly selected {scene_type=}. IF THIS IS NOT INTENDED THEN YOU ARE MISSING SCENE CONFIGS') - if len(args.gin_config) > 0 and args.gin_config[0] == 'base': - args.gin_config = [scene_type] + args.gin_config[1:] - else: - args.gin_config = [scene_type] + args.gin_config + if mandatory_config_dir is not None: + assert mandatory_config_dir.exists() + scene_types = [p.stem for p in mandatory_config_dir.iterdir()] + scenetype_specified = any(s in scene_types or s.split('.')[0] in scene_types for s in args.configs) + + if not scenetype_specified: + print(scene_types) + raise ValueError( + f"Please load one or more config from {mandatory_config_dir} using --configs to avoid unexpected behavior. " + "If you are sure you want to proceed without, override `apply_gin_configs.mandatory_config_dir=None`" + ) def find_config(g): for p in Path('config').glob('**/*.gin'): @@ -403,8 +389,8 @@ def find_config(g): return p raise ValueError(f'Couldn not locate {g} or {g}.gin in anywhere config/**') - bindings = sanitize_gin_override(args.gin_param) - confs = [find_config(g) for g in ['base.gin'] + args.gin_config] + bindings = sanitize_gin_override(args.overrides) + confs = [find_config(g) for g in ['base.gin'] + args.configs] gin.parse_config_files_and_bindings(confs, bindings=bindings, skip_unknown=skip_unknown) def main( diff --git a/worldgen/generate.py b/worldgen/generate.py index bc8979af5..9f840a081 100644 --- a/worldgen/generate.py +++ b/worldgen/generate.py @@ -375,10 +375,10 @@ def main(): parser.add_argument('-s', '--seed', default=None, help="The seed used to generate the scene") parser.add_argument('-t', '--task', nargs='+', default=['coarse'], choices=['coarse', 'populate', 'fine_terrain', 'ground_truth', 'render', 'mesh_save']) - parser.add_argument('-g', 
'--gin_config', nargs='+', default=['base'], + parser.add_argument('-g', '--configs', nargs='+', default=['base'], help='Set of config files for gin (separated by spaces) ' 'e.g. --gin_config file1 file2 (exclude .gin from path)') - parser.add_argument('-p', '--gin_param', nargs='+', default=[], + parser.add_argument('-p', '--overrides', nargs='+', default=[], help='Parameter settings that override config defaults ' 'e.g. --gin_param module_1.a=2 module_2.b=3') parser.add_argument('--task_uniqname', type=str, default=None) diff --git a/worldgen/tools/generate_terrain_assets.py b/worldgen/tools/dev/generate_terrain_assets.py similarity index 100% rename from worldgen/tools/generate_terrain_assets.py rename to worldgen/tools/dev/generate_terrain_assets.py diff --git a/worldgen/tools/kernelize_surfaces.py b/worldgen/tools/dev/kernelize_surfaces.py similarity index 100% rename from worldgen/tools/kernelize_surfaces.py rename to worldgen/tools/dev/kernelize_surfaces.py diff --git a/worldgen/tools/palette/.gitignore b/worldgen/tools/dev/palette/.gitignore similarity index 100% rename from worldgen/tools/palette/.gitignore rename to worldgen/tools/dev/palette/.gitignore diff --git a/worldgen/tools/palette/demo1.png b/worldgen/tools/dev/palette/demo1.png similarity index 100% rename from worldgen/tools/palette/demo1.png rename to worldgen/tools/dev/palette/demo1.png diff --git a/worldgen/tools/palette/demo2.png b/worldgen/tools/dev/palette/demo2.png similarity index 100% rename from worldgen/tools/palette/demo2.png rename to worldgen/tools/dev/palette/demo2.png diff --git a/worldgen/tools/palette/demo3.png b/worldgen/tools/dev/palette/demo3.png similarity index 100% rename from worldgen/tools/palette/demo3.png rename to worldgen/tools/dev/palette/demo3.png diff --git a/worldgen/tools/palette/demo4.png b/worldgen/tools/dev/palette/demo4.png similarity index 100% rename from worldgen/tools/palette/demo4.png rename to worldgen/tools/dev/palette/demo4.png diff --git 
a/worldgen/tools/palette/palette.py b/worldgen/tools/dev/palette/palette.py similarity index 100% rename from worldgen/tools/palette/palette.py rename to worldgen/tools/dev/palette/palette.py diff --git a/worldgen/tools/palette/readme.md b/worldgen/tools/dev/palette/readme.md similarity index 100% rename from worldgen/tools/palette/readme.md rename to worldgen/tools/dev/palette/readme.md diff --git a/worldgen/tools/manage_datagen_jobs.py b/worldgen/tools/manage_datagen_jobs.py index a15c9e535..eda05fd3b 100644 --- a/worldgen/tools/manage_datagen_jobs.py +++ b/worldgen/tools/manage_datagen_jobs.py @@ -22,6 +22,7 @@ from uuid import uuid4 from enum import Enum from copy import copy +from ast import literal_eval from functools import partial, cache from collections import defaultdict @@ -29,6 +30,7 @@ from pathlib import Path from shutil import which, rmtree, copyfile, copytree +import pandas as pd from tqdm import tqdm import numpy as np @@ -43,6 +45,9 @@ from tools.util import upload_util from tools.util.upload_util import upload_job_folder # for pickle not to freak out +PARTITION_ENVVAR = 'INFINIGEN_SLURMPARTITION' # used only if enabled in config +EXCLUDE_FILE_ENVVAR = 'INFINIGEN_SLURM_EXCLUDENODES_LIST' + class JobState: NotQueued = "notqueued" Queued = "queued" @@ -55,6 +60,7 @@ class SceneState: Done = "done" Crashed = "crashed" +JOB_OBJ_SUCCEEDED = 'MARK_AS_SUCCEEDED' CONCLUDED_STATES = {JobState.Succeeded, JobState.Failed} # Will throw exception if the scene was not found. 
Sometimes this happens if the scene was queued very very recently @@ -73,6 +79,18 @@ def seff(job_obj, retry_on_error=True): raise time.sleep(1) +def node_from_slurm_jobid(scene_id): + + if not which('sacct'): + return None + + try: + node_of_scene, *rest = subprocess.check_output(f"{which('sacct')} -j {scene_id} --format Node --noheader".split()).decode().split() + return node_of_scene + except Exception as e: + logging.warning(f'sacct threw {e}') + return None + def get_scene_state(scene_dict, taskname, scene_folder): if not scene_dict.get(f'{taskname}_submitted', False): @@ -84,7 +102,7 @@ def get_scene_state(scene_dict, taskname, scene_folder): # for when both local and slurm scenes are being mixed if isinstance(job_obj, str): - assert job_obj == 'MARK_AS_SUCCEEDED' + assert job_obj == JOB_OBJ_SUCCEEDED return JobState.Succeeded elif isinstance(job_obj, LocalJob): res = job_obj.status() @@ -106,15 +124,31 @@ def seed_generator(): return hex(seed_int).removeprefix('0x') @gin.configurable -def get_cmd(seed, task, configs, taskname, output_folder, driver_script='generate.py', input_folder=None, niceness=None): +def get_cmd( + seed, + task, + configs, + taskname, + output_folder, + blender_thread_limit=None, + driver_script='generate.py', + input_folder=None, + process_niceness=None, +): if isinstance(task, list): task = " ".join(task) cmd = '' - if niceness is not None: - cmd += f'nice -n {niceness} ' - cmd += f'{BLENDER_PATH} --background -y -noaudio --python {driver_script} -- ' + if process_niceness is not None: + cmd += f'nice -n {process_niceness} ' + cmd += f'{BLENDER_PATH} --background -y -noaudio --python {driver_script} ' + + if blender_thread_limit is not None: + cmd += f'--threads {blender_thread_limit} ' + + cmd += '-- ' + if input_folder is not None: cmd += '--input_folder ' + str(input_folder) + ' ' if output_folder is not None: @@ -128,6 +162,8 @@ def get_cmd(seed, task, configs, taskname, output_folder, driver_script='generat @gin.configurable def 
get_slurm_banned_nodes(config_path=None): + if config_path == f'ENVVAR_{EXCLUDE_FILE_ENVVAR}': + config_path = os.environ.get(EXCLUDE_FILE_ENVVAR) if config_path is None: return [] with Path(config_path).open('r') as f: @@ -149,7 +185,19 @@ def get_suffix(indices): return suffix @gin.configurable -def slurm_submit_cmd(cmd, folder, name, mem_gb=None, cpus=None, gpus=0, hours=1, slurm_account=None, slurm_exclude: list = None, **_): +def slurm_submit_cmd( + cmd, + folder, + name, + mem_gb=None, + cpus=None, + gpus=0, + hours=1, + slurm_account=None, + slurm_exclude: list = None, + slurm_niceness=None, + **_ +): executor = submitit.AutoExecutor(folder=(folder / "logs")) executor.update_parameters( @@ -165,8 +213,21 @@ def slurm_submit_cmd(cmd, folder, name, mem_gb=None, cpus=None, gpus=0, hours=1, if gpus > 0: executor.update_parameters(gpus_per_node=gpus) if slurm_account is not None: + + if slurm_account == f'ENVVAR_{PARTITION_ENVVAR}': + slurm_account = os.environ.get(PARTITION_ENVVAR) + if slurm_account is None: + logging.warning(f'{PARTITION_ENVVAR=} was not set, using no slurm account') + executor.update_parameters(slurm_account=slurm_account) + slurm_additional_params = {} + + if slurm_niceness is not None: + slurm_additional_params['nice'] = slurm_niceness + + executor.update_parameters(slurm_additional_parameters=slurm_additional_params) + while True: try: if callable(cmd[0]): @@ -328,15 +389,12 @@ def queue_combined( seed, configs, taskname=None, - mem_gb=None, exclude_gpus=[], - cpus=None, gpus=0, - hours=None, - slurm_account=None, overrides=[], include_coarse=True, input_indices=None, output_indices=None, + **kwargs ): input_suffix = get_suffix(input_indices) @@ -361,14 +419,11 @@ def queue_combined( f.write(f"{' '.join(' '.join(cmd).split())}\n\n") res = submit_cmd(cmd, - mem_gb=mem_gb, folder=folder, name=name, - cpus=cpus, gpus=gpus, - hours=hours, slurm_exclude=nodes_with_gpus(*exclude_gpus) + get_slurm_banned_nodes(), - slurm_account=slurm_account, + 
**kwargs ) return res, output_folder @@ -426,10 +481,9 @@ def queue_mesh_save( ): if (output_indices['subcam'] > 0) and reuse_subcams: - return "MARK_AS_SUCCEEDED", None + return JOB_OBJ_SUCCEEDED, None input_suffix = get_suffix(input_indices) - output_suffix = get_suffix(output_indices) output_folder = Path(f'{folder}/savemesh{output_suffix}') @@ -470,13 +524,12 @@ def queue_opengl( ): if (output_indices['subcam'] > 0) and reuse_subcams: - return "MARK_AS_SUCCEEDED", None + return JOB_OBJ_SUCCEEDED, None output_suffix = get_suffix(output_indices) tmp_script = Path(folder) / "tmp" / f"opengl_{uuid4().hex}.sh" tmp_script.parent.mkdir(exist_ok=True) - print(f"Creating {tmp_script}") process_mesh_path = Path("../process_mesh/build/process_mesh").resolve() input_folder = Path(folder)/f'savemesh{output_suffix}' # OUTPUT SUFFIX IS CORRECT HERE. I know its weird. But input suffix really means 'prev tier of the pipeline @@ -508,7 +561,8 @@ def queue_opengl( with (folder / "run_pipeline.sh").open('a') as f: f.write(f"{' '.join(' '.join(cmd).split())}\n\n") - res = submit_cmd(cmd, + res = submit_cmd( + cmd, folder=folder, name=name, slurm_exclude=nodes_with_gpus(*exclude_gpus) + get_slurm_banned_nodes(), @@ -516,54 +570,99 @@ def queue_opengl( ) return res, output_folder -@gin.configurable -def init_db(args, inorder_seeds=False, enumerate_scenetypes=[None]): +def init_db_from_existing(output_folder: Path): - n_scenes = args.num_scenes + # TODO in future: directly use existing_db (with some cleanup / checking). 
- scenes = [] + db_path = output_folder/'scenes_db.csv' + if not db_path.exists(): + raise ValueError(f'Recieved --use_existing but {db_path=} did not exist') + existing_db = pd.read_csv(db_path, converters={"configs": literal_eval}) - if args.use_existing: - for seed_folder in args.output_folder.iterdir(): - - if not seed_folder.is_dir(): - continue - if not (seed_folder/'logs').exists(): - logging.warning(f'Skipping {seed_folder=} due to missing "logs" subdirectory') - continue + def init_scene(seed_folder): + if not seed_folder.is_dir(): + return None + if not (seed_folder/'logs').exists(): + logging.warning(f'Skipping {seed_folder=} due to missing "logs" subdirectory') + return None + + configs = existing_db.loc[existing_db["seed"] == seed_folder.name, "configs"].iloc[0] + + scene_dict = { + 'seed': seed_folder.name, + 'all_done': SceneState.NotDone, + 'configs': list(configs) + } - n_scenes -= 1 + finish_key = 'FINISH_' + for finish_file_name in (seed_folder/'logs').glob(finish_key + '*'): + taskname = os.path.basename(finish_file_name)[len(finish_key):] + logging.info(f'Marking {seed_folder.name=} {taskname=} as completed') + scene_dict[f'{taskname}_submitted'] = True + scene_dict[f'{taskname}_job_obj'] = JOB_OBJ_SUCCEEDED - scene_dict = {'seed': seed_folder.name, 'all_done': SceneState.NotDone} + return scene_dict - finish_key = 'FINISH_' - for finish_file_name in (seed_folder/'logs').glob(finish_key + '*'): - taskname = os.path.basename(finish_file_name)[len(finish_key):] - print(f'Marking {seed_folder.name=} {taskname=} as completed') - scene_dict[f'{taskname}_submitted'] = True - scene_dict[f'{taskname}_job_obj'] = 'MARK_AS_SUCCEEDED' + return [init_scene(seed_folder) for seed_folder in output_folder.iterdir()] - scenes.append(scene_dict) - elif args.specific_seed is not None and len(args.specific_seed): - return [{"seed": s, "all_done": SceneState.NotDone} for s in args.specific_seed] +@gin.configurable +def sample_scene_spec(i, seed_range=None, 
config_distribution=None, config_sample_mode='random'): + + if seed_range is None: + seed = seed_generator() + else: + start, end = seed_range + if i > end - start: + return None + seed = hex(start + i).removeprefix('0x') + + if config_distribution is None: + configs = [] + elif config_sample_mode == 'random': + configs_options, weights = zip(*config_distribution) # list of rows to list per column + ps = np.array(weights) / sum(weights) + configs = np.random.choice(configs_options, p=ps) + elif config_sample_mode == 'roundrobin': + configs_options, weights = zip(*config_distribution) # list of rows to list per column + if not all(isinstance(w, int) for w in weights): + raise ValueError(f'{config_sample_mode=} expects integer scene counts as weights but got {weights=} with non-integer values') + idx = np.argmin(i % sum(weights) + 1 > np.cumsum(weights)) + configs = configs_options[idx] + else: + raise ValueError(f'Unrecognized {config_sample_mode=}') - if n_scenes > 0: - for scenetype in enumerate_scenetypes: - for i in range(n_scenes//len(enumerate_scenetypes)): - seed = i if inorder_seeds else seed_generator() - configs = [] - if scenetype is not None: - configs.append(scenetype) - seed = f'{scenetype}_{i}' - scene = {"all_done": SceneState.NotDone, "seed": seed, 'scene_configs': configs} - print(f'Added scene {seed}') - scenes.append(scene) + if isinstance(configs, str) and " " in configs: + configs = configs.split(" ") + if not isinstance(configs, list): + configs = [configs] + + return { + "all_done": SceneState.NotDone, + "seed": seed, + 'configs': configs + } + +@gin.configurable +def init_db(args): + + if args.use_existing: + scenes = init_db_from_existing(args.output_folder) + elif args.specific_seed is not None: + scenes = [{"seed": s, "all_done": SceneState.NotDone} for s in args.specific_seed] + else: + scenes = [sample_scene_spec(i) for i in range(args.num_scenes)] + + scenes = [s for s in scenes if s is not None] + + if len(scenes) < args.num_scenes: + 
logging.warning(f'Initialized only {len(scenes)=} despite {args.num_scenes=}. Likely due to --use_existing, --specific_seed or seed_range.') + return scenes def update_symlink(scene_folder, scenes): for new_name, scene in scenes: - if scene == 'MARK_AS_SUCCEEDED': + if scene == JOB_OBJ_SUCCEEDED: continue elif isinstance(scene, str): raise ValueError(f'Failed due to {scene=}') @@ -600,16 +699,27 @@ def make_html_page(output_path, scenes, frame, camera_pair_id, **kwargs): with output_path.open('a') as f: f.write(html) -def run_task(queue_func, scene_folder, scene_dict, taskname): - +@gin.configurable +def run_task( + queue_func, + scene_folder, + scene_dict, + taskname, + dryrun=False +): + assert scene_folder.parent.exists(), scene_folder scene_folder.mkdir(exist_ok=True) stage_scene_name = f"{scene_folder.parent.stem}_{scene_folder.stem}_{taskname}" assert not scene_dict.get(f'{taskname}_submitted', False) + if dryrun: + scene_dict[f'{taskname}_job_obj'] = JOB_OBJ_SUCCEEDED + scene_dict[f'{taskname}_submitted'] = 1 + return + seed = scene_dict['seed'] - logging.info(f"{seed} - Submitting {taskname} scene") job_obj, output_folder = queue_func( folder=scene_folder, name=stage_scene_name, @@ -706,15 +816,21 @@ def iterate_scene_tasks( scene_folder = args.output_folder/seed get_task_state = partial(get_scene_state, scene_dict=scene_dict, scene_folder=scene_folder) - global_overrides = [f'execute_tasks.frame_range={repr(list(frame_range))}', f'execute_tasks.camera_id=[0, 0]'] - global_configs = args.configs + scene_dict.get('scene_configs', []) - global_iter = iterate_sequential_tasks(global_tasks, get_task_state, - overrides=args.override+global_overrides, configs=global_configs) + global_overrides = [ + f'execute_tasks.frame_range={repr(list(frame_range))}', + f'execute_tasks.camera_id=[0, 0]' + ] + global_configs = scene_dict.get('configs', []) + args.configs + global_iter = iterate_sequential_tasks( + global_tasks, + get_task_state, + 
overrides=args.overrides+global_overrides, + configs=global_configs + ) for state, *rest in global_iter: yield state, *rest if not state == JobState.Succeeded: - logging.debug(f'{seed=} waiting on global') return view_range = render_frame_range if render_frame_range is not None else frame_range @@ -735,7 +851,7 @@ def iterate_scene_tasks( view_idxs = dict(cam_rig=cam_rig, frame=view_frame) view_tasks_iter = iterate_sequential_tasks( view_dependent_tasks, get_task_state, - overrides=args.override+view_overrides, + overrides=args.overrides+view_overrides, configs=global_configs, output_indices=view_idxs ) for state, *rest in view_tasks_iter: @@ -743,7 +859,6 @@ def iterate_scene_tasks( if state not in CONCLUDED_STATES: if viewdep_paralell: running_views += 1 - logging.debug(f'{seed=} {cam_rig,view_frame=} waiting on viewdep') continue else: return @@ -753,6 +868,7 @@ def iterate_scene_tasks( running_blocks = 0 for subcam, resample_idx in itertools.product(subcams, resamples): for cam_frame in range(view_frame_range[0], view_frame_range[1] + 1, cam_block_size): + cam_frame_range = [cam_frame, min(view_frame_range[1], cam_frame + cam_block_size - 1)] # blender frame_end is INCLUSIVE cam_overrides = [ f'execute_tasks.frame_range=[{cam_frame_range[0]},{cam_frame_range[1]}]', @@ -761,20 +877,27 @@ def iterate_scene_tasks( ] camdep_indices = dict( - cam_rig=cam_rig, frame=cam_frame, subcam=subcam, resample=resample_idx, - view_first_frame=view_frame_range[0], last_view_frame=view_frame_range[1], last_cam_frame=cam_frame_range[1] # this line explicitly used by most jobs + cam_rig=cam_rig, + frame=cam_frame, + subcam=subcam, + resample=resample_idx, + view_first_frame=view_frame_range[0], + last_view_frame=view_frame_range[1], + last_cam_frame=cam_frame_range[1] # this line explicitly used by most jobs ) camera_dep_iter = iterate_sequential_tasks( - camera_dependent_tasks, get_task_state, - overrides=args.override+cam_overrides, configs=global_configs, + 
camera_dependent_tasks, + get_task_state, + overrides=args.overrides+cam_overrides, + configs=global_configs, input_indices=view_idxs if len(view_dependent_tasks) else None, - output_indices=camdep_indices) + output_indices=camdep_indices + ) for state, *rest in camera_dep_iter: yield state, *rest if state not in CONCLUDED_STATES: if camdep_paralell: running_blocks += 1 - logging.debug(f'{seed=} {cam_rig,cam_frame=} waiting on viewdep') continue else: return @@ -792,7 +915,6 @@ def iterate_scene_tasks( path = scene_dict[f'{taskname}_output_folder'] print(f'Cleaning {path} for {taskname}') if path == scene_folder: - print(f'Skipping {path}') continue if path is not None and path.exists(): cleanup(path) @@ -829,7 +951,7 @@ def infer_crash_reason(stdout_file, stderr_file: Path): if "System is out of GPU memory" in error_log: return "Out of GPU memory" - elif "this scene is timed-out" in error_log: + elif "this scene is timed-out" in error_log or 'DUE TO TIME LIMIT' in error_log: return "Timed out" elif "" in error_log: return "SIGKILL: 9 (out-of-memory, probably)" @@ -846,8 +968,14 @@ def infer_crash_reason(stdout_file, stderr_file: Path): output_text = f"{stdout_file.read_text()}\n{stderr_file.read_text()}\n" matches = re.findall("(Error:[^\n]+)\n", output_text) + ignore_errors = [ + 'Error: Not freed memory blocks', + ] + + matches = [m for m in matches if not any(w in m for w in ignore_errors)] + if len(matches): - return ','.join([m.strip() for m in matches if ('Error: Not freed memory blocks' not in m)]) + return ','.join(matches) else: return f"Could not summarize cause, check {stderr_file}" @@ -855,14 +983,10 @@ def record_crashed_seed(crashed_seed, crash_stage, f, fatal=True): time_str = datetime.now().strftime("%m/%d %I:%M%p") stdout_file = args.output_folder / crashed_seed / "logs" / f"{crash_stage}.out" stderr_file = args.output_folder / crashed_seed / "logs" / f"{crash_stage}.err" + scene_id, *_ = stderr_file.resolve().stem.split('_') - node_of_scene = "" 
- if which('sacct'): - try: - node_of_scene, *rest = subprocess.check_output(f"{which('sacct')} -j {scene_id} --format Node --noheader".split()).decode().split() - except Exception as e: - logging.warning(f'sacct threw {e}') - return + node_of_scene = node_from_slurm_jobid(scene_id) + reason = infer_crash_reason(stdout_file, stderr_file) text = f"{crashed_seed} {crash_stage} {scene_id} {node_of_scene} {reason} {fatal=} {time_str}\n" print('Crashed: ' + text) @@ -884,27 +1008,32 @@ def stats_summary(stats): stats = {k: v for k, v in stats.items() if not k.startswith(JobState.NotQueued)} lemmatized = set(l.split('_')[0] for l in stats.keys()) stats = {l: sum(v for k, v in stats.items() if k.startswith(l)) for l in lemmatized} - for p in set(k.split('/')[0] for k in stats.keys()): - stats[f'{p}/total'] = sum(v for k, v in stats.items() if k.startswith(p)) - return stats - -@gin.configurable -def manage_datagen_jobs(all_scenes, elapsed, num_concurrent, sleep_threshold=0.95): + + uniq_keys = set(k.split('/')[0] for k in stats.keys()) + totals = {p: sum(v for k, v in stats.items() if k.startswith(p)) for p in uniq_keys} - if LocalScheduleHandler._inst is not None: - LocalScheduleHandler.instance().poll() + for k, v in totals.items(): + stats[f'{k}/total'] = v + + return stats, totals - curr_concurrent_max = math.floor(1 + num_concurrent * elapsed / args.warmup_sec) if elapsed < args.warmup_sec else num_concurrent +def monitor_existing_jobs(all_scenes): - # Check results / current state of scenes we have already launched stats = defaultdict(int) + for scene in all_scenes: + scene['num_running'], scene['num_done'] = 0, 0 any_fatal = False for state, taskname, _, fatal in iterate_scene_tasks(scene, args, monitor_all=True): + + if state == JobState.NotQueued: + continue + stats[f'{state}/{taskname}'] += 1 scene['num_done'] += state in CONCLUDED_STATES scene['num_running'] += state not in CONCLUDED_STATES + if state == JobState.Failed: if not 
scene.get(f'{taskname}_crash_recorded', False): scene[f'{taskname}_crash_recorded'] = True @@ -913,43 +1042,70 @@ def manage_datagen_jobs(all_scenes, elapsed, num_concurrent, sleep_threshold=0.9 if fatal: any_fatal = True + if any_fatal: + scene['any_fatal_crash'] = True + if scene['num_running'] == 0 and any_fatal and scene['all_done'] == SceneState.NotDone: scene['all_done'] = SceneState.Crashed with (args.output_folder / "crash_summaries.txt").open('a') as f: check_and_perform_cleanup(args, scene['seed'], crashed=True) - # Report stats, with sums by prefix, and extra info - stats = stats_summary(stats) + return stats + +def jobs_to_launch_next(all_scenes, greedy=True): + scenes = [j for j in all_scenes if (j["all_done"] == SceneState.NotDone)] + if greedy: + scenes = sorted(scenes, key=lambda s: s['num_running'] + s['num_done'], reverse=True) + for scene in scenes: + if scene.get('any_fatal_crash', False): + continue + for state, taskname, queue_func, _ in iterate_scene_tasks(scene, args, monitor_all=False): + if state != JobState.NotQueued: + continue + yield scene, taskname, queue_func + +@gin.configurable +def manage_datagen_jobs(all_scenes, elapsed, num_concurrent, disk_sleep_threshold=0.95): + + if LocalScheduleHandler._inst is not None: + LocalScheduleHandler.instance().poll() + + warmup_pct = min(elapsed / args.warmup_sec, 1) if args.warmup_sec > 0 else 1 + curr_concurrent_max = math.ceil(warmup_pct * num_concurrent) + + # Check results / current state of scenes we have already launched + stats = monitor_existing_jobs(all_scenes) + stats, totals = stats_summary(stats) + + n_in_flight = totals.get(JobState.Running, 0) + totals.get(JobState.Queued, 0) + if n_in_flight > curr_concurrent_max: + raise ValueError(f'manage_datagen_jobs observed {n_in_flight=}, which exceeds allowed {curr_concurrent_max=}') + n_to_launch = max(curr_concurrent_max - n_in_flight, 0) + + pd.DataFrame.from_records(all_scenes).to_csv(args.output_folder/'scenes_db.csv') + + 
stats['n_in_flight'] = n_in_flight + stats['n_launching'] = n_to_launch stats['disk_usage'] = get_disk_usage(args.output_folder) stats['concurrent_max'] = curr_concurrent_max wandb.log(stats) + print("=" * 60) for k,v in sorted(stats.items()): print(f"{k.ljust(30)} : {v}") print("-" * 60) # Dont launch new scenes if disk is getting full - if stats['disk_usage'] > sleep_threshold: + if stats['disk_usage'] > disk_sleep_threshold: print(f"{args.output_folder} is too full ({get_disk_usage(args.output_folder)}%). Sleeping.") wandb.alert(title='Disk full', text=f'Sleeping due to full disk at {args.output_folder=}', wait_duration=3*60*60) time.sleep(60) return - # Launch new scenes to bring the current load back up to `curr_concurrent_max` - scenes = [j for j in all_scenes if (j["all_done"] == SceneState.NotDone)] - scenes = sorted(scenes, key=lambda s: s['num_running'] + s['num_done'], reverse=True) # greedily try to finish nearly-done videos asap - to_be_launched = curr_concurrent_max - stats.get(f'{JobState.Running}/all', 0) - stats.get(f'{JobState.Queued}/all', 0) - if to_be_launched <= 0: - return - for scene in scenes[:curr_concurrent_max]: - for state, taskname, queue_func, _ in iterate_scene_tasks(scene, args, monitor_all=False): - if state != JobState.NotQueued: - continue - to_be_launched -= 1 - run_task(queue_func, args.output_folder / str(scene['seed']), scene, taskname) - if to_be_launched == 0: - break - if to_be_launched == 0: - break + # Launch to get back to intended n=`curr_concurrent_max` that should be in flight + for spec in itertools.islice(jobs_to_launch_next(all_scenes), n_to_launch): + scene, taskname, queue_func = spec + logging.info(f"{scene['seed']} - running {taskname}") + run_task(queue_func, args.output_folder / str(scene['seed']), scene, taskname) @gin.configurable def main(args, shuffle=True, wandb_project='render_beta'): @@ -963,7 +1119,7 @@ def main(args, shuffle=True, wandb_project='render_beta'): wandb.init(name=scene_name, 
config=vars(args), project=wandb_project, mode=args.wandb_mode) logging.basicConfig( - filename=str(args.output_folder / "jobs.log"), + #filename=str(args.output_folder / "jobs.log"), level=args.loglevel, format='[%(asctime)s]: %(message)s', ) @@ -978,20 +1134,23 @@ def main(args, shuffle=True, wandb_project='render_beta'): while any(j['all_done'] == SceneState.NotDone for j in all_scenes): now = datetime.now() print(f'{args.output_folder} {start_time.strftime("%m/%d %I:%M%p")} -> {now.strftime("%m/%d %I:%M%p")}') - logging.info('=' * 80) - manage_datagen_jobs(scenes, elapsed=(now-start_time).seconds) - logging.info("-" * 80) + manage_datagen_jobs(scenes, elapsed=(now-start_time).total_seconds()) time.sleep(4) -def test_upload(args): + +def set_blender_path_global(args): - from_folder = args.output_folder/f'test_upload_{args.output_folder.name}' - from_folder.mkdir(parents=True, exist_ok=True) - (from_folder/'test_file.txt').touch() + global BLENDER_PATH + if args.blender_path is None: + if 'BLENDER' in os.environ: + BLENDER_PATH = os.environ['BLENDER'] + else: + BLENDER_PATH = '../blender/blender' # assuming we run from infinigen/worldgen + else: + BLENDER_PATH = args.blender_path + if not os.path.exists(BLENDER_PATH): + raise ValueError(f'Couldnt not find {BLENDER_PATH=}, make sure --blender_path or $BLENDER is specified') - upload_util.upload_folder(from_folder, Path('infinigen/test_upload/')) - rmtree(from_folder) - if __name__ == "__main__": assert Path('.').resolve().parts[-1] == 'worldgen' @@ -1063,7 +1222,7 @@ def test_upload(args): ) parser.add_argument( '-p', - '--override', + '--overrides', nargs='+', type=str, default=[], @@ -1089,32 +1248,22 @@ def test_upload(args): default=[], help="List of gin overrides to configure this execution", ) + parser.add_argument('--overwrite', action='store_true') parser.add_argument('-d', '--debug', action="store_const", dest="loglevel", const=logging.DEBUG, default=logging.INFO) parser.add_argument( '-v', '--verbose', 
action="store_const", dest="loglevel", const=logging.INFO) args = parser.parse_args() - envvar = 'INFINIGEN_ASSET_FOLDER' - if not args.upload and args.cleanup == 'all': - raise ValueError(f'Pipeline is configured with {args.cleanup=} yet {args.upload=} --- no output would be preserved') - - global BLENDER_PATH - if args.blender_path is None: - if 'BLENDER' in os.environ: - BLENDER_PATH = os.environ['BLENDER'] - else: - BLENDER_PATH = '../blender/blender' # assuming we run from infinigen/worldgen - else: - BLENDER_PATH = args.blender_path - if not os.path.exists(BLENDER_PATH): - raise ValueError(f'Couldnt not find {BLENDER_PATH=}, make sure --blender_path or $BLENDER is specified') - + raise ValueError(f'Pipeline is configured with {args.cleanup=} yet {args.upload=}! No output would be preserved!') + if args.upload and args.cleanup == 'none': + raise ValueError(f'--upload currently applies --cleanup big_files') assert args.specific_seed is None or args.num_scenes == 1 + set_blender_path_global(args) - if args.output_folder.exists() and not args.use_existing: - raise FileExistsError(f'--output_folder {args.output_folder} already exists! Quitting to avoid overwrite. Please delete it, or specify a new --output_folder') - - args.output_folder.mkdir(parents=True, exist_ok=args.use_existing) + overwrite_ok = args.use_existing or args.overwrite + if args.output_folder.exists() and not overwrite_ok: + raise FileExistsError(f'--output_folder {args.output_folder} already exists! 
Please delete it, specify a different --output_folder, or use --overwrite') + args.output_folder.mkdir(parents=True, exist_ok=overwrite_ok) if args.meta_seed is not None: random.seed(args.meta_seed) @@ -1127,7 +1276,7 @@ def find_config(g): if p.parts[-1] == f'{g}.gin': return p raise ValueError(f'Couldn not locate {g} or {g}.gin in anywhere pipeline_configs/**') - configs = [find_config(n) for n in args.pipeline_configs] + configs = [find_config(n) for n in ['base.gin'] + args.pipeline_configs] for c in configs: assert os.path.exists(c), c bindings = args.pipeline_overrides diff --git a/worldgen/tools/pipeline_configs/base.gin b/worldgen/tools/pipeline_configs/base.gin new file mode 100644 index 000000000..7eb526cac --- /dev/null +++ b/worldgen/tools/pipeline_configs/base.gin @@ -0,0 +1,16 @@ +sample_scene_spec.config_distribution = [ + ("forest", 4), + ("river", 4), + ("desert", 3), + ("coast", 3), + ("kelp_forest", 2), + ("coral_reef", 2), + ("cave", 2), + ("mountain", 2), + ("canyon", 2), + ("plain", 2), + ("cliff", 2), + ("arctic", 1), + ("snowy_mountain", 1), +] +sample_scene_spec.config_sample_mode = 'random' diff --git a/worldgen/tools/pipeline_configs/compute_platform/local_256GB.gin b/worldgen/tools/pipeline_configs/compute_platform/local_256GB.gin index 8d24ebf76..d7998ee33 100644 --- a/worldgen/tools/pipeline_configs/compute_platform/local_256GB.gin +++ b/worldgen/tools/pipeline_configs/compute_platform/local_256GB.gin @@ -1,12 +1,13 @@ manage_datagen_jobs.num_concurrent = 16 -# Job updater -get_cmd.niceness = 20 # let UI processes etc take precedence, to make the smooth and UI usable +get_cmd.process_niceness = 20 # let UI processes etc take precedence, to make the smooth and UI usable local_submit_cmd.use_scheduler = True LocalScheduleHandler.jobs_per_gpu = 1 +get_cmd.blender_thread_limit = 8 + # All will run locally, LocalScheduleHandler doesnt actually enforce cpu/ram constraints currently queue_coarse.submit_cmd = @local_submit_cmd 
queue_fine_terrain.submit_cmd = @local_submit_cmd diff --git a/worldgen/tools/pipeline_configs/compute_platform/slurm.gin b/worldgen/tools/pipeline_configs/compute_platform/slurm.gin index cd9686cb5..ce4a0481b 100644 --- a/worldgen/tools/pipeline_configs/compute_platform/slurm.gin +++ b/worldgen/tools/pipeline_configs/compute_platform/slurm.gin @@ -1,8 +1,9 @@ -PARTITION = None +PARTITION = 'ENVVAR_INFINIGEN_SLURMPARTITION' # change to partitionname string, or None -manage_datagen_jobs.num_concurrent = 800 -get_slurm_banned_nodes.config_path = None # add a white-space separated txt file path here +manage_datagen_jobs.num_concurrent = 200 +slurm_submit_cmd.slurm_niceness=10000 +get_slurm_banned_nodes.config_path = 'ENVVAR_INFINIGEN_SLURM_EXCLUDENODES_LIST' # Combined (only used when `stereo_combined.gin` or similar is included) queue_combined.mem_gb = 12 @@ -40,7 +41,6 @@ queue_populate.exclude_gpus = ['a6000', 'rtx_3090'] queue_render.submit_cmd = @slurm_submit_cmd queue_render.hours = 48 -rendershort/queue_render.exclude_gpus = [] # no point requesting less than 48GB RAM, 8CPUs, due to ratios of RAM:GPUs on pvl rendershort/queue_render.mem_gb = 48 rendershort/queue_render.cpus = 8 @@ -48,13 +48,13 @@ rendershort/queue_render.slurm_account = %PARTITION rendershort/queue_render.gpus = 1 rendershort/queue_render.render_type = "full" -queue_render.exclude_gpus = ['gtx_1080', 'k80'] +rendershort/queue_render.exclude_gpus = ['gtx_1080', 'k80'] renderbackup/queue_render.exclude_gpus = ['gtx_1080', 'k80', 'rtx_2080'] -renderbackup/queue_render.mem_gb = 64 -renderbackup/queue_render.cpus = 12 +renderbackup/queue_render.mem_gb = 96 +renderbackup/queue_render.cpus = 16 renderbackup/queue_render.slurm_account = %PARTITION -renderbackup/queue_render.gpus = 1 +renderbackup/queue_render.gpus = 2 renderbackup/queue_render.render_type = "full" # Upload @@ -68,15 +68,15 @@ queue_upload.dir_prefix_len = 2 # Ground Truth queue_mesh_save.submit_cmd = @slurm_submit_cmd 
-queue_mesh_save.mem_gb = 24 -queue_mesh_save.cpus = 4 +queue_mesh_save.mem_gb = 48 +queue_mesh_save.cpus = 8 queue_mesh_save.hours = 24 queue_mesh_save.slurm_account = %PARTITION queue_mesh_save.gpus = 0 queue_opengl.submit_cmd = @slurm_submit_cmd -queue_opengl.mem_gb = 24 -queue_opengl.cpus = 4 +queue_opengl.mem_gb = 48 +queue_opengl.cpus = 8 queue_opengl.hours = 24 queue_opengl.slurm_account = %PARTITION queue_opengl.gpus = 1 diff --git a/worldgen/tools/pipeline_configs/compute_platform/slurm_1h.gin b/worldgen/tools/pipeline_configs/compute_platform/slurm_1h.gin new file mode 100644 index 000000000..74c4c3c77 --- /dev/null +++ b/worldgen/tools/pipeline_configs/compute_platform/slurm_1h.gin @@ -0,0 +1,12 @@ +include 'tools/pipeline_configs/compute_platform/slurm.gin' + +queue_combined.hours = 1 +queue_coarse.hours = 1 +queue_fine_terrain.hours = 1 +queue_populate.hours = 1 +queue_render.hours = 1 +queue_upload.hours = 1 +queue_mesh_save.hours = 1 +queue_opengl.hours = 1 + +queue_coarse.cpus = 8 \ No newline at end of file diff --git a/worldgen/tools/pipeline_configs/blender_gt.gin b/worldgen/tools/pipeline_configs/gt_options/blender_gt.gin similarity index 100% rename from worldgen/tools/pipeline_configs/blender_gt.gin rename to worldgen/tools/pipeline_configs/gt_options/blender_gt.gin diff --git a/worldgen/tools/pipeline_configs/gt_test.gin b/worldgen/tools/pipeline_configs/gt_options/gt_test.gin similarity index 100% rename from worldgen/tools/pipeline_configs/gt_test.gin rename to worldgen/tools/pipeline_configs/gt_options/gt_test.gin diff --git a/worldgen/tools/pipeline_configs/opengl_gt.gin b/worldgen/tools/pipeline_configs/gt_options/opengl_gt.gin similarity index 100% rename from worldgen/tools/pipeline_configs/opengl_gt.gin rename to worldgen/tools/pipeline_configs/gt_options/opengl_gt.gin diff --git a/worldgen/tools/pipeline_configs/gt_options/opengl_gt_noshortrender.gin b/worldgen/tools/pipeline_configs/gt_options/opengl_gt_noshortrender.gin new file 
mode 100644 index 000000000..a3db10fa7 --- /dev/null +++ b/worldgen/tools/pipeline_configs/gt_options/opengl_gt_noshortrender.gin @@ -0,0 +1,7 @@ +include 'tools/pipeline_configs/opengl_gt.gin' # incase someone adds other settings to it + +iterate_scene_tasks.camera_dependent_tasks = [ + {'name': 'renderbackup', 'func': @renderbackup/queue_render}, # still call it "backup" since it is reusing the compute_platform's backup config. we are just skipping straight to the backup + {'name': 'savemesh', 'func': @queue_mesh_save}, + {'name': 'opengl', 'func': @queue_opengl} +] \ No newline at end of file diff --git a/worldgen/tools/pipeline_configs/opengl_gt_noshortrender.gin b/worldgen/tools/pipeline_configs/opengl_gt_noshortrender.gin new file mode 100644 index 000000000..a3db10fa7 --- /dev/null +++ b/worldgen/tools/pipeline_configs/opengl_gt_noshortrender.gin @@ -0,0 +1,7 @@ +include 'tools/pipeline_configs/opengl_gt.gin' # incase someone adds other settings to it + +iterate_scene_tasks.camera_dependent_tasks = [ + {'name': 'renderbackup', 'func': @renderbackup/queue_render}, # still call it "backup" since it is reusing the compute_platform's backup config. we are just skipping straight to the backup + {'name': 'savemesh', 'func': @queue_mesh_save}, + {'name': 'opengl', 'func': @queue_opengl} +] \ No newline at end of file diff --git a/worldgen/tools/render_video_final.sh b/worldgen/tools/render_video_final.sh deleted file mode 100644 index bbc7020b7..000000000 --- a/worldgen/tools/render_video_final.sh +++ /dev/null @@ -1,4 +0,0 @@ -HOSTFIRST=$(hostname | tr "." 
"\n" | head -n 1) -JOBNAME=$(date '+%m_%d_%H_%M').$HOSTFIRST.$1 -python -m tools.manage_datagen_jobs --blender_path ../blender/blender --output_folder outputs/$JOBNAME --num_scenes 500 \ - --pipeline_config slurm stereo_video cuda_terrain opengl_gt --wandb_mode online --cleanup big_files --warmup_sec 60000 --config trailer high_quality_terrain --upload diff --git a/worldgen/tools/results/parse_videos.py b/worldgen/tools/results/parse_videos.py index 6f1ae4841..867aab7c7 100644 --- a/worldgen/tools/results/parse_videos.py +++ b/worldgen/tools/results/parse_videos.py @@ -37,7 +37,7 @@ filters = " " if args.overlay: filters += f"-vf drawtext='text={seed_folder.absolute()}' " - cmd = f'ffmpeg -y -r {args.fps} -pattern_type glob -i {seed_folder.absolute()}/frames*/{args.image_type}*.png {filters} -pix_fmt yuv420p {output_folder}/{seed_folder.name}_{args.image_type}.mp4' + cmd = f'ffmpeg -y -r {args.fps} -pattern_type glob -i {seed_folder.absolute()}/frames*_0/{args.image_type}*.png {filters} -pix_fmt yuv420p {output_folder}/{seed_folder.name}_{args.image_type}.mp4' print(cmd.split()) subprocess.run(cmd.split()) diff --git a/worldgen/tools/summarize.py b/worldgen/tools/results/summarize.py similarity index 89% rename from worldgen/tools/summarize.py rename to worldgen/tools/results/summarize.py index 56bf24193..575c9f6fb 100644 --- a/worldgen/tools/summarize.py +++ b/worldgen/tools/results/summarize.py @@ -29,7 +29,7 @@ def parse_mask_tag_jsons(base_folder): for file_path in base_folder.rglob('MaskTag.json'): if match := re.fullmatch("fine_([0-9])_([0-9])_([0-9]{4})_([0-9])", file_path.parent.name): _, _, frame_str, _ = match.groups() - yield (frame_str, file_path) + yield (int(frame_str), file_path) for file_path in base_folder.rglob('MaskTag.json'): if match := re.fullmatch("fine.*", file_path.parent.name): yield (0, file_path) @@ -47,9 +47,11 @@ def summarize_folder(base_folder): output[data_type][suffix][rig][subcam][frame_str] = 
str(file_path.relative_to(base_folder)) max_frame = max(max_frame, int(frame_str)) + print(output.keys()) + # Rename keys - output["Camera Pose"] = output.pop("T") - output["Camera Intrinsics"] = output.pop("K") + #output["Camera Pose"] = output.pop("T") + #output["Camera Intrinsics"] = output.pop("K") mask_tag_jsons = sorted(parse_mask_tag_jsons(base_folder)) for frame in range(1, max_frame+1): @@ -115,23 +117,15 @@ def depth_to_jet(depth, scale_vmin=1.0): depth[~valid] = 1 return np.ascontiguousarray(depth[...,:3] * 255, dtype=np.uint8) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument('folder', type=Path) - parser.add_argument('--preview', action='store_true') - args = parser.parse_args() - - summary_json = summarize_folder(args.folder) +def process_scene_folder(folder, preview): + summary_json = summarize_folder(folder) folder_data = json.loads(summary_json.read_text()) missing = what_is_missing(folder_data) print("\n".join(missing)) - if not args.preview: - exit(0) + if not preview: + return depth_paths = folder_data["Depth"]['npy']["00"]["00"] flow3d_paths = folder_data["Flow3D"]['npy']["00"]["00"] @@ -142,15 +136,26 @@ def depth_to_jet(depth, scale_vmin=1.0): shape = (1280, 720) with mp.Pool() as pool: - all_flow_frames = pool.starmap(process_flow_frame, tqdm([(args.folder / path, shape) for _, path in sorted(flow3d_paths.items())])) - all_depth_frames = pool.starmap(process_depth_frame, tqdm([(args.folder / path, shape) for _, path in sorted(depth_paths.items())])) - all_occlusion_frames = pool.starmap(process_mask, tqdm([(args.folder / path, shape) for _, path in sorted(occlusion_boundary_paths.items())])) - all_flow_mask_frames = pool.starmap(process_mask, tqdm([(args.folder / path, shape) for _, path in sorted(flow_mask_paths.items())])) + all_flow_frames = pool.starmap(process_flow_frame, tqdm([(folder / path, shape) for _, path in sorted(flow3d_paths.items())])) + all_depth_frames = 
pool.starmap(process_depth_frame, tqdm([(folder / path, shape) for _, path in sorted(depth_paths.items())])) + all_occlusion_frames = pool.starmap(process_mask, tqdm([(folder / path, shape) for _, path in sorted(occlusion_boundary_paths.items())])) + all_flow_mask_frames = pool.starmap(process_mask, tqdm([(folder / path, shape) for _, path in sorted(flow_mask_paths.items())])) - previews: Path = args.folder / "previews" + previews: Path = folder / "previews" previews.mkdir(exist_ok=True) frames_to_video(previews / 'occlusion_boundaries.avi', all_occlusion_frames) frames_to_video(previews / 'flow_mask.avi', all_flow_mask_frames) depth_visualization = depth_to_jet(np.asarray(all_depth_frames)) frames_to_video(previews / 'video_depth.avi', depth_visualization) frames_to_video(previews / 'flow_video.avi', all_flow_frames) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('folder', type=Path) + parser.add_argument('--preview', action='store_true') + args = parser.parse_args() + + process_scene_folder(args.folder, preview=args.preview) + + diff --git a/worldgen/tools/scripts/render_video_1080p.sh b/worldgen/tools/scripts/render_video_1080p.sh new file mode 100644 index 000000000..5b54f3d97 --- /dev/null +++ b/worldgen/tools/scripts/render_video_1080p.sh @@ -0,0 +1,8 @@ +HOSTFIRST=$(hostname | tr "." "\n" | head -n 1) +JOBNAME=$(date '+%m_%d_%H_%M').$HOSTFIRST.$1 + +python -m tools.manage_datagen_jobs --blender_path ../blender/blender --output_folder outputs/$JOBNAME \ + --num_scenes 100 --pipeline_config $@ stereo_video cuda_terrain opengl_gt_noshortrender \ + --wandb_mode online --cleanup big_files --upload \ + --warmup_sec 40000 \ + --config high_quality_terrain diff --git a/worldgen/tools/scripts/render_video_720p.sh b/worldgen/tools/scripts/render_video_720p.sh new file mode 100644 index 000000000..a00dbb994 --- /dev/null +++ b/worldgen/tools/scripts/render_video_720p.sh @@ -0,0 +1,8 @@ +HOSTFIRST=$(hostname | tr "." 
"\n" | head -n 1) +JOBNAME=$(date '+%m_%d_%H_%M').$HOSTFIRST.$1 + +python -m tools.manage_datagen_jobs --output_folder outputs/$JOBNAME \ + --num_scenes 100 --pipeline_config $@ stereo_video cuda_terrain opengl_gt \ + --wandb_mode online --cleanup big_files --upload --warmup_sec 6000 \ + --config high_quality_terrain \ + --overrides compose_scene.generate_resolution=[1280,720] diff --git a/worldgen/tools/scripts/render_video_stereo.sh b/worldgen/tools/scripts/render_video_stereo.sh new file mode 100644 index 000000000..3e1d4edc3 --- /dev/null +++ b/worldgen/tools/scripts/render_video_stereo.sh @@ -0,0 +1,7 @@ +HOSTFIRST=$(hostname | tr "." "\n" | head -n 1) +JOBNAME=$(date '+%m_%d_%H_%M').$HOSTFIRST.$1 + +python -m tools.manage_datagen_jobs --blender_path ../blender/blender --output_folder outputs/$JOBNAME \ + --num_scenes 1000 --pipeline_config $@ stereo cuda_terrain opengl_gt \ + --wandb_mode online --cleanup big_files --upload --warmup_sec 40000 \ + --override compose_scene.generate_resolution=[1280,720] \ diff --git a/worldgen/tools/cancel_jobs.py b/worldgen/tools/util/cancel_jobs.py similarity index 100% rename from worldgen/tools/cancel_jobs.py rename to worldgen/tools/util/cancel_jobs.py diff --git a/worldgen/tools/util/show_gpu_table.py b/worldgen/tools/util/show_gpu_table.py index 07a983ced..dca63f04c 100644 --- a/worldgen/tools/util/show_gpu_table.py +++ b/worldgen/tools/util/show_gpu_table.py @@ -52,7 +52,7 @@ def nodes_with_gpus(*gpu_names): if len(gpu_names) == 0: return [] _, node_type_lookup, _ = get_gpu_nodes() - return sorted(chain.from_iterable(node_type_lookup[n] for n in gpu_names)) + return sorted(chain.from_iterable(node_type_lookup.get(n, set()) for n in gpu_names)) if __name__ == '__main__': gpu_table, node_type_lookup, shared_node_mem = get_gpu_nodes() diff --git a/worldgen/tools/util/submitit_emulator.py b/worldgen/tools/util/submitit_emulator.py index 5612735c3..d5153aa9a 100644 --- a/worldgen/tools/util/submitit_emulator.py +++ 
b/worldgen/tools/util/submitit_emulator.py @@ -69,7 +69,6 @@ def get_fake_job_id(): def job_wrapper(func, inner_args, inner_kwargs, stdout_file: Path, stderr_file: Path, cuda_devices=None): - with stdout_file.open('w') as stdout, stderr_file.open('w') as stderr: sys.stdout = stdout sys.stderr = stderr @@ -103,6 +102,7 @@ class ImmediateLocalExecutor: def __init__(self, folder: str): self.log_folder = Path(folder).resolve() + self.log_folder.mkdir(exist_ok=True) self.parameters = {} def update_parameters(self, **parameters): diff --git a/worldgen/tools/util/upload_util.py b/worldgen/tools/util/upload_util.py index ba19bef16..b23f2f0b5 100644 --- a/worldgen/tools/util/upload_util.py +++ b/worldgen/tools/util/upload_util.py @@ -17,7 +17,7 @@ import subprocess from shutil import which, copyfile -from . import smb_client +from . import smb_client, cleanup GDRIVE_NAME = None @@ -31,38 +31,16 @@ def rclone_upload_file(src_file, dst_folder): subprocess.check_output(cmd.split()) print(f"Uploaded {src_file}") +def get_commit_hash(): + git = which('git') + if git is None: + return None + cmd = f"{git} rev-parse HEAD" + return subprocess.check_output(cmd.split()).decode().strip() + # DO NOT make gin.configurable # this function gets submitted via pickle in some settings, and gin args are not preserved -def upload_folder(folder, upload_dest_folder, method, metadata=None, **kwargs): - - upload_info_path = folder / f"{folder.name}.json" - upload_info = { - 'user': os.environ['USER'], - 'node': platform.node().split('.')[0], - 'timestamp': time.time(), - 'datetime': datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), - **(metadata if metadata is not None else {}) - } - with upload_info_path.open('w') as f: - json.dump(upload_info, f, indent=4) - - with tarfile.open(folder.with_suffix('.tar.gz'), "w:gz") as tar: - tar.add(folder, os.path.sep) - assert folder.with_suffix('.tar.gz').exists() - - if method == 'rclone': - upload_func = rclone_upload_file - elif method == 'smbclient': - 
upload_func = smb_client.upload - else: - raise ValueError(f'Unrecognized {method=}') - - upload_func(folder.with_suffix('.tar.gz'), upload_dest_folder, **kwargs) - upload_func(upload_info_path, upload_dest_folder, **kwargs) - -def upload_job_folder(parent_folder, task_uniqname, dir_prefix_len=3, method='smbclient', **kwargs): - - parent_folder = Path(parent_folder) +def reorganize_before_upload(parent_folder): seed = parent_folder.name tmpdir = (parent_folder / "tmp" / seed) @@ -71,6 +49,7 @@ def upload_job_folder(parent_folder, task_uniqname, dir_prefix_len=3, method='sm frames_folders = list(sorted(parent_folder.glob("frames*"))) for idx, frames_folder in enumerate(frames_folders): + subfolder_name = f"resample_{idx}" if (idx > 0) else "original" subfolder = tmpdir / subfolder_name info_dir = subfolder / "info" @@ -95,26 +74,67 @@ def upload_job_folder(parent_folder, task_uniqname, dir_prefix_len=3, method='sm copyfile(parent_folder / "run_pipeline.sh", log_dir / "run_pipeline.sh") - version = (parent_folder / "fine" / "version.txt").read_text().splitlines()[0] - upload_dest_folder = Path('infinigen')/'renders'/version - if dir_prefix_len != 0: - upload_dest_folder = upload_dest_folder/seed[:dir_prefix_len] +# DO NOT make gin.configurable +# this function gets submitted via pickle in some settings, and gin args are not preserved +def upload_job_folder( + parent_folder, + task_uniqname, + dir_prefix_len=3, + method='smbclient', +): - metadata = { - 'n_frames_folders': len(frames_folders), - 'original_directory': str(parent_folder.resolve()) - } + parent_folder = Path(parent_folder) - upload_folder(tmpdir, upload_dest_folder, method=method, metadata=metadata, **kwargs) + if method == 'rclone': + upload_func = rclone_upload_file + elif method == 'smbclient': + upload_func = smb_client.upload + else: + raise ValueError(f'Unrecognized {method=}') - (parent_folder / "logs" / f"FINISH_{task_uniqname}").touch() + jobname = parent_folder.parent.name + seed = 
parent_folder.name + + upload_dest_folder = Path('infinigen')/'renders'/jobname + if dir_prefix_len != 0: + upload_dest_folder = upload_dest_folder/parent_folder.name[:dir_prefix_len] + + print(f'{method=} {upload_dest_folder=}') + + all_images = sorted(list(parent_folder.rglob("frames*/Image*.png"))) + if len(all_images) > 0: + thumb_path = parent_folder/f'{seed}_thumbnail.png' + copyfile(all_images, thumb_path) + upload_func(thumb_path, upload_dest_folder) -def test(): - import manage_datagen_jobs - find_gin = lambda n: os.path.join("tools", "pipeline_configs", f"{n}.gin") - configs = [find_gin(n) for n in ['andromeda', 'smb_login']] - gin.parse_config_files_and_bindings(configs, bindings=[]) - upload_folder(Path('outputs/23_01_25_allcs/a4b66f1'), 'upload', dir_prefix_len=3, method='smbclient') + try: + version = (parent_folder / "coarse" / "version.txt").read_text().splitlines()[0] + except FileNotFoundError: + version = None -if __name__ == "__main__": - test() + metadata = { + 'original_directory': str(parent_folder.resolve()), + 'user': os.environ['USER'], + 'node': platform.node().split('.')[0], + 'timestamp': time.time(), + 'datetime': datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), + 'version': version, + 'commit': get_commit_hash(), + 'n_frames': len(all_images) + } + metadata_path = parent_folder/f'{seed}_metadata.json' + with metadata_path.open('w') as f: + json.dump(metadata, f, indent=4) + print(metadata_path, metadata) + upload_func(metadata_path, upload_dest_folder) + + tar_path = parent_folder.with_suffix('.tar.gz') + print(f"Performing cleanup and tar to {tar_path}") + cleanup.cleanup(parent_folder) + with tarfile.open(tar_path, "w:gz") as tar: + tar.add(parent_folder, os.path.sep) + assert tar_path.exists() + + print(f"Uploading tarfile") + upload_func(tar_path, upload_dest_folder) + (parent_folder / "logs" / f"FINISH_{task_uniqname}").touch() diff --git a/worldgen/util/organization.py b/worldgen/util/organization.py index 5f4ff6fcc..ad57e1b38 
100644 --- a/worldgen/util/organization.py +++ b/worldgen/util/organization.py @@ -7,7 +7,6 @@ class Task: Coarse = "coarse" Populate = "populate" - Fine = "fine" FineTerrain = "fine_terrain" Render = "render" GroundTruth = "ground_truth"