From 6e17b5c798cc2f71cc35b53e6f48a9ba439038e6 Mon Sep 17 00:00:00 2001 From: Alex Raistrick Date: Tue, 1 Aug 2023 17:53:31 -0400 Subject: [PATCH] Rendering improvements (princeton-vl/infinigen_internal/#39) * Tweak render_video_final * Remove random config choosing from core.py * Create tools/pipeline_configs/base.gin, move scenetype distribution configs into it * Create noshortrender config to test on IONIC * Implement slurm niceness override, add it to render_video_final.sh * Only include camera 0 in parse_video output * Read slurm partition from ENVVAR by default * Fix config postprocess * Fix slurm envvar * Typo fixes * Use roundrobin by default * Rendering tweaks * Change trailer.gin to video.gin with 720p res * Fix niceness * Set exclude_nodes list via envvar, move niceness configs into slurm.gin * Create render_video_720p.sh, start off experimental.gin but more needs adding * Add dryrun options * Fix --override vs --overrides * Move legacy task.Fine * Retool upload func * Add slurm_1h and stereo config * Rendering & typo fixes * Update render script and slurm.gin mem amounts * Fix excluded gpus * Add queues stats to wandb, add pandas to requirements.txt * Fix num_concurrent reset 24h later * Dont keep working on scenes which have had a fatal crash * Add new timeout message to error parsing * Fix overly nested upload dirs * Add thread limit to local jobs --- docs/CHANGELOG.md | 2 +- docs/ConfiguringInfinigen.md | 17 +- requirements.txt | 2 + worldgen/config/base.gin | 5 +- worldgen/config/experimental.gin | 3 + .../config/scene_types/snowy_mountain.gin | 2 +- worldgen/config/trailer.gin | 20 - worldgen/core.py | 70 ++- worldgen/generate.py | 4 +- .../{ => dev}/generate_terrain_assets.py | 0 .../tools/{ => dev}/kernelize_surfaces.py | 0 worldgen/tools/{ => dev}/palette/.gitignore | 0 worldgen/tools/{ => dev}/palette/demo1.png | Bin worldgen/tools/{ => dev}/palette/demo2.png | Bin worldgen/tools/{ => dev}/palette/demo3.png | Bin worldgen/tools/{ => 
dev}/palette/demo4.png | Bin worldgen/tools/{ => dev}/palette/palette.py | 0 worldgen/tools/{ => dev}/palette/readme.md | 0 worldgen/tools/manage_datagen_jobs.py | 431 ++++++++++++------ worldgen/tools/pipeline_configs/base.gin | 16 + .../compute_platform/local_256GB.gin | 5 +- .../compute_platform/slurm.gin | 24 +- .../compute_platform/slurm_1h.gin | 12 + .../{ => gt_options}/blender_gt.gin | 0 .../{ => gt_options}/gt_test.gin | 0 .../{ => gt_options}/opengl_gt.gin | 0 .../gt_options/opengl_gt_noshortrender.gin | 7 + .../opengl_gt_noshortrender.gin | 7 + worldgen/tools/render_video_final.sh | 4 - worldgen/tools/results/parse_videos.py | 2 +- worldgen/tools/{ => results}/summarize.py | 45 +- worldgen/tools/scripts/render_video_1080p.sh | 8 + worldgen/tools/scripts/render_video_720p.sh | 8 + worldgen/tools/scripts/render_video_stereo.sh | 7 + worldgen/tools/{ => util}/cancel_jobs.py | 0 worldgen/tools/util/show_gpu_table.py | 2 +- worldgen/tools/util/submitit_emulator.py | 2 +- worldgen/tools/util/upload_util.py | 118 +++-- worldgen/util/organization.py | 1 - 39 files changed, 514 insertions(+), 310 deletions(-) create mode 100644 worldgen/config/experimental.gin delete mode 100644 worldgen/config/trailer.gin rename worldgen/tools/{ => dev}/generate_terrain_assets.py (100%) rename worldgen/tools/{ => dev}/kernelize_surfaces.py (100%) rename worldgen/tools/{ => dev}/palette/.gitignore (100%) rename worldgen/tools/{ => dev}/palette/demo1.png (100%) rename worldgen/tools/{ => dev}/palette/demo2.png (100%) rename worldgen/tools/{ => dev}/palette/demo3.png (100%) rename worldgen/tools/{ => dev}/palette/demo4.png (100%) rename worldgen/tools/{ => dev}/palette/palette.py (100%) rename worldgen/tools/{ => dev}/palette/readme.md (100%) create mode 100644 worldgen/tools/pipeline_configs/base.gin create mode 100644 worldgen/tools/pipeline_configs/compute_platform/slurm_1h.gin rename worldgen/tools/pipeline_configs/{ => gt_options}/blender_gt.gin (100%) rename 
worldgen/tools/pipeline_configs/{ => gt_options}/gt_test.gin (100%) rename worldgen/tools/pipeline_configs/{ => gt_options}/opengl_gt.gin (100%) create mode 100644 worldgen/tools/pipeline_configs/gt_options/opengl_gt_noshortrender.gin create mode 100644 worldgen/tools/pipeline_configs/opengl_gt_noshortrender.gin delete mode 100644 worldgen/tools/render_video_final.sh rename worldgen/tools/{ => results}/summarize.py (89%) create mode 100644 worldgen/tools/scripts/render_video_1080p.sh create mode 100644 worldgen/tools/scripts/render_video_720p.sh create mode 100644 worldgen/tools/scripts/render_video_stereo.sh rename worldgen/tools/{ => util}/cancel_jobs.py (100%) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 70b336c91..7d4193631 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,3 +1,3 @@ v1.0.0 - Beta code release
-v1.0.1 - BSD-3 license, expanded ground-truth docs, show line-credits, miscellaneous fixes +v1.0.1 - BSD-3 license, expanded ground-truth docs, show line-credits, miscellaneous fixes
v1.0.2 - New documentation, plant improvements, disk and reproducibility improvements \ No newline at end of file diff --git a/docs/ConfiguringInfinigen.md b/docs/ConfiguringInfinigen.md index d1fa2124b..be9ea583b 100644 --- a/docs/ConfiguringInfinigen.md +++ b/docs/ConfiguringInfinigen.md @@ -39,7 +39,7 @@ If you find a useful and related combination of these commandline overrides, you Our `generate.py` driver always loads [`worldgen/configs/base.gin`][../worldgen/configs/base.gin], and you can inspect / modify this file to see many common and useful gin override options. -`generate.py` also expects that one file from (configs/scene_types/)[worldgen/config/scene_types] will be loaded, and if one is not specified on the commandline it will choose one randomly according to the keys and weights in `worldgen/core.py`. These scene_type configs contain gin overrides designed to encode the semantic constraints of real natural habitats (e.g. `worldgen/scene_types/desert.gin` causes sand to appear and cacti to be more likely). +`generate.py` also expects that one file from (configs/scene_types/)[worldgen/config/scene_types] will be loaded. These scene_type configs contain gin overrides designed to encode the semantic constraints of real natural habitats (e.g. `worldgen/scene_types/desert.gin` causes sand to appear and cacti to be more likely). ### Moving beyond "Hello World" @@ -58,7 +58,7 @@ Here is a breakdown of what every commandline argument does, and ideas for how y - `--num_scenes` decides how many unique scenes the program will attempt to generate before terminating. Once you have removed `--specific_seed`, you can increase this to generate many scenes in sequence or in paralell. - `--configs desert.gin simple.gin` forces the command to generate a desert scene, and to do so with relatively low mesh detail, low render resolution, low render samples, and some asset types disabled. 
- Do `--configs snowy_mountain.gin simple.gin` to try out a different scene type (`snowy_mountain.gin` can instead be any scene_type option from `worldgen/configs/scene_types/`) - - Remove the `desert.gin` and just specify `--configs simple.gin` to use random scene types according to the weighted list in `worldgen/core.py`. + - Remove the `desert.gin` and just specify `--configs simple.gin` to use random scene types according to the weighted list in `worldgen/tools/pipeline_configs/base.gin`. - You have the option of removing `simple.gin` and specify neither of the original configs. This turns off the many detail-reduction options included in `simple.gin`, and will create scenes closer to those in our intro video, albeit at significant compute costs. Removing `simple.gin` will likely cause crashes unless using a workstation/server with large amounts of RAM and VRAM. You can find more details on optimizing scene content for performance [here](#config-overrides-for-mesh-detail-and-performance). - `--pipeline_configs local_16GB.gin monocular.gin blender_gt.gin` - `local_16GB.gin` specifies to run only a single scene at a time, and to run each task as a local python process. See [here](#configuring-available-computing-resources) for more options @@ -126,7 +126,6 @@ You will also encounter configs using what we term a "registry pattern", e.g. `w - For example, in `base_surface_registry.gin`, `surface.registry.beach` specifies `("sand", 10)` to indicate that sand has high weight to be chosen to be assigned for the beach category. - Weights are normalized by their overall sum to obtain a probability distribution. - Name strings undergo lookup in the relevant source code folders, e.g. the name "sand" in a surface registry maps to `worldgen/surfaces/templates/sand.py`. - - The random choice among scene_type configs is itself a registry, although it is hardcoded in `core.py` currently, since the choice of what configs are loaded cannot depend on a config file. 
### Config Overrides for mesh detail and performance @@ -153,7 +152,7 @@ If you find yourself bottlenecked by GPU time, you should consider the following - Reduce `base.gin`'s `full/render_image.num_samples = 8192` or `compose_scene.generate_resolution = (1920, 1080)`. This proportionally reduces rendering FLOPS, with some diminishing returns due to BVH setup time. - If your GPU(s) are _underutilized_, try the reverse of these tips. -Some scene type configs are also generally more expensive than others. `forest.gin` and `coral.gin` are very expensive due to dense detailed fauna, wheras `artic` and `snowy_mountain` are very cheap. Low-resource compute settings (<64GB) of RAM may only be able to handle a subset of our `worldgen/config/scene_type/` options, and you may wish to tune the ratios of scene_types by editing `worldgen/core.py`. +Some scene type configs are also generally more expensive than others. `forest.gin` and `coral.gin` are very expensive due to dense detailed fauna, whereas `arctic` and `snowy_mountain` are very cheap. Low-resource compute settings (<64GB of RAM) may only be able to handle a subset of our `worldgen/config/scene_type/` options, and you may wish to tune the ratios of scene_types by editing `worldgen/tools/pipeline_configs/base.gin` or otherwise overriding `sample_scene_spec.config_distribution`. 
### Other `manage_datagen_jobs.py` commandline options @@ -174,7 +173,7 @@ Most videos in the "Introducing Infinigen" launch video were made using commands ```` python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scenes 500 \ --pipeline_config slurm monocular_video cuda_terrain opengl_gt \ - --cleanup big_files --warmup_sec 60000 --config trailer high_quality_terrain + --cleanup big_files --warmup_sec 60000 --config video high_quality_terrain ```` #### Creating large-scale stereo datasets @@ -182,7 +181,7 @@ python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scen ```` python -m tools.manage_datagen_jobs --output_folder outputs/stereo_data --num_scenes 10000 \ --pipeline_config slurm stereo cuda_terrain opengl_gt \ - --cleanup big_files --warmup_sec 60000 --config trailer high_quality_terrain + --cleanup big_files --warmup_sec 60000 --config high_quality_terrain ```` #### Creating a few low-resolution images to your test changes @@ -220,7 +219,7 @@ python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scen ``` python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scenes 500 \ --pipeline_config slurm monocular_video cuda_terrain opengl_gt \ - --cleanup big_files --warmup_sec 30000 --config trailer high_quality_terrain \ + --cleanup big_files --warmup_sec 30000 --config video high_quality_terrain \ --overrides camera.camera_pose_proposal.altitude=["uniform", 20, 30] ``` @@ -230,8 +229,8 @@ python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scen ``` python -m tools.manage_datagen_jobs --output_folder outputs/my_videos --num_scenes 500 \ --pipeline_config slurm monocular_video cuda_terrain opengl_gt \ - --cleanup big_files --warmup_sec 30000 --config trailer high_quality_terrain \ + --cleanup big_files --warmup_sec 30000 --config video high_quality_terrain \ --pipeline_overrides iterate_scene_tasks.frame_range=[1,25] ``` -:bulb: This command uses 
`--pipeline_overrides` rather than `--overrides` since it is providing a gin override to the `manage_datagen_jobs.py` process, not some part main `generate.py` driver. +:bulb: This command uses `--pipeline_overrides` rather than `--overrides` since it is providing a gin override to the `manage_datagen_jobs.py` process, not some part of the main `generate.py` driver. diff --git a/requirements.txt b/requirements.txt index 156ca419b..604885870 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,5 @@ landlab==2.4.1 scikit-learn psutil pyrender +pytest +pandas diff --git a/worldgen/config/base.gin b/worldgen/config/base.gin index 8daa82582..e828603f2 100644 --- a/worldgen/config/base.gin +++ b/worldgen/config/base.gin @@ -130,9 +130,9 @@ flat/render_image.passes_to_save = [ render_image.exposure = 1 -render_image.use_dof = 'IF_TARGET_SET' +render_image.use_dof = False render_image.dof_aperture_fstop = 3 -render_image.motion_blur = True +render_image.motion_blur = False render_image.motion_blur_shutter = 0.15 compositor_postprocessing.distort = False @@ -142,7 +142,6 @@ compose_scene.generate_resolution = (1920, 1080) get_sensor_coords.H = 720 get_sensor_coords.W = 1280 - min_terrain_distance = 2 keep_cam_pose_proposal.min_terrain_distance = %min_terrain_distance SphericalMesher.r_min = %min_terrain_distance diff --git a/worldgen/config/experimental.gin b/worldgen/config/experimental.gin new file mode 100644 index 000000000..cb54b452b --- /dev/null +++ b/worldgen/config/experimental.gin @@ -0,0 +1,3 @@ +# things that are not quite fully working correctly, but you can use if you please + +render_image.motion_blur = True # not fully supported in ground truth diff --git a/worldgen/config/scene_types/snowy_mountain.gin b/worldgen/config/scene_types/snowy_mountain.gin index 8c8884150..a8b6b74a7 100644 --- a/worldgen/config/scene_types/snowy_mountain.gin +++ b/worldgen/config/scene_types/snowy_mountain.gin @@ -27,7 +27,7 @@ compose_scene.flying_creature_registry 
= [ ] surfaces.templates.mountain.shader.layered_mountain = 0 -surfaces.templates.mountain.shader.snowy = 1 +surfaces.templates.mountain.shader.snowy = 0 # TODO: re-enable once terrain flickering resolved compose_scene.boulders_chance = 1 camera.camera_pose_proposal.pitch = ("clip_gaussian", 90, 30, 90, 100) diff --git a/worldgen/config/trailer.gin b/worldgen/config/trailer.gin deleted file mode 100644 index 3e2cb26e0..000000000 --- a/worldgen/config/trailer.gin +++ /dev/null @@ -1,20 +0,0 @@ -full/render_image.passes_to_save = [ - #['diffuse_direct', 'DiffDir'], - #['diffuse_color', 'DiffCol'], - #['diffuse_indirect', 'DiffInd'], - #['glossy_direct', 'GlossDir'], - #['glossy_color', 'GlossCol'], - #['glossy_indirect', 'GlossInd'], - #['transmission_direct', 'TransDir'], - #['transmission_color', 'TransCol'], - #['transmission_indirect', 'TransInd'], - #['emit', 'Emit'], - #['environment', 'Env'], - ['ambient_occlusion', 'AO'] -] -flat/render_image.passes_to_save = [ - #['z', 'Depth'], - #['normal', 'Normal'], - #['vector', 'Vector'], - ['object_index', 'IndexOB'] -] diff --git a/worldgen/core.py b/worldgen/core.py index c0facf730..29a5d9c47 100644 --- a/worldgen/core.py +++ b/worldgen/core.py @@ -231,6 +231,7 @@ def execute_tasks( generate_resolution=(1920,1080), reset_assets=True, focal_length=None, + dryrun=False, ): if input_folder != output_folder: if reset_assets: @@ -241,6 +242,10 @@ def execute_tasks( if (not os.path.islink(output_folder/"assets")) and (not (output_folder/"assets").exists()) and input_folder is not None and (input_folder/"assets").exists(): os.symlink(input_folder/"assets", output_folder/"assets") # in this way, even coarse task can have input_folder to have pregenerated on-the-fly assets (e.g., in last run) to speed up developing + + if dryrun: + return + if Task.Coarse not in task: with Timer('Reading input blendfile'): bpy.ops.wm.open_mainfile(filepath=str(input_folder / 'scene.blend')) @@ -287,9 +292,6 @@ def execute_tasks( if 
Task.Populate in task: populate_scene(output_folder, terrain, scene_seed) - if Task.Fine in task: - raise RuntimeError(f'{task=} contains deprecated {Task.Fine=}') - if Task.FineTerrain in task: terrain.fine_terrain(output_folder) @@ -305,17 +307,15 @@ def execute_tasks( for mesh in os.listdir(input_folder): if (mesh.endswith(".glb") or mesh.endswith(".b_displacement.npy")) and not os.path.islink(output_folder / mesh): os.symlink(input_folder / mesh, output_folder / mesh) - if Task.Coarse in task or Task.Populate in task or Task.FineTerrain in task: - bpy.context.preferences.system.scrollback = 100 - bpy.context.preferences.edit.undo_steps = 100 + if Task.Coarse in task or Task.Populate in task: + with Timer(f'Writing output blendfile'): logging.info(f'Writing output blendfile to {output_folder / output_blend_name}') bpy.ops.wm.save_mainfile(filepath=str(output_folder / output_blend_name)) tag_system.save_tag(path=str(output_folder / "MaskTag.json")) with (output_folder/ "version.txt").open('w') as f: - scene_version = get_scene_tag('VERSION') - f.write(f"{scene_version}\n") + f.write(f"{VERSION}\n") with (output_folder/'polycounts.txt').open('w') as f: save_polycounts(f) @@ -361,39 +361,25 @@ def apply_scene_seed(args): np.random.seed(scene_seed) return scene_seed -def apply_gin_configs(args, scene_seed, skip_unknown=False): - - scene_types = [p.stem for p in Path('config/scene_types').iterdir()] - scene_specified = any(s in scene_types or s.startswith("figure") for s in args.gin_config) - - weights = { - "kelp_forest": 0.3, - "coral_reef": 1, - "forest": 2, - "river": 2, - "desert": 1, - "coast": 1, - "cave": 1, - "mountain": 1, - "canyon": 1, - "plain": 1, - "cliff": 1, - "arctic": 1, - "snowy_mountain": 1, - } - assert all(k in scene_types for k in weights) - - scene_types = [s for s in scene_types if s in weights] - weights = np.array([weights[k] for k in scene_types], dtype=float) - weights /= weights.sum() +@gin.configurable +def apply_gin_configs( + args, + 
scene_seed, + skip_unknown=False, + mandatory_config_dir=Path('config/scene_types'), +): - if not scene_specified: - scene_type = np.random.RandomState(scene_seed).choice(scene_types, p=weights) - logging.warning(f'Randomly selected {scene_type=}. IF THIS IS NOT INTENDED THEN YOU ARE MISSING SCENE CONFIGS') - if len(args.gin_config) > 0 and args.gin_config[0] == 'base': - args.gin_config = [scene_type] + args.gin_config[1:] - else: - args.gin_config = [scene_type] + args.gin_config + if mandatory_config_dir is not None: + assert mandatory_config_dir.exists() + scene_types = [p.stem for p in mandatory_config_dir.iterdir()] + scenetype_specified = any(s in scene_types or s.split('.')[0] in scene_types for s in args.configs) + + if not scenetype_specified: + print(scene_types) + raise ValueError( + f"Please load one or more config from {mandatory_config_dir} using --configs to avoid unexpected behavior. " + "If you are sure you want to proceed without, override `apply_gin_configs.mandatory_config_dir=None`" + ) def find_config(g): for p in Path('config').glob('**/*.gin'): @@ -403,8 +389,8 @@ def find_config(g): return p raise ValueError(f'Couldn not locate {g} or {g}.gin in anywhere config/**') - bindings = sanitize_gin_override(args.gin_param) - confs = [find_config(g) for g in ['base.gin'] + args.gin_config] + bindings = sanitize_gin_override(args.overrides) + confs = [find_config(g) for g in ['base.gin'] + args.configs] gin.parse_config_files_and_bindings(confs, bindings=bindings, skip_unknown=skip_unknown) def main( diff --git a/worldgen/generate.py b/worldgen/generate.py index bc8979af5..9f840a081 100644 --- a/worldgen/generate.py +++ b/worldgen/generate.py @@ -375,10 +375,10 @@ def main(): parser.add_argument('-s', '--seed', default=None, help="The seed used to generate the scene") parser.add_argument('-t', '--task', nargs='+', default=['coarse'], choices=['coarse', 'populate', 'fine_terrain', 'ground_truth', 'render', 'mesh_save']) - parser.add_argument('-g', 
'--gin_config', nargs='+', default=['base'], + parser.add_argument('-g', '--configs', nargs='+', default=['base'], help='Set of config files for gin (separated by spaces) ' 'e.g. --gin_config file1 file2 (exclude .gin from path)') - parser.add_argument('-p', '--gin_param', nargs='+', default=[], + parser.add_argument('-p', '--overrides', nargs='+', default=[], help='Parameter settings that override config defaults ' 'e.g. --gin_param module_1.a=2 module_2.b=3') parser.add_argument('--task_uniqname', type=str, default=None) diff --git a/worldgen/tools/generate_terrain_assets.py b/worldgen/tools/dev/generate_terrain_assets.py similarity index 100% rename from worldgen/tools/generate_terrain_assets.py rename to worldgen/tools/dev/generate_terrain_assets.py diff --git a/worldgen/tools/kernelize_surfaces.py b/worldgen/tools/dev/kernelize_surfaces.py similarity index 100% rename from worldgen/tools/kernelize_surfaces.py rename to worldgen/tools/dev/kernelize_surfaces.py diff --git a/worldgen/tools/palette/.gitignore b/worldgen/tools/dev/palette/.gitignore similarity index 100% rename from worldgen/tools/palette/.gitignore rename to worldgen/tools/dev/palette/.gitignore diff --git a/worldgen/tools/palette/demo1.png b/worldgen/tools/dev/palette/demo1.png similarity index 100% rename from worldgen/tools/palette/demo1.png rename to worldgen/tools/dev/palette/demo1.png diff --git a/worldgen/tools/palette/demo2.png b/worldgen/tools/dev/palette/demo2.png similarity index 100% rename from worldgen/tools/palette/demo2.png rename to worldgen/tools/dev/palette/demo2.png diff --git a/worldgen/tools/palette/demo3.png b/worldgen/tools/dev/palette/demo3.png similarity index 100% rename from worldgen/tools/palette/demo3.png rename to worldgen/tools/dev/palette/demo3.png diff --git a/worldgen/tools/palette/demo4.png b/worldgen/tools/dev/palette/demo4.png similarity index 100% rename from worldgen/tools/palette/demo4.png rename to worldgen/tools/dev/palette/demo4.png diff --git 
a/worldgen/tools/palette/palette.py b/worldgen/tools/dev/palette/palette.py similarity index 100% rename from worldgen/tools/palette/palette.py rename to worldgen/tools/dev/palette/palette.py diff --git a/worldgen/tools/palette/readme.md b/worldgen/tools/dev/palette/readme.md similarity index 100% rename from worldgen/tools/palette/readme.md rename to worldgen/tools/dev/palette/readme.md diff --git a/worldgen/tools/manage_datagen_jobs.py b/worldgen/tools/manage_datagen_jobs.py index a15c9e535..eda05fd3b 100644 --- a/worldgen/tools/manage_datagen_jobs.py +++ b/worldgen/tools/manage_datagen_jobs.py @@ -22,6 +22,7 @@ from uuid import uuid4 from enum import Enum from copy import copy +from ast import literal_eval from functools import partial, cache from collections import defaultdict @@ -29,6 +30,7 @@ from pathlib import Path from shutil import which, rmtree, copyfile, copytree +import pandas as pd from tqdm import tqdm import numpy as np @@ -43,6 +45,9 @@ from tools.util import upload_util from tools.util.upload_util import upload_job_folder # for pickle not to freak out +PARTITION_ENVVAR = 'INFINIGEN_SLURMPARTITION' # used only if enabled in config +EXCLUDE_FILE_ENVVAR = 'INFINIGEN_SLURM_EXCLUDENODES_LIST' + class JobState: NotQueued = "notqueued" Queued = "queued" @@ -55,6 +60,7 @@ class SceneState: Done = "done" Crashed = "crashed" +JOB_OBJ_SUCCEEDED = 'MARK_AS_SUCCEEDED' CONCLUDED_STATES = {JobState.Succeeded, JobState.Failed} # Will throw exception if the scene was not found. 
Sometimes this happens if the scene was queued very very recently @@ -73,6 +79,18 @@ def seff(job_obj, retry_on_error=True): raise time.sleep(1) +def node_from_slurm_jobid(scene_id): + + if not which('sacct'): + return None + + try: + node_of_scene, *rest = subprocess.check_output(f"{which('sacct')} -j {scene_id} --format Node --noheader".split()).decode().split() + return node_of_scene + except Exception as e: + logging.warning(f'sacct threw {e}') + return None + def get_scene_state(scene_dict, taskname, scene_folder): if not scene_dict.get(f'{taskname}_submitted', False): @@ -84,7 +102,7 @@ def get_scene_state(scene_dict, taskname, scene_folder): # for when both local and slurm scenes are being mixed if isinstance(job_obj, str): - assert job_obj == 'MARK_AS_SUCCEEDED' + assert job_obj == JOB_OBJ_SUCCEEDED return JobState.Succeeded elif isinstance(job_obj, LocalJob): res = job_obj.status() @@ -106,15 +124,31 @@ def seed_generator(): return hex(seed_int).removeprefix('0x') @gin.configurable -def get_cmd(seed, task, configs, taskname, output_folder, driver_script='generate.py', input_folder=None, niceness=None): +def get_cmd( + seed, + task, + configs, + taskname, + output_folder, + blender_thread_limit=None, + driver_script='generate.py', + input_folder=None, + process_niceness=None, +): if isinstance(task, list): task = " ".join(task) cmd = '' - if niceness is not None: - cmd += f'nice -n {niceness} ' - cmd += f'{BLENDER_PATH} --background -y -noaudio --python {driver_script} -- ' + if process_niceness is not None: + cmd += f'nice -n {process_niceness} ' + cmd += f'{BLENDER_PATH} --background -y -noaudio --python {driver_script} ' + + if blender_thread_limit is not None: + cmd += f'--threads {blender_thread_limit} ' + + cmd += '-- ' + if input_folder is not None: cmd += '--input_folder ' + str(input_folder) + ' ' if output_folder is not None: @@ -128,6 +162,8 @@ def get_cmd(seed, task, configs, taskname, output_folder, driver_script='generat @gin.configurable def 
get_slurm_banned_nodes(config_path=None): + if config_path == f'ENVVAR_{EXCLUDE_FILE_ENVVAR}': + config_path = os.environ.get(EXCLUDE_FILE_ENVVAR) if config_path is None: return [] with Path(config_path).open('r') as f: @@ -149,7 +185,19 @@ def get_suffix(indices): return suffix @gin.configurable -def slurm_submit_cmd(cmd, folder, name, mem_gb=None, cpus=None, gpus=0, hours=1, slurm_account=None, slurm_exclude: list = None, **_): +def slurm_submit_cmd( + cmd, + folder, + name, + mem_gb=None, + cpus=None, + gpus=0, + hours=1, + slurm_account=None, + slurm_exclude: list = None, + slurm_niceness=None, + **_ +): executor = submitit.AutoExecutor(folder=(folder / "logs")) executor.update_parameters( @@ -165,8 +213,21 @@ def slurm_submit_cmd(cmd, folder, name, mem_gb=None, cpus=None, gpus=0, hours=1, if gpus > 0: executor.update_parameters(gpus_per_node=gpus) if slurm_account is not None: + + if slurm_account == f'ENVVAR_{PARTITION_ENVVAR}': + slurm_account = os.environ.get(PARTITION_ENVVAR) + if slurm_account is None: + logging.warning(f'{PARTITION_ENVVAR=} was not set, using no slurm account') + executor.update_parameters(slurm_account=slurm_account) + slurm_additional_params = {} + + if slurm_niceness is not None: + slurm_additional_params['nice'] = slurm_niceness + + executor.update_parameters(slurm_additional_parameters=slurm_additional_params) + while True: try: if callable(cmd[0]): @@ -328,15 +389,12 @@ def queue_combined( seed, configs, taskname=None, - mem_gb=None, exclude_gpus=[], - cpus=None, gpus=0, - hours=None, - slurm_account=None, overrides=[], include_coarse=True, input_indices=None, output_indices=None, + **kwargs ): input_suffix = get_suffix(input_indices) @@ -361,14 +419,11 @@ def queue_combined( f.write(f"{' '.join(' '.join(cmd).split())}\n\n") res = submit_cmd(cmd, - mem_gb=mem_gb, folder=folder, name=name, - cpus=cpus, gpus=gpus, - hours=hours, slurm_exclude=nodes_with_gpus(*exclude_gpus) + get_slurm_banned_nodes(), - slurm_account=slurm_account, + 
**kwargs ) return res, output_folder @@ -426,10 +481,9 @@ def queue_mesh_save( ): if (output_indices['subcam'] > 0) and reuse_subcams: - return "MARK_AS_SUCCEEDED", None + return JOB_OBJ_SUCCEEDED, None input_suffix = get_suffix(input_indices) - output_suffix = get_suffix(output_indices) output_folder = Path(f'{folder}/savemesh{output_suffix}') @@ -470,13 +524,12 @@ def queue_opengl( ): if (output_indices['subcam'] > 0) and reuse_subcams: - return "MARK_AS_SUCCEEDED", None + return JOB_OBJ_SUCCEEDED, None output_suffix = get_suffix(output_indices) tmp_script = Path(folder) / "tmp" / f"opengl_{uuid4().hex}.sh" tmp_script.parent.mkdir(exist_ok=True) - print(f"Creating {tmp_script}") process_mesh_path = Path("../process_mesh/build/process_mesh").resolve() input_folder = Path(folder)/f'savemesh{output_suffix}' # OUTPUT SUFFIX IS CORRECT HERE. I know its weird. But input suffix really means 'prev tier of the pipeline @@ -508,7 +561,8 @@ def queue_opengl( with (folder / "run_pipeline.sh").open('a') as f: f.write(f"{' '.join(' '.join(cmd).split())}\n\n") - res = submit_cmd(cmd, + res = submit_cmd( + cmd, folder=folder, name=name, slurm_exclude=nodes_with_gpus(*exclude_gpus) + get_slurm_banned_nodes(), @@ -516,54 +570,99 @@ def queue_opengl( ) return res, output_folder -@gin.configurable -def init_db(args, inorder_seeds=False, enumerate_scenetypes=[None]): +def init_db_from_existing(output_folder: Path): - n_scenes = args.num_scenes + # TODO in future: directly use existing_db (with some cleanup / checking). 
- scenes = [] + db_path = output_folder/'scenes_db.csv' + if not db_path.exists(): + raise ValueError(f'Recieved --use_existing but {db_path=} did not exist') + existing_db = pd.read_csv(db_path, converters={"configs": literal_eval}) - if args.use_existing: - for seed_folder in args.output_folder.iterdir(): - - if not seed_folder.is_dir(): - continue - if not (seed_folder/'logs').exists(): - logging.warning(f'Skipping {seed_folder=} due to missing "logs" subdirectory') - continue + def init_scene(seed_folder): + if not seed_folder.is_dir(): + return None + if not (seed_folder/'logs').exists(): + logging.warning(f'Skipping {seed_folder=} due to missing "logs" subdirectory') + return None + + configs = existing_db.loc[existing_db["seed"] == seed_folder.name, "configs"].iloc[0] + + scene_dict = { + 'seed': seed_folder.name, + 'all_done': SceneState.NotDone, + 'configs': list(configs) + } - n_scenes -= 1 + finish_key = 'FINISH_' + for finish_file_name in (seed_folder/'logs').glob(finish_key + '*'): + taskname = os.path.basename(finish_file_name)[len(finish_key):] + logging.info(f'Marking {seed_folder.name=} {taskname=} as completed') + scene_dict[f'{taskname}_submitted'] = True + scene_dict[f'{taskname}_job_obj'] = JOB_OBJ_SUCCEEDED - scene_dict = {'seed': seed_folder.name, 'all_done': SceneState.NotDone} + return scene_dict - finish_key = 'FINISH_' - for finish_file_name in (seed_folder/'logs').glob(finish_key + '*'): - taskname = os.path.basename(finish_file_name)[len(finish_key):] - print(f'Marking {seed_folder.name=} {taskname=} as completed') - scene_dict[f'{taskname}_submitted'] = True - scene_dict[f'{taskname}_job_obj'] = 'MARK_AS_SUCCEEDED' + return [init_scene(seed_folder) for seed_folder in output_folder.iterdir()] - scenes.append(scene_dict) - elif args.specific_seed is not None and len(args.specific_seed): - return [{"seed": s, "all_done": SceneState.NotDone} for s in args.specific_seed] +@gin.configurable +def sample_scene_spec(i, seed_range=None, 
config_distribution=None, config_sample_mode='random'): + + if seed_range is None: + seed = seed_generator() + else: + start, end = seed_range + if i > end - start: + return None + seed = hex(start + i).removeprefix('0x') + + if config_distribution is None: + configs = [] + elif config_sample_mode == 'random': + configs_options, weights = zip(*config_distribution) # list of rows to list per column + ps = np.array(weights) / sum(weights) + configs = np.random.choice(configs_options, p=ps) + elif config_sample_mode == 'roundrobin': + configs_options, weights = zip(*config_distribution) # list of rows to list per column + if not all(isinstance(w, int) for w in weights): + raise ValueError(f'{config_sample_mode=} expects integer scene counts as weights but got {weights=} with non-integer values') + idx = np.argmin(i % sum(weights) + 1 > np.cumsum(weights)) + configs = configs_options[idx] + else: + raise ValueError(f'Unrecognized {config_sample_mode=}') - if n_scenes > 0: - for scenetype in enumerate_scenetypes: - for i in range(n_scenes//len(enumerate_scenetypes)): - seed = i if inorder_seeds else seed_generator() - configs = [] - if scenetype is not None: - configs.append(scenetype) - seed = f'{scenetype}_{i}' - scene = {"all_done": SceneState.NotDone, "seed": seed, 'scene_configs': configs} - print(f'Added scene {seed}') - scenes.append(scene) + if isinstance(configs, str) and " " in configs: + configs = configs.split(" ") + if not isinstance(configs, list): + configs = [configs] + + return { + "all_done": SceneState.NotDone, + "seed": seed, + 'configs': configs + } + +@gin.configurable +def init_db(args): + + if args.use_existing: + scenes = init_db_from_existing(args.output_folder) + elif args.specific_seed is not None: + scenes = [{"seed": s, "all_done": SceneState.NotDone} for s in args.specific_seed] + else: + scenes = [sample_scene_spec(i) for i in range(args.num_scenes)] + + scenes = [s for s in scenes if s is not None] + + if len(scenes) < args.num_scenes: + 
logging.warning(f'Initialized only {len(scenes)=} despite {args.num_scenes=}. Likely due to --use_existing, --specific_seed or seed_range.') + return scenes def update_symlink(scene_folder, scenes): for new_name, scene in scenes: - if scene == 'MARK_AS_SUCCEEDED': + if scene == JOB_OBJ_SUCCEEDED: continue elif isinstance(scene, str): raise ValueError(f'Failed due to {scene=}') @@ -600,16 +699,27 @@ def make_html_page(output_path, scenes, frame, camera_pair_id, **kwargs): with output_path.open('a') as f: f.write(html) -def run_task(queue_func, scene_folder, scene_dict, taskname): - +@gin.configurable +def run_task( + queue_func, + scene_folder, + scene_dict, + taskname, + dryrun=False +): + assert scene_folder.parent.exists(), scene_folder scene_folder.mkdir(exist_ok=True) stage_scene_name = f"{scene_folder.parent.stem}_{scene_folder.stem}_{taskname}" assert not scene_dict.get(f'{taskname}_submitted', False) + if dryrun: + scene_dict[f'{taskname}_job_obj'] = JOB_OBJ_SUCCEEDED + scene_dict[f'{taskname}_submitted'] = 1 + return + seed = scene_dict['seed'] - logging.info(f"{seed} - Submitting {taskname} scene") job_obj, output_folder = queue_func( folder=scene_folder, name=stage_scene_name, @@ -706,15 +816,21 @@ def iterate_scene_tasks( scene_folder = args.output_folder/seed get_task_state = partial(get_scene_state, scene_dict=scene_dict, scene_folder=scene_folder) - global_overrides = [f'execute_tasks.frame_range={repr(list(frame_range))}', f'execute_tasks.camera_id=[0, 0]'] - global_configs = args.configs + scene_dict.get('scene_configs', []) - global_iter = iterate_sequential_tasks(global_tasks, get_task_state, - overrides=args.override+global_overrides, configs=global_configs) + global_overrides = [ + f'execute_tasks.frame_range={repr(list(frame_range))}', + f'execute_tasks.camera_id=[0, 0]' + ] + global_configs = scene_dict.get('configs', []) + args.configs + global_iter = iterate_sequential_tasks( + global_tasks, + get_task_state, + 
overrides=args.overrides+global_overrides, + configs=global_configs + ) for state, *rest in global_iter: yield state, *rest if not state == JobState.Succeeded: - logging.debug(f'{seed=} waiting on global') return view_range = render_frame_range if render_frame_range is not None else frame_range @@ -735,7 +851,7 @@ def iterate_scene_tasks( view_idxs = dict(cam_rig=cam_rig, frame=view_frame) view_tasks_iter = iterate_sequential_tasks( view_dependent_tasks, get_task_state, - overrides=args.override+view_overrides, + overrides=args.overrides+view_overrides, configs=global_configs, output_indices=view_idxs ) for state, *rest in view_tasks_iter: @@ -743,7 +859,6 @@ def iterate_scene_tasks( if state not in CONCLUDED_STATES: if viewdep_paralell: running_views += 1 - logging.debug(f'{seed=} {cam_rig,view_frame=} waiting on viewdep') continue else: return @@ -753,6 +868,7 @@ def iterate_scene_tasks( running_blocks = 0 for subcam, resample_idx in itertools.product(subcams, resamples): for cam_frame in range(view_frame_range[0], view_frame_range[1] + 1, cam_block_size): + cam_frame_range = [cam_frame, min(view_frame_range[1], cam_frame + cam_block_size - 1)] # blender frame_end is INCLUSIVE cam_overrides = [ f'execute_tasks.frame_range=[{cam_frame_range[0]},{cam_frame_range[1]}]', @@ -761,20 +877,27 @@ def iterate_scene_tasks( ] camdep_indices = dict( - cam_rig=cam_rig, frame=cam_frame, subcam=subcam, resample=resample_idx, - view_first_frame=view_frame_range[0], last_view_frame=view_frame_range[1], last_cam_frame=cam_frame_range[1] # this line explicitly used by most jobs + cam_rig=cam_rig, + frame=cam_frame, + subcam=subcam, + resample=resample_idx, + view_first_frame=view_frame_range[0], + last_view_frame=view_frame_range[1], + last_cam_frame=cam_frame_range[1] # this line explicitly used by most jobs ) camera_dep_iter = iterate_sequential_tasks( - camera_dependent_tasks, get_task_state, - overrides=args.override+cam_overrides, configs=global_configs, + 
camera_dependent_tasks, + get_task_state, + overrides=args.overrides+cam_overrides, + configs=global_configs, input_indices=view_idxs if len(view_dependent_tasks) else None, - output_indices=camdep_indices) + output_indices=camdep_indices + ) for state, *rest in camera_dep_iter: yield state, *rest if state not in CONCLUDED_STATES: if camdep_paralell: running_blocks += 1 - logging.debug(f'{seed=} {cam_rig,cam_frame=} waiting on viewdep') continue else: return @@ -792,7 +915,6 @@ def iterate_scene_tasks( path = scene_dict[f'{taskname}_output_folder'] print(f'Cleaning {path} for {taskname}') if path == scene_folder: - print(f'Skipping {path}') continue if path is not None and path.exists(): cleanup(path) @@ -829,7 +951,7 @@ def infer_crash_reason(stdout_file, stderr_file: Path): if "System is out of GPU memory" in error_log: return "Out of GPU memory" - elif "this scene is timed-out" in error_log: + elif "this scene is timed-out" in error_log or 'DUE TO TIME LIMIT' in error_log: return "Timed out" elif "" in error_log: return "SIGKILL: 9 (out-of-memory, probably)" @@ -846,8 +968,14 @@ def infer_crash_reason(stdout_file, stderr_file: Path): output_text = f"{stdout_file.read_text()}\n{stderr_file.read_text()}\n" matches = re.findall("(Error:[^\n]+)\n", output_text) + ignore_errors = [ + 'Error: Not freed memory blocks', + ] + + matches = [m for m in matches if not any(w in m for w in ignore_errors)] + if len(matches): - return ','.join([m.strip() for m in matches if ('Error: Not freed memory blocks' not in m)]) + return ','.join(matches) else: return f"Could not summarize cause, check {stderr_file}" @@ -855,14 +983,10 @@ def record_crashed_seed(crashed_seed, crash_stage, f, fatal=True): time_str = datetime.now().strftime("%m/%d %I:%M%p") stdout_file = args.output_folder / crashed_seed / "logs" / f"{crash_stage}.out" stderr_file = args.output_folder / crashed_seed / "logs" / f"{crash_stage}.err" + scene_id, *_ = stderr_file.resolve().stem.split('_') - node_of_scene = "" 
- if which('sacct'): - try: - node_of_scene, *rest = subprocess.check_output(f"{which('sacct')} -j {scene_id} --format Node --noheader".split()).decode().split() - except Exception as e: - logging.warning(f'sacct threw {e}') - return + node_of_scene = node_from_slurm_jobid(scene_id) + reason = infer_crash_reason(stdout_file, stderr_file) text = f"{crashed_seed} {crash_stage} {scene_id} {node_of_scene} {reason} {fatal=} {time_str}\n" print('Crashed: ' + text) @@ -884,27 +1008,32 @@ def stats_summary(stats): stats = {k: v for k, v in stats.items() if not k.startswith(JobState.NotQueued)} lemmatized = set(l.split('_')[0] for l in stats.keys()) stats = {l: sum(v for k, v in stats.items() if k.startswith(l)) for l in lemmatized} - for p in set(k.split('/')[0] for k in stats.keys()): - stats[f'{p}/total'] = sum(v for k, v in stats.items() if k.startswith(p)) - return stats - -@gin.configurable -def manage_datagen_jobs(all_scenes, elapsed, num_concurrent, sleep_threshold=0.95): + + uniq_keys = set(k.split('/')[0] for k in stats.keys()) + totals = {p: sum(v for k, v in stats.items() if k.startswith(p)) for p in uniq_keys} - if LocalScheduleHandler._inst is not None: - LocalScheduleHandler.instance().poll() + for k, v in totals.items(): + stats[f'{k}/total'] = v + + return stats, totals - curr_concurrent_max = math.floor(1 + num_concurrent * elapsed / args.warmup_sec) if elapsed < args.warmup_sec else num_concurrent +def monitor_existing_jobs(all_scenes): - # Check results / current state of scenes we have already launched stats = defaultdict(int) + for scene in all_scenes: + scene['num_running'], scene['num_done'] = 0, 0 any_fatal = False for state, taskname, _, fatal in iterate_scene_tasks(scene, args, monitor_all=True): + + if state == JobState.NotQueued: + continue + stats[f'{state}/{taskname}'] += 1 scene['num_done'] += state in CONCLUDED_STATES scene['num_running'] += state not in CONCLUDED_STATES + if state == JobState.Failed: if not 
scene.get(f'{taskname}_crash_recorded', False): scene[f'{taskname}_crash_recorded'] = True @@ -913,43 +1042,70 @@ def manage_datagen_jobs(all_scenes, elapsed, num_concurrent, sleep_threshold=0.9 if fatal: any_fatal = True + if any_fatal: + scene['any_fatal_crash'] = True + if scene['num_running'] == 0 and any_fatal and scene['all_done'] == SceneState.NotDone: scene['all_done'] = SceneState.Crashed with (args.output_folder / "crash_summaries.txt").open('a') as f: check_and_perform_cleanup(args, scene['seed'], crashed=True) - # Report stats, with sums by prefix, and extra info - stats = stats_summary(stats) + return stats + +def jobs_to_launch_next(all_scenes, greedy=True): + scenes = [j for j in all_scenes if (j["all_done"] == SceneState.NotDone)] + if greedy: + scenes = sorted(scenes, key=lambda s: s['num_running'] + s['num_done'], reverse=True) + for scene in scenes: + if scene.get('any_fatal_crash', False): + continue + for state, taskname, queue_func, _ in iterate_scene_tasks(scene, args, monitor_all=False): + if state != JobState.NotQueued: + continue + yield scene, taskname, queue_func + +@gin.configurable +def manage_datagen_jobs(all_scenes, elapsed, num_concurrent, disk_sleep_threshold=0.95): + + if LocalScheduleHandler._inst is not None: + LocalScheduleHandler.instance().poll() + + warmup_pct = min(elapsed / args.warmup_sec, 1) if args.warmup_sec > 0 else 1 + curr_concurrent_max = math.ceil(warmup_pct * num_concurrent) + + # Check results / current state of scenes we have already launched + stats = monitor_existing_jobs(all_scenes) + stats, totals = stats_summary(stats) + + n_in_flight = totals.get(JobState.Running, 0) + totals.get(JobState.Queued, 0) + if n_in_flight > curr_concurrent_max: + raise ValueError(f'manage_datagen_jobs observed {n_in_flight=}, which exceeds allowed {curr_concurrent_max=}') + n_to_launch = max(curr_concurrent_max - n_in_flight, 0) + + pd.DataFrame.from_records(all_scenes).to_csv(args.output_folder/'scenes_db.csv') + + 
stats['n_in_flight'] = n_in_flight + stats['n_launching'] = n_to_launch stats['disk_usage'] = get_disk_usage(args.output_folder) stats['concurrent_max'] = curr_concurrent_max wandb.log(stats) + print("=" * 60) for k,v in sorted(stats.items()): print(f"{k.ljust(30)} : {v}") print("-" * 60) # Dont launch new scenes if disk is getting full - if stats['disk_usage'] > sleep_threshold: + if stats['disk_usage'] > disk_sleep_threshold: print(f"{args.output_folder} is too full ({get_disk_usage(args.output_folder)}%). Sleeping.") wandb.alert(title='Disk full', text=f'Sleeping due to full disk at {args.output_folder=}', wait_duration=3*60*60) time.sleep(60) return - # Launch new scenes to bring the current load back up to `curr_concurrent_max` - scenes = [j for j in all_scenes if (j["all_done"] == SceneState.NotDone)] - scenes = sorted(scenes, key=lambda s: s['num_running'] + s['num_done'], reverse=True) # greedily try to finish nearly-done videos asap - to_be_launched = curr_concurrent_max - stats.get(f'{JobState.Running}/all', 0) - stats.get(f'{JobState.Queued}/all', 0) - if to_be_launched <= 0: - return - for scene in scenes[:curr_concurrent_max]: - for state, taskname, queue_func, _ in iterate_scene_tasks(scene, args, monitor_all=False): - if state != JobState.NotQueued: - continue - to_be_launched -= 1 - run_task(queue_func, args.output_folder / str(scene['seed']), scene, taskname) - if to_be_launched == 0: - break - if to_be_launched == 0: - break + # Launch to get back to intended n=`curr_concurrent_max` that should be in flight + for spec in itertools.islice(jobs_to_launch_next(all_scenes), n_to_launch): + scene, taskname, queue_func = spec + logging.info(f"{scene['seed']} - running {taskname}") + run_task(queue_func, args.output_folder / str(scene['seed']), scene, taskname) @gin.configurable def main(args, shuffle=True, wandb_project='render_beta'): @@ -963,7 +1119,7 @@ def main(args, shuffle=True, wandb_project='render_beta'): wandb.init(name=scene_name, 
config=vars(args), project=wandb_project, mode=args.wandb_mode) logging.basicConfig( - filename=str(args.output_folder / "jobs.log"), + #filename=str(args.output_folder / "jobs.log"), level=args.loglevel, format='[%(asctime)s]: %(message)s', ) @@ -978,20 +1134,23 @@ def main(args, shuffle=True, wandb_project='render_beta'): while any(j['all_done'] == SceneState.NotDone for j in all_scenes): now = datetime.now() print(f'{args.output_folder} {start_time.strftime("%m/%d %I:%M%p")} -> {now.strftime("%m/%d %I:%M%p")}') - logging.info('=' * 80) - manage_datagen_jobs(scenes, elapsed=(now-start_time).seconds) - logging.info("-" * 80) + manage_datagen_jobs(scenes, elapsed=(now-start_time).total_seconds()) time.sleep(4) -def test_upload(args): + +def set_blender_path_global(args): - from_folder = args.output_folder/f'test_upload_{args.output_folder.name}' - from_folder.mkdir(parents=True, exist_ok=True) - (from_folder/'test_file.txt').touch() + global BLENDER_PATH + if args.blender_path is None: + if 'BLENDER' in os.environ: + BLENDER_PATH = os.environ['BLENDER'] + else: + BLENDER_PATH = '../blender/blender' # assuming we run from infinigen/worldgen + else: + BLENDER_PATH = args.blender_path + if not os.path.exists(BLENDER_PATH): + raise ValueError(f'Couldnt not find {BLENDER_PATH=}, make sure --blender_path or $BLENDER is specified') - upload_util.upload_folder(from_folder, Path('infinigen/test_upload/')) - rmtree(from_folder) - if __name__ == "__main__": assert Path('.').resolve().parts[-1] == 'worldgen' @@ -1063,7 +1222,7 @@ def test_upload(args): ) parser.add_argument( '-p', - '--override', + '--overrides', nargs='+', type=str, default=[], @@ -1089,32 +1248,22 @@ def test_upload(args): default=[], help="List of gin overrides to configure this execution", ) + parser.add_argument('--overwrite', action='store_true') parser.add_argument('-d', '--debug', action="store_const", dest="loglevel", const=logging.DEBUG, default=logging.INFO) parser.add_argument( '-v', '--verbose', 
action="store_const", dest="loglevel", const=logging.INFO) args = parser.parse_args() - envvar = 'INFINIGEN_ASSET_FOLDER' - if not args.upload and args.cleanup == 'all': - raise ValueError(f'Pipeline is configured with {args.cleanup=} yet {args.upload=} --- no output would be preserved') - - global BLENDER_PATH - if args.blender_path is None: - if 'BLENDER' in os.environ: - BLENDER_PATH = os.environ['BLENDER'] - else: - BLENDER_PATH = '../blender/blender' # assuming we run from infinigen/worldgen - else: - BLENDER_PATH = args.blender_path - if not os.path.exists(BLENDER_PATH): - raise ValueError(f'Couldnt not find {BLENDER_PATH=}, make sure --blender_path or $BLENDER is specified') - + raise ValueError(f'Pipeline is configured with {args.cleanup=} yet {args.upload=}! No output would be preserved!') + if args.upload and args.cleanup == 'none': + raise ValueError(f'--upload currently applies --cleanup big_files') assert args.specific_seed is None or args.num_scenes == 1 + set_blender_path_global(args) - if args.output_folder.exists() and not args.use_existing: - raise FileExistsError(f'--output_folder {args.output_folder} already exists! Quitting to avoid overwrite. Please delete it, or specify a new --output_folder') - - args.output_folder.mkdir(parents=True, exist_ok=args.use_existing) + overwrite_ok = args.use_existing or args.overwrite + if args.output_folder.exists() and not overwrite_ok: + raise FileExistsError(f'--output_folder {args.output_folder} already exists! 
Please delete it, specify a different --output_folder, or use --overwrite') + args.output_folder.mkdir(parents=True, exist_ok=overwrite_ok) if args.meta_seed is not None: random.seed(args.meta_seed) @@ -1127,7 +1276,7 @@ def find_config(g): if p.parts[-1] == f'{g}.gin': return p raise ValueError(f'Couldn not locate {g} or {g}.gin in anywhere pipeline_configs/**') - configs = [find_config(n) for n in args.pipeline_configs] + configs = [find_config(n) for n in ['base.gin'] + args.pipeline_configs] for c in configs: assert os.path.exists(c), c bindings = args.pipeline_overrides diff --git a/worldgen/tools/pipeline_configs/base.gin b/worldgen/tools/pipeline_configs/base.gin new file mode 100644 index 000000000..7eb526cac --- /dev/null +++ b/worldgen/tools/pipeline_configs/base.gin @@ -0,0 +1,16 @@ +sample_scene_spec.config_distribution = [ + ("forest", 4), + ("river", 4), + ("desert", 3), + ("coast", 3), + ("kelp_forest", 2), + ("coral_reef", 2), + ("cave", 2), + ("mountain", 2), + ("canyon", 2), + ("plain", 2), + ("cliff", 2), + ("arctic", 1), + ("snowy_mountain", 1), +] +sample_scene_spec.config_sample_mode = 'random' diff --git a/worldgen/tools/pipeline_configs/compute_platform/local_256GB.gin b/worldgen/tools/pipeline_configs/compute_platform/local_256GB.gin index 8d24ebf76..d7998ee33 100644 --- a/worldgen/tools/pipeline_configs/compute_platform/local_256GB.gin +++ b/worldgen/tools/pipeline_configs/compute_platform/local_256GB.gin @@ -1,12 +1,13 @@ manage_datagen_jobs.num_concurrent = 16 -# Job updater -get_cmd.niceness = 20 # let UI processes etc take precedence, to make the smooth and UI usable +get_cmd.process_niceness = 20 # let UI processes etc take precedence, to make the smooth and UI usable local_submit_cmd.use_scheduler = True LocalScheduleHandler.jobs_per_gpu = 1 +get_cmd.blender_thread_limit = 8 + # All will run locally, LocalScheduleHandler doesnt actually enforce cpu/ram constraints currently queue_coarse.submit_cmd = @local_submit_cmd 
queue_fine_terrain.submit_cmd = @local_submit_cmd diff --git a/worldgen/tools/pipeline_configs/compute_platform/slurm.gin b/worldgen/tools/pipeline_configs/compute_platform/slurm.gin index cd9686cb5..ce4a0481b 100644 --- a/worldgen/tools/pipeline_configs/compute_platform/slurm.gin +++ b/worldgen/tools/pipeline_configs/compute_platform/slurm.gin @@ -1,8 +1,9 @@ -PARTITION = None +PARTITION = 'ENVVAR_INFINIGEN_SLURMPARTITION' # change to partitionname string, or None -manage_datagen_jobs.num_concurrent = 800 -get_slurm_banned_nodes.config_path = None # add a white-space separated txt file path here +manage_datagen_jobs.num_concurrent = 200 +slurm_submit_cmd.slurm_niceness=10000 +get_slurm_banned_nodes.config_path = 'ENVVAR_INFINIGEN_SLURM_EXCLUDENODES_LIST' # Combined (only used when `stereo_combined.gin` or similar is included) queue_combined.mem_gb = 12 @@ -40,7 +41,6 @@ queue_populate.exclude_gpus = ['a6000', 'rtx_3090'] queue_render.submit_cmd = @slurm_submit_cmd queue_render.hours = 48 -rendershort/queue_render.exclude_gpus = [] # no point requesting less than 48GB RAM, 8CPUs, due to ratios of RAM:GPUs on pvl rendershort/queue_render.mem_gb = 48 rendershort/queue_render.cpus = 8 @@ -48,13 +48,13 @@ rendershort/queue_render.slurm_account = %PARTITION rendershort/queue_render.gpus = 1 rendershort/queue_render.render_type = "full" -queue_render.exclude_gpus = ['gtx_1080', 'k80'] +rendershort/queue_render.exclude_gpus = ['gtx_1080', 'k80'] renderbackup/queue_render.exclude_gpus = ['gtx_1080', 'k80', 'rtx_2080'] -renderbackup/queue_render.mem_gb = 64 -renderbackup/queue_render.cpus = 12 +renderbackup/queue_render.mem_gb = 96 +renderbackup/queue_render.cpus = 16 renderbackup/queue_render.slurm_account = %PARTITION -renderbackup/queue_render.gpus = 1 +renderbackup/queue_render.gpus = 2 renderbackup/queue_render.render_type = "full" # Upload @@ -68,15 +68,15 @@ queue_upload.dir_prefix_len = 2 # Ground Truth queue_mesh_save.submit_cmd = @slurm_submit_cmd 
-queue_mesh_save.mem_gb = 24 -queue_mesh_save.cpus = 4 +queue_mesh_save.mem_gb = 48 +queue_mesh_save.cpus = 8 queue_mesh_save.hours = 24 queue_mesh_save.slurm_account = %PARTITION queue_mesh_save.gpus = 0 queue_opengl.submit_cmd = @slurm_submit_cmd -queue_opengl.mem_gb = 24 -queue_opengl.cpus = 4 +queue_opengl.mem_gb = 48 +queue_opengl.cpus = 8 queue_opengl.hours = 24 queue_opengl.slurm_account = %PARTITION queue_opengl.gpus = 1 diff --git a/worldgen/tools/pipeline_configs/compute_platform/slurm_1h.gin b/worldgen/tools/pipeline_configs/compute_platform/slurm_1h.gin new file mode 100644 index 000000000..74c4c3c77 --- /dev/null +++ b/worldgen/tools/pipeline_configs/compute_platform/slurm_1h.gin @@ -0,0 +1,12 @@ +include 'tools/pipeline_configs/compute_platform/slurm.gin' + +queue_combined.hours = 1 +queue_coarse.hours = 1 +queue_fine_terrain.hours = 1 +queue_populate.hours = 1 +queue_render.hours = 1 +queue_upload.hours = 1 +queue_mesh_save.hours = 1 +queue_opengl.hours = 1 + +queue_coarse.cpus = 8 \ No newline at end of file diff --git a/worldgen/tools/pipeline_configs/blender_gt.gin b/worldgen/tools/pipeline_configs/gt_options/blender_gt.gin similarity index 100% rename from worldgen/tools/pipeline_configs/blender_gt.gin rename to worldgen/tools/pipeline_configs/gt_options/blender_gt.gin diff --git a/worldgen/tools/pipeline_configs/gt_test.gin b/worldgen/tools/pipeline_configs/gt_options/gt_test.gin similarity index 100% rename from worldgen/tools/pipeline_configs/gt_test.gin rename to worldgen/tools/pipeline_configs/gt_options/gt_test.gin diff --git a/worldgen/tools/pipeline_configs/opengl_gt.gin b/worldgen/tools/pipeline_configs/gt_options/opengl_gt.gin similarity index 100% rename from worldgen/tools/pipeline_configs/opengl_gt.gin rename to worldgen/tools/pipeline_configs/gt_options/opengl_gt.gin diff --git a/worldgen/tools/pipeline_configs/gt_options/opengl_gt_noshortrender.gin b/worldgen/tools/pipeline_configs/gt_options/opengl_gt_noshortrender.gin new file 
mode 100644 index 000000000..a3db10fa7 --- /dev/null +++ b/worldgen/tools/pipeline_configs/gt_options/opengl_gt_noshortrender.gin @@ -0,0 +1,7 @@ +include 'tools/pipeline_configs/opengl_gt.gin' # incase someone adds other settings to it + +iterate_scene_tasks.camera_dependent_tasks = [ + {'name': 'renderbackup', 'func': @renderbackup/queue_render}, # still call it "backup" since it is reusing the compute_platform's backup config. we are just skipping straight to the backup + {'name': 'savemesh', 'func': @queue_mesh_save}, + {'name': 'opengl', 'func': @queue_opengl} +] \ No newline at end of file diff --git a/worldgen/tools/pipeline_configs/opengl_gt_noshortrender.gin b/worldgen/tools/pipeline_configs/opengl_gt_noshortrender.gin new file mode 100644 index 000000000..a3db10fa7 --- /dev/null +++ b/worldgen/tools/pipeline_configs/opengl_gt_noshortrender.gin @@ -0,0 +1,7 @@ +include 'tools/pipeline_configs/opengl_gt.gin' # incase someone adds other settings to it + +iterate_scene_tasks.camera_dependent_tasks = [ + {'name': 'renderbackup', 'func': @renderbackup/queue_render}, # still call it "backup" since it is reusing the compute_platform's backup config. we are just skipping straight to the backup + {'name': 'savemesh', 'func': @queue_mesh_save}, + {'name': 'opengl', 'func': @queue_opengl} +] \ No newline at end of file diff --git a/worldgen/tools/render_video_final.sh b/worldgen/tools/render_video_final.sh deleted file mode 100644 index bbc7020b7..000000000 --- a/worldgen/tools/render_video_final.sh +++ /dev/null @@ -1,4 +0,0 @@ -HOSTFIRST=$(hostname | tr "." 
"\n" | head -n 1) -JOBNAME=$(date '+%m_%d_%H_%M').$HOSTFIRST.$1 -python -m tools.manage_datagen_jobs --blender_path ../blender/blender --output_folder outputs/$JOBNAME --num_scenes 500 \ - --pipeline_config slurm stereo_video cuda_terrain opengl_gt --wandb_mode online --cleanup big_files --warmup_sec 60000 --config trailer high_quality_terrain --upload diff --git a/worldgen/tools/results/parse_videos.py b/worldgen/tools/results/parse_videos.py index 6f1ae4841..867aab7c7 100644 --- a/worldgen/tools/results/parse_videos.py +++ b/worldgen/tools/results/parse_videos.py @@ -37,7 +37,7 @@ filters = " " if args.overlay: filters += f"-vf drawtext='text={seed_folder.absolute()}' " - cmd = f'ffmpeg -y -r {args.fps} -pattern_type glob -i {seed_folder.absolute()}/frames*/{args.image_type}*.png {filters} -pix_fmt yuv420p {output_folder}/{seed_folder.name}_{args.image_type}.mp4' + cmd = f'ffmpeg -y -r {args.fps} -pattern_type glob -i {seed_folder.absolute()}/frames*_0/{args.image_type}*.png {filters} -pix_fmt yuv420p {output_folder}/{seed_folder.name}_{args.image_type}.mp4' print(cmd.split()) subprocess.run(cmd.split()) diff --git a/worldgen/tools/summarize.py b/worldgen/tools/results/summarize.py similarity index 89% rename from worldgen/tools/summarize.py rename to worldgen/tools/results/summarize.py index 56bf24193..575c9f6fb 100644 --- a/worldgen/tools/summarize.py +++ b/worldgen/tools/results/summarize.py @@ -29,7 +29,7 @@ def parse_mask_tag_jsons(base_folder): for file_path in base_folder.rglob('MaskTag.json'): if match := re.fullmatch("fine_([0-9])_([0-9])_([0-9]{4})_([0-9])", file_path.parent.name): _, _, frame_str, _ = match.groups() - yield (frame_str, file_path) + yield (int(frame_str), file_path) for file_path in base_folder.rglob('MaskTag.json'): if match := re.fullmatch("fine.*", file_path.parent.name): yield (0, file_path) @@ -47,9 +47,11 @@ def summarize_folder(base_folder): output[data_type][suffix][rig][subcam][frame_str] = 
str(file_path.relative_to(base_folder)) max_frame = max(max_frame, int(frame_str)) + print(output.keys()) + # Rename keys - output["Camera Pose"] = output.pop("T") - output["Camera Intrinsics"] = output.pop("K") + #output["Camera Pose"] = output.pop("T") + #output["Camera Intrinsics"] = output.pop("K") mask_tag_jsons = sorted(parse_mask_tag_jsons(base_folder)) for frame in range(1, max_frame+1): @@ -115,23 +117,15 @@ def depth_to_jet(depth, scale_vmin=1.0): depth[~valid] = 1 return np.ascontiguousarray(depth[...,:3] * 255, dtype=np.uint8) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument('folder', type=Path) - parser.add_argument('--preview', action='store_true') - args = parser.parse_args() - - summary_json = summarize_folder(args.folder) +def process_scene_folder(folder, preview): + summary_json = summarize_folder(folder) folder_data = json.loads(summary_json.read_text()) missing = what_is_missing(folder_data) print("\n".join(missing)) - if not args.preview: - exit(0) + if not preview: + return depth_paths = folder_data["Depth"]['npy']["00"]["00"] flow3d_paths = folder_data["Flow3D"]['npy']["00"]["00"] @@ -142,15 +136,26 @@ def depth_to_jet(depth, scale_vmin=1.0): shape = (1280, 720) with mp.Pool() as pool: - all_flow_frames = pool.starmap(process_flow_frame, tqdm([(args.folder / path, shape) for _, path in sorted(flow3d_paths.items())])) - all_depth_frames = pool.starmap(process_depth_frame, tqdm([(args.folder / path, shape) for _, path in sorted(depth_paths.items())])) - all_occlusion_frames = pool.starmap(process_mask, tqdm([(args.folder / path, shape) for _, path in sorted(occlusion_boundary_paths.items())])) - all_flow_mask_frames = pool.starmap(process_mask, tqdm([(args.folder / path, shape) for _, path in sorted(flow_mask_paths.items())])) + all_flow_frames = pool.starmap(process_flow_frame, tqdm([(folder / path, shape) for _, path in sorted(flow3d_paths.items())])) + all_depth_frames = 
pool.starmap(process_depth_frame, tqdm([(folder / path, shape) for _, path in sorted(depth_paths.items())])) + all_occlusion_frames = pool.starmap(process_mask, tqdm([(folder / path, shape) for _, path in sorted(occlusion_boundary_paths.items())])) + all_flow_mask_frames = pool.starmap(process_mask, tqdm([(folder / path, shape) for _, path in sorted(flow_mask_paths.items())])) - previews: Path = args.folder / "previews" + previews: Path = folder / "previews" previews.mkdir(exist_ok=True) frames_to_video(previews / 'occlusion_boundaries.avi', all_occlusion_frames) frames_to_video(previews / 'flow_mask.avi', all_flow_mask_frames) depth_visualization = depth_to_jet(np.asarray(all_depth_frames)) frames_to_video(previews / 'video_depth.avi', depth_visualization) frames_to_video(previews / 'flow_video.avi', all_flow_frames) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('folder', type=Path) + parser.add_argument('--preview', action='store_true') + args = parser.parse_args() + + process_scene_folder(args.folder, preview=args.preview) + + diff --git a/worldgen/tools/scripts/render_video_1080p.sh b/worldgen/tools/scripts/render_video_1080p.sh new file mode 100644 index 000000000..5b54f3d97 --- /dev/null +++ b/worldgen/tools/scripts/render_video_1080p.sh @@ -0,0 +1,8 @@ +HOSTFIRST=$(hostname | tr "." "\n" | head -n 1) +JOBNAME=$(date '+%m_%d_%H_%M').$HOSTFIRST.$1 + +python -m tools.manage_datagen_jobs --blender_path ../blender/blender --output_folder outputs/$JOBNAME \ + --num_scenes 100 --pipeline_config $@ stereo_video cuda_terrain opengl_gt_noshortrender \ + --wandb_mode online --cleanup big_files --upload \ + --warmup_sec 40000 \ + --config high_quality_terrain diff --git a/worldgen/tools/scripts/render_video_720p.sh b/worldgen/tools/scripts/render_video_720p.sh new file mode 100644 index 000000000..a00dbb994 --- /dev/null +++ b/worldgen/tools/scripts/render_video_720p.sh @@ -0,0 +1,8 @@ +HOSTFIRST=$(hostname | tr "." 
"\n" | head -n 1) +JOBNAME=$(date '+%m_%d_%H_%M').$HOSTFIRST.$1 + +python -m tools.manage_datagen_jobs --output_folder outputs/$JOBNAME \ + --num_scenes 100 --pipeline_config $@ stereo_video cuda_terrain opengl_gt \ + --wandb_mode online --cleanup big_files --upload --warmup_sec 6000 \ + --config high_quality_terrain \ + --overrides compose_scene.generate_resolution=[1280,720] diff --git a/worldgen/tools/scripts/render_video_stereo.sh b/worldgen/tools/scripts/render_video_stereo.sh new file mode 100644 index 000000000..3e1d4edc3 --- /dev/null +++ b/worldgen/tools/scripts/render_video_stereo.sh @@ -0,0 +1,7 @@ +HOSTFIRST=$(hostname | tr "." "\n" | head -n 1) +JOBNAME=$(date '+%m_%d_%H_%M').$HOSTFIRST.$1 + +python -m tools.manage_datagen_jobs --blender_path ../blender/blender --output_folder outputs/$JOBNAME \ + --num_scenes 1000 --pipeline_config $@ stereo cuda_terrain opengl_gt \ + --wandb_mode online --cleanup big_files --upload --warmup_sec 40000 \ + --override compose_scene.generate_resolution=[1280,720] \ diff --git a/worldgen/tools/cancel_jobs.py b/worldgen/tools/util/cancel_jobs.py similarity index 100% rename from worldgen/tools/cancel_jobs.py rename to worldgen/tools/util/cancel_jobs.py diff --git a/worldgen/tools/util/show_gpu_table.py b/worldgen/tools/util/show_gpu_table.py index 07a983ced..dca63f04c 100644 --- a/worldgen/tools/util/show_gpu_table.py +++ b/worldgen/tools/util/show_gpu_table.py @@ -52,7 +52,7 @@ def nodes_with_gpus(*gpu_names): if len(gpu_names) == 0: return [] _, node_type_lookup, _ = get_gpu_nodes() - return sorted(chain.from_iterable(node_type_lookup[n] for n in gpu_names)) + return sorted(chain.from_iterable(node_type_lookup.get(n, set()) for n in gpu_names)) if __name__ == '__main__': gpu_table, node_type_lookup, shared_node_mem = get_gpu_nodes() diff --git a/worldgen/tools/util/submitit_emulator.py b/worldgen/tools/util/submitit_emulator.py index 5612735c3..d5153aa9a 100644 --- a/worldgen/tools/util/submitit_emulator.py +++ 
b/worldgen/tools/util/submitit_emulator.py @@ -69,7 +69,6 @@ def get_fake_job_id(): def job_wrapper(func, inner_args, inner_kwargs, stdout_file: Path, stderr_file: Path, cuda_devices=None): - with stdout_file.open('w') as stdout, stderr_file.open('w') as stderr: sys.stdout = stdout sys.stderr = stderr @@ -103,6 +102,7 @@ class ImmediateLocalExecutor: def __init__(self, folder: str): self.log_folder = Path(folder).resolve() + self.log_folder.mkdir(exist_ok=True) self.parameters = {} def update_parameters(self, **parameters): diff --git a/worldgen/tools/util/upload_util.py b/worldgen/tools/util/upload_util.py index ba19bef16..b23f2f0b5 100644 --- a/worldgen/tools/util/upload_util.py +++ b/worldgen/tools/util/upload_util.py @@ -17,7 +17,7 @@ import subprocess from shutil import which, copyfile -from . import smb_client +from . import smb_client, cleanup GDRIVE_NAME = None @@ -31,38 +31,16 @@ def rclone_upload_file(src_file, dst_folder): subprocess.check_output(cmd.split()) print(f"Uploaded {src_file}") +def get_commit_hash(): + git = which('git') + if git is None: + return None + cmd = f"{git} rev-parse HEAD" + return subprocess.check_output(cmd.split()).decode().strip() + # DO NOT make gin.configurable # this function gets submitted via pickle in some settings, and gin args are not preserved -def upload_folder(folder, upload_dest_folder, method, metadata=None, **kwargs): - - upload_info_path = folder / f"{folder.name}.json" - upload_info = { - 'user': os.environ['USER'], - 'node': platform.node().split('.')[0], - 'timestamp': time.time(), - 'datetime': datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), - **(metadata if metadata is not None else {}) - } - with upload_info_path.open('w') as f: - json.dump(upload_info, f, indent=4) - - with tarfile.open(folder.with_suffix('.tar.gz'), "w:gz") as tar: - tar.add(folder, os.path.sep) - assert folder.with_suffix('.tar.gz').exists() - - if method == 'rclone': - upload_func = rclone_upload_file - elif method == 'smbclient': - 
upload_func = smb_client.upload - else: - raise ValueError(f'Unrecognized {method=}') - - upload_func(folder.with_suffix('.tar.gz'), upload_dest_folder, **kwargs) - upload_func(upload_info_path, upload_dest_folder, **kwargs) - -def upload_job_folder(parent_folder, task_uniqname, dir_prefix_len=3, method='smbclient', **kwargs): - - parent_folder = Path(parent_folder) +def reorganize_before_upload(parent_folder): seed = parent_folder.name tmpdir = (parent_folder / "tmp" / seed) @@ -71,6 +49,7 @@ def upload_job_folder(parent_folder, task_uniqname, dir_prefix_len=3, method='sm frames_folders = list(sorted(parent_folder.glob("frames*"))) for idx, frames_folder in enumerate(frames_folders): + subfolder_name = f"resample_{idx}" if (idx > 0) else "original" subfolder = tmpdir / subfolder_name info_dir = subfolder / "info" @@ -95,26 +74,67 @@ def upload_job_folder(parent_folder, task_uniqname, dir_prefix_len=3, method='sm copyfile(parent_folder / "run_pipeline.sh", log_dir / "run_pipeline.sh") - version = (parent_folder / "fine" / "version.txt").read_text().splitlines()[0] - upload_dest_folder = Path('infinigen')/'renders'/version - if dir_prefix_len != 0: - upload_dest_folder = upload_dest_folder/seed[:dir_prefix_len] +# DO NOT make gin.configurable +# this function gets submitted via pickle in some settings, and gin args are not preserved +def upload_job_folder( + parent_folder, + task_uniqname, + dir_prefix_len=3, + method='smbclient', +): - metadata = { - 'n_frames_folders': len(frames_folders), - 'original_directory': str(parent_folder.resolve()) - } + parent_folder = Path(parent_folder) - upload_folder(tmpdir, upload_dest_folder, method=method, metadata=metadata, **kwargs) + if method == 'rclone': + upload_func = rclone_upload_file + elif method == 'smbclient': + upload_func = smb_client.upload + else: + raise ValueError(f'Unrecognized {method=}') - (parent_folder / "logs" / f"FINISH_{task_uniqname}").touch() + jobname = parent_folder.parent.name + seed = 
parent_folder.name + + upload_dest_folder = Path('infinigen')/'renders'/jobname + if dir_prefix_len != 0: + upload_dest_folder = upload_dest_folder/parent_folder.name[:dir_prefix_len] + + print(f'{method=} {upload_dest_folder=}') + + all_images = sorted(list(parent_folder.rglob("frames*/Image*.png"))) + if len(all_images) > 0: + thumb_path = parent_folder/f'{seed}_thumbnail.png' + copyfile(all_images, thumb_path) + upload_func(thumb_path, upload_dest_folder) -def test(): - import manage_datagen_jobs - find_gin = lambda n: os.path.join("tools", "pipeline_configs", f"{n}.gin") - configs = [find_gin(n) for n in ['andromeda', 'smb_login']] - gin.parse_config_files_and_bindings(configs, bindings=[]) - upload_folder(Path('outputs/23_01_25_allcs/a4b66f1'), 'upload', dir_prefix_len=3, method='smbclient') + try: + version = (parent_folder / "coarse" / "version.txt").read_text().splitlines()[0] + except FileNotFoundError: + version = None -if __name__ == "__main__": - test() + metadata = { + 'original_directory': str(parent_folder.resolve()), + 'user': os.environ['USER'], + 'node': platform.node().split('.')[0], + 'timestamp': time.time(), + 'datetime': datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), + 'version': version, + 'commit': get_commit_hash(), + 'n_frames': len(all_images) + } + metadata_path = parent_folder/f'{seed}_metadata.json' + with metadata_path.open('w') as f: + json.dump(metadata, f, indent=4) + print(metadata_path, metadata) + upload_func(metadata_path, upload_dest_folder) + + tar_path = parent_folder.with_suffix('.tar.gz') + print(f"Performing cleanup and tar to {tar_path}") + cleanup.cleanup(parent_folder) + with tarfile.open(tar_path, "w:gz") as tar: + tar.add(parent_folder, os.path.sep) + assert tar_path.exists() + + print(f"Uploading tarfile") + upload_func(tar_path, upload_dest_folder) + (parent_folder / "logs" / f"FINISH_{task_uniqname}").touch() diff --git a/worldgen/util/organization.py b/worldgen/util/organization.py index 5f4ff6fcc..ad57e1b38 
100644 --- a/worldgen/util/organization.py +++ b/worldgen/util/organization.py @@ -7,7 +7,6 @@ class Task: Coarse = "coarse" Populate = "populate" - Fine = "fine" FineTerrain = "fine_terrain" Render = "render" GroundTruth = "ground_truth"