From 54cc4b3a742ce2e61a12005236ed9a3a8b8dcd14 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 3 Sep 2025 08:46:23 +0000 Subject: [PATCH 01/17] Add script to split module based on source paths This adds a script, `tools/empath-split.py`, which is a wrapper for Binaryen's `wasm-split`. `wasm-split` has a `--multi-split` mode, which takes a manifest file that lists the names of the functions belonging to each module. (Example: https://github.com/WebAssembly/binaryen/blob/main/test/lit/wasm-split/multi-split.wast.manifest) But listing all functions belonging to each module by hand is a tedious process. `empath-split` takes a wasm file and a text file that has a list of paths, which can be either directories or source files, and, using the source map information, generates a manifest file and runs `wasm-split`. This also makes a small drive-by fix for `emsymbolizer`. Currently, when it is given an address of 0, it returns the location info associated with offsets[-1], which is the largest offset. This fixes it, and adds an optional `lower_bound` argument to `find_offset` so that when we want to get a source info entry, we don't go below the current function's start offset. 
--- test/test_other.py | 48 ++++++ tools/empath-split.py | 229 +++++++++++++++++++++++++++++ tools/emsymbolizer.py | 27 ++-- tools/link.py | 4 +- tools/maint/create_entry_points.py | 2 + tools/webassembly.py | 37 +++++ 6 files changed, 335 insertions(+), 12 deletions(-) create mode 100755 tools/empath-split.py diff --git a/test/test_other.py b/test/test_other.py index 82ff8310fcb9a..f510c64ec1e87 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -57,6 +57,7 @@ emmake = shared.bat_suffix(path_from_root('emmake')) emconfig = shared.bat_suffix(path_from_root('em-config')) emsize = shared.bat_suffix(path_from_root('emsize')) +empath_split = shared.bat_suffix(path_from_root('empath-split')) emprofile = shared.bat_suffix(path_from_root('emprofile')) emstrip = shared.bat_suffix(path_from_root('emstrip')) emsymbolizer = shared.bat_suffix(path_from_root('emsymbolizer')) @@ -16589,3 +16590,50 @@ def test_create_preloaded_file(self): return 0; }''') self.do_runf('main.c', 'done\n', cflags=['-sFORCE_FILESYSTEM', '--post-js=post.js']) + + def test_empath_split(self): + create_file('main.cpp', r''' + #include + void foo(); + int main() { + std::cout << "main" << std::endl; + foo(); + return 0; + } + ''') + create_file('foo.cpp', r''' + #include + void foo() { std::cout << "foo" << std::endl; } + ''') + create_file('path_list', r''' + main.cpp + foo.cpp + /emsdk/emscripten/system + /emsdk/emscripten/system/lib/libc/musl + /emsdk/emscripten/system/lib/libcxx + ''') + + self.run_process([EMCC, 'main.cpp', 'foo.cpp', '-gsource-map', '-g2', '-gsource-map', '-o', 'test.js']) + self.run_process([empath_split, 'test.wasm', 'path_list', '-g', '-o', 'test_primary.wasm', '--out-prefix=test_']) + + # Check if functions are correctly assigned and split with the specified + # paths. When one path contains another, the inner path should take its + # functions first, and the rest is split with the outer path. 
+ def has_defined_function(file, func): + self.run_process([common.WASM_DIS, file, '-o', 'test.wast']) + pattern = re.compile(r'\(\s*func\s+\$' + func + r'[\s\(\)]') + with open('test.wast', 'r') as f: + return pattern.search(f.read()) is not None + + # main.cpp + self.assertTrue(has_defined_function('test_0.wasm', '__original_main')) + # foo.cpp + self.assertTrue(has_defined_function('test_1.wasm', r'foo\\28\\29')) + # /emsdk/emscripten/system + self.assertTrue(has_defined_function('test_2.wasm', '__abort_message')) + self.assertTrue(has_defined_function('test_2.wasm', 'pthread_cond_wait')) + # /emsdk/emscripten/system/lib/libc/musl + self.assertTrue(has_defined_function('test_3.wasm', 'strcmp')) + # /emsdk/emscripten/system/lib/libcxx + self.assertTrue(has_defined_function('test_4.wasm', r'std::__2::ios_base::getloc\\28\\29\\20const')) + self.assertTrue(has_defined_function('test_4.wasm', r'std::uncaught_exceptions\\28\\29')) diff --git a/tools/empath-split.py b/tools/empath-split.py new file mode 100755 index 0000000000000..90b27eed71a34 --- /dev/null +++ b/tools/empath-split.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python3 +# Copyright 2025 The Emscripten Authors. All rights reserved. +# Emscripten is available under two separate licenses, the MIT license and the +# University of Illinois/NCSA Open Source License. Both these licenses can be +# found in the LICENSE file. + +""" +Wrapper for 'wasm-split --multi-split' functionality. This script generates a +.manifest file based on the list of user source paths, using source map +information. + +This assumes the name section exists in the input wasm file, and also assumes +the sourceMappingURL section exists in the input or a source map file is +separately supplied with --sourcemap. 
If we have two files a.c and b.c, to +generate a source map and the name section, if you compile and link within a +single command, you can do something like +$ emcc -g2 -gsrouce-map a.c b.c -o result.js +If you want to compile and link in separate commands, you can do +$ emcc -gsource-map a.c -o a.o +$ emcc -gsource-map b.c -o b.o +$ emcc -g2 -gsource-map a.o b.o -o result.js +See https://emscripten.org/docs/porting/Debugging.html for more details. + +This takes a wasm file and a paths file, which is a text file containing a list +of paths as inputs. The paths file should contain a single path per line. A +single split module will be generated per specified path. If a specified path +contains another specified path, functions contained in the inner path will be +split as the inner path's module, and the rest of the functions will be split as +the outer path's module. Functions that do not belong to any of the specified +paths will remain in the primary module. +""" + +import argparse +import os +import sys +import tempfile +from pathlib import PurePath + +__scriptdir__ = os.path.dirname(os.path.abspath(__file__)) +__rootdir__ = os.path.dirname(__scriptdir__) +sys.path.insert(0, __rootdir__) + +from tools import config +from tools import diagnostics +from tools import emsymbolizer +from tools import shared +from tools import webassembly +from tools.utils import exit_with_error, normalize_path + +def parse_args(): + parser = argparse.ArgumentParser( + description='Split a wasm file based on user paths', + epilog=""" +This is a wrapper for 'wasm-split --multi-split' functionality, so you should +add wasm-split's command line options as well. You should or may want to add +wasm-split options like -o (--output), --out-prefix, -g, and feature +enabling/disabling options. Run 'wasm-split -h' for the list of options. But you +should NOT add --manifest, because this will be generated from this script. 
+""") + parser.add_argument('wasm', help='Path to the input wasm file') + parser.add_argument('paths_file', help='Path to the input file containing paths') + parser.add_argument('-s', '--sourcemap', help='Force source map file') + parser.add_argument('-v', '--verbose', action='store_true', + help='Print verbose info for debugging this script') + parser.add_argument('--wasm-split', help='Path to wasm-split executable') + parser.add_argument('--preserve-manifest', action='store_true', + help='Preserve generated manifest file. This sets --verbose too.') + args, forwarded_args = parser.parse_known_args() + if '--manifest' in forwarded_args: + exit_with_error('manifest file will be generated by this script and should not be given') + if args.preserve_manifest: + args.verbose = True + return args, forwarded_args + + +def get_path_to_functions_map(wasm, sourcemap, paths, verbose): + def is_synthesized_func(func): + # TODO There can be more + synthesized_names = [ + 'main', + '__wasm_call_ctors', + '__clang_call_terminate', + ] + synthesized_prefixes = [ + 'legalstub$', + 'legalfunc$', + '__cxx_global_', + '_GLOBAL__', + 'virtual thunk to ', + ] + if func in synthesized_names: + return True + return func.startswith(tuple(synthesized_prefixes)) + + with webassembly.Module(wasm) as module: + if not module.has_name_section(): + exit_with_error('Name section does not eixst') + if not sourcemap: + if not emsymbolizer.get_sourceMappingURL_section(module): + exit_with_error('sourceMappingURL section does not exist') + + code_section = module.get_section(webassembly.SecType.CODE) + + funcs = module.get_functions() + func_names = module.get_function_names()[module.num_imported_funcs():] + assert len(funcs) == len(func_names) + + func_to_src = {} + src_to_funcs = {} + + if not sourcemap: + sourcemap = module.get_sourceMappingURL() + sm = emsymbolizer.WasmSourceMap() + sm.parse(sourcemap) + + for func_name, func in zip(func_names, funcs): + # From the last address, decrement the 
address by 1 until we find location + # info with source file information. The reason we do this is to reduce + # the probability of picking an address where another function is inlined + # into, picking the inlined function's source. + # We start from the end because it is simpler; it is harder to compute the + # first instruction's address, because there is a gap for local types + # between function offset and the first instruction. + addr = func.offset + func.size - 1 + while addr > func.offset: + loc = sm.lookup(addr, func.offset) + # This means there is no source map mappings for the entire function + # (because we give func.offset as a lower bound). Exit the loop. + if not loc: + break + # Exit the loop only if a location info with source file information is + # found. If not, continue the search. + if loc.source: + break + addr -= 1 + + if loc and loc.source: + func_to_src[func_name] = normalize_path(loc.source) + else: + if not is_synthesized_func(func_name): + diagnostics.warn(f"No source file information found in the source map for function '{func_name}'") + + for func_name, src in func_to_src.items(): + if src not in src_to_funcs: + src_to_funcs[src] = [] + src_to_funcs[src].append(func_name) + + # Visit paths in the reverse sorting order, so that we can proces inner paths + # first. + # e.g. If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign + # functions contained in /a/b/c to it first and assign the maining functions + # to /a/b. 
+ visited_funcs = set() + path_to_funcs = {} + for path in sorted(paths, reverse=True): + ppath = PurePath(path) + path_to_funcs[path] = [] + for src, funcs in src_to_funcs.items(): + psrc = PurePath(src) + if ppath == psrc or ppath in psrc.parents: + for func in funcs: + if func not in visited_funcs: + visited_funcs.add(func) + path_to_funcs[path].append(func) + return path_to_funcs + + +def main(): + args, forwarded_args = parse_args() + wasm = os.path.expanduser(args.wasm) + paths_file = os.path.expanduser(args.paths_file) + sourcemap = os.path.expanduser(args.sourcemap) if args.sourcemap else None + if args.wasm_split: + wasm_split = os.path.expanduser(args.wasm_split) + else: + wasm_split = os.path.join(config.BINARYEN_ROOT, 'bin', 'wasm-split') + + if not os.path.isfile(wasm): + exit_with_error(f"'{wasm}' was not found or not a file") + if not os.path.isfile(paths_file): + exit_with_error(f"'{paths_file}' was not found or not a file") + if sourcemap: + if not os.path.isfile(sourcemap): + exit_with_error(f"'{sourcemap}' was not found or not a file") + if not os.path.isfile(wasm_split): + exit_with_error(f"'{wasm_split}' was not found or not a file") + + with open(paths_file, 'r', encoding='utf-8') as f: + paths = [normalize_path(path.strip()) for path in f if path.strip()] + # To make /a/b/c and /a/b/c/ equivalent + paths = [path.rstrip(os.sep) for path in paths] + # Remove duplicates + paths = list(dict.fromkeys(paths)) + + path_to_funcs = get_path_to_functions_map(wasm, sourcemap, paths, args.verbose) + + f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False) + try: + manifest = f.name + for i, path in enumerate(paths): + f.write(f'{i}\n') + if not path_to_funcs[path]: + diagnostics.warn(f'{path} does not match any functions') + if args.verbose: + print(path) + for func in path_to_funcs[path]: + print(' ' + func) + print() + for func in path_to_funcs[path]: + f.write(func + '\n') + if i < len(paths) - 1: + f.write('\n') + f.close() + + 
cmd = [wasm_split, '--multi-split', wasm, '--manifest', manifest] + if args.verbose: + # This option is used both in this script and wasm-split + cmd.append('-v') + cmd += forwarded_args + if args.verbose: + print('\n' + ' '.join(cmd)) + shared.run_process(cmd) + finally: + if not args.preserve_manifest: + os.remove(manifest) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/tools/emsymbolizer.py b/tools/emsymbolizer.py index b411f49da3d15..6affa7d4c9b1a 100755 --- a/tools/emsymbolizer.py +++ b/tools/emsymbolizer.py @@ -178,7 +178,7 @@ def decodeVLQ(string): self.offsets.append(offset) self.offsets.sort() - def find_offset(self, offset): + def find_offset(self, offset, lower_bound=None): # Find the largest mapped offset <= the search offset lo = 0 hi = len(self.offsets) @@ -189,11 +189,22 @@ def find_offset(self, offset): hi = mid else: lo = mid + 1 - return self.offsets[lo - 1] + if lo == 0: + return None + # If lower bound is given, return the offset only if the offset is equal to + # or greather than the lower bound + if lower_bound: + if self.offsets[lo - 1] >= lower_bound: + return self.offsets[lo - 1] + else: + return None + else: + return self.offsets[lo - 1] - def lookup(self, offset): - nearest = self.find_offset(offset) - assert nearest in self.mappings, 'Sourcemap has an offset with no mapping' + def lookup(self, offset, lower_bound=None): + nearest = self.find_offset(offset, lower_bound) + if not nearest: + return None info = self.mappings[nearest] return LocationInfo( self.sources[info.source] if info.source is not None else None, @@ -206,12 +217,8 @@ def symbolize_address_sourcemap(module, address, force_file): URL = force_file if not URL: # If a sourcemap file is not forced, read it from the wasm module - section = get_sourceMappingURL_section(module) - assert section - module.seek(section.offset) - assert module.read_string() == 'sourceMappingURL' # TODO: support stripping/replacing a prefix from the URL - URL = module.read_string() + 
URL = module.get_sourceMappingURL() if shared.DEBUG: print(f'Source Mapping URL: {URL}') diff --git a/tools/link.py b/tools/link.py index 65ac42b47bd47..f8adf61cc360c 100644 --- a/tools/link.py +++ b/tools/link.py @@ -2301,8 +2301,8 @@ def phase_binaryen(target, options, wasm_target): intermediate_debug_info -= 1 # currently binaryen's DWARF support will limit some optimizations; warn on # that. see https://github.com/emscripten-core/emscripten/issues/15269 - if settings.GENERATE_DWARF: - diagnostics.warning('limited-postlink-optimizations', 'running limited binaryen optimizations because DWARF info requested (or indirectly required)') + #if settings.GENERATE_DWARF: + # diagnostics.warning('limited-postlink-optimizations', 'running limited binaryen optimizations because DWARF info requested (or indirectly required)') with ToolchainProfiler.profile_block('wasm_opt'): building.run_wasm_opt(wasm_target, wasm_target, diff --git a/tools/maint/create_entry_points.py b/tools/maint/create_entry_points.py index d610960c03349..93693b5496978 100755 --- a/tools/maint/create_entry_points.py +++ b/tools/maint/create_entry_points.py @@ -43,6 +43,7 @@ emstrip emsymbolizer emscan-deps +empath-split tools/file_packager tools/webidl_binder test/runner @@ -56,6 +57,7 @@ 'emdwp': 'tools/emdwp', 'emnm': 'tools/emnm', 'emsymbolizer': 'tools/emsymbolizer', + 'empath-split': 'tools/empath-split', } diff --git a/tools/webassembly.py b/tools/webassembly.py index d4c26c0cab15d..ce574f713afbd 100644 --- a/tools/webassembly.py +++ b/tools/webassembly.py @@ -521,6 +521,34 @@ def get_function_types(self): num_types = self.read_uleb() return [self.read_uleb() for _ in range(num_types)] + @memoize + def get_function_names(self): + num_funcs = self.num_imported_funcs() + len(self.get_functions()) + names = [None] * num_funcs + + name_section = self.get_custom_section('name') + if not name_section: + return names + + self.seek(name_section.offset) + self.read_string() # section name + section_end = 
name_section.offset + name_section.size + + while self.tell() < section_end: + subsection_id = self.read_byte() + subsection_size = self.read_uleb() + if subsection_id == 1: # function names + count = self.read_uleb() + for _ in range(count): + func_idx = self.read_uleb() + func_name = self.read_string() + assert func_idx < len(names) + names[func_idx] = func_name + else: + self.skip(subsection_size) + + return names + def has_name_section(self): return self.get_custom_section('name') is not None @@ -579,6 +607,15 @@ def get_target_features(self): features[feature] = prefix return features + @memoize + def get_sourceMappingURL(self): + section = self.get_custom_section('sourceMappingURL') + if not section: + return '' + self.seek(section.offset) + self.read_string() # 'sourceMappingURL' + return self.read_string() + def parse_dylink_section(wasm_file): with Module(wasm_file) as module: From ea300b2e8b6c200e80b83714b4ac40e17faa2db5 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 15 Sep 2025 17:09:26 +0000 Subject: [PATCH 02/17] Revert accidental change --- tools/link.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/link.py b/tools/link.py index f8adf61cc360c..65ac42b47bd47 100644 --- a/tools/link.py +++ b/tools/link.py @@ -2301,8 +2301,8 @@ def phase_binaryen(target, options, wasm_target): intermediate_debug_info -= 1 # currently binaryen's DWARF support will limit some optimizations; warn on # that. 
see https://github.com/emscripten-core/emscripten/issues/15269 - #if settings.GENERATE_DWARF: - # diagnostics.warning('limited-postlink-optimizations', 'running limited binaryen optimizations because DWARF info requested (or indirectly required)') + if settings.GENERATE_DWARF: + diagnostics.warning('limited-postlink-optimizations', 'running limited binaryen optimizations because DWARF info requested (or indirectly required)') with ToolchainProfiler.profile_block('wasm_opt'): building.run_wasm_opt(wasm_target, wasm_target, From 355fc6006b06ac259406a36cfc77203cb9310a0e Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 15 Sep 2025 17:36:53 +0000 Subject: [PATCH 03/17] comments --- tools/empath-split.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/empath-split.py b/tools/empath-split.py index 90b27eed71a34..0aea45530f890 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -92,6 +92,9 @@ def is_synthesized_func(func): return True return func.startswith(tuple(synthesized_prefixes)) + # Compute {func_name: src file} map, and revert it to get + # {src file: list of functions} map, and construct {path: list of functions} + # map from it with webassembly.Module(wasm) as module: if not module.has_name_section(): exit_with_error('Name section does not eixst') @@ -192,8 +195,10 @@ def main(): # Remove duplicates paths = list(dict.fromkeys(paths)) + # Compute {path: list of functions} map path_to_funcs = get_path_to_functions_map(wasm, sourcemap, paths, args.verbose) + # Write .manifest file f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False) try: manifest = f.name From e6d39d6711335d344b857872b497615e2c91ae48 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 15 Sep 2025 18:06:12 +0000 Subject: [PATCH 04/17] Fix regex so that (import .. 
(func ..)) is not included --- test/test_other.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_other.py b/test/test_other.py index f510c64ec1e87..da912530d2623 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -16621,7 +16621,7 @@ def test_empath_split(self): # functions first, and the rest is split with the outer path. def has_defined_function(file, func): self.run_process([common.WASM_DIS, file, '-o', 'test.wast']) - pattern = re.compile(r'\(\s*func\s+\$' + func + r'[\s\(\)]') + pattern = re.compile(r'^\s*\(\s*func\s+\$' + func + r'[\s\(\)]', flags=re.MULTILINE) with open('test.wast', 'r') as f: return pattern.search(f.read()) is not None From 1246d2e34cc6c0755b02029332881808eb88cf8b Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 15 Sep 2025 18:11:55 +0000 Subject: [PATCH 05/17] comment typo --- tools/empath-split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index 0aea45530f890..0dbd4cd8bfdb4 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -148,7 +148,7 @@ def is_synthesized_func(func): src_to_funcs[src] = [] src_to_funcs[src].append(func_name) - # Visit paths in the reverse sorting order, so that we can proces inner paths + # Visit paths in the reverse sorting order, so that we can process inner paths # first. # e.g. 
If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign # functions contained in /a/b/c to it first and assign the maining functions From 9cb04b770feae8f62b5aa2deb07a6ef54066601b Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 15 Sep 2025 16:30:48 -0700 Subject: [PATCH 06/17] Update tools/empath-split.py Co-authored-by: Derek Schuff --- tools/empath-split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index 0dbd4cd8bfdb4..5551007586d9f 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -92,7 +92,7 @@ def is_synthesized_func(func): return True return func.startswith(tuple(synthesized_prefixes)) - # Compute {func_name: src file} map, and revert it to get + # Compute {func_name: src file} map, and invert it to get # {src file: list of functions} map, and construct {path: list of functions} # map from it with webassembly.Module(wasm) as module: From 4440ebf7ca972aa0f047aa92eae28d1b0651d03f Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 15 Sep 2025 23:58:44 +0000 Subject: [PATCH 07/17] Address comments --- test/test_other.py | 2 +- tools/empath-split.py | 4 +--- tools/webassembly.py | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/test/test_other.py b/test/test_other.py index da912530d2623..3f59c482245af 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -16613,7 +16613,7 @@ def test_empath_split(self): /emsdk/emscripten/system/lib/libcxx ''') - self.run_process([EMCC, 'main.cpp', 'foo.cpp', '-gsource-map', '-g2', '-gsource-map', '-o', 'test.js']) + self.run_process([EMCC, 'main.cpp', 'foo.cpp', '-gsource-map', '-g2', '-o', 'test.js']) self.run_process([empath_split, 'test.wasm', 'path_list', '-g', '-o', 'test_primary.wasm', '--out-prefix=test_']) # Check if functions are correctly assigned and split with the specified diff --git a/tools/empath-split.py b/tools/empath-split.py index 5551007586d9f..58a92894a656f 100755 --- 
a/tools/empath-split.py +++ b/tools/empath-split.py @@ -102,10 +102,8 @@ def is_synthesized_func(func): if not emsymbolizer.get_sourceMappingURL_section(module): exit_with_error('sourceMappingURL section does not exist') - code_section = module.get_section(webassembly.SecType.CODE) - funcs = module.get_functions() - func_names = module.get_function_names()[module.num_imported_funcs():] + func_names = module.get_function_names() assert len(funcs) == len(func_names) func_to_src = {} diff --git a/tools/webassembly.py b/tools/webassembly.py index ce574f713afbd..192bfdf850432 100644 --- a/tools/webassembly.py +++ b/tools/webassembly.py @@ -522,7 +522,7 @@ def get_function_types(self): return [self.read_uleb() for _ in range(num_types)] @memoize - def get_function_names(self): + def get_function_names(self, remove_imports=True): num_funcs = self.num_imported_funcs() + len(self.get_functions()) names = [None] * num_funcs @@ -547,7 +547,7 @@ def get_function_names(self): else: self.skip(subsection_size) - return names + return names[self.num_imported_funcs():] if remove_imports else names def has_name_section(self): return self.get_custom_section('name') is not None From 829e697771b782edc955cdd3a8684c5d5d957989 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Mon, 15 Sep 2025 23:58:57 +0000 Subject: [PATCH 08/17] ruff fixes --- test/test_other.py | 2 +- tools/empath-split.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_other.py b/test/test_other.py index 3f59c482245af..10b995a6e85be 100644 --- a/test/test_other.py +++ b/test/test_other.py @@ -16622,7 +16622,7 @@ def test_empath_split(self): def has_defined_function(file, func): self.run_process([common.WASM_DIS, file, '-o', 'test.wast']) pattern = re.compile(r'^\s*\(\s*func\s+\$' + func + r'[\s\(\)]', flags=re.MULTILINE) - with open('test.wast', 'r') as f: + with open('test.wast') as f: return pattern.search(f.read()) is not None # main.cpp diff --git a/tools/empath-split.py 
b/tools/empath-split.py index 58a92894a656f..0d1c10c104ce3 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -73,7 +73,7 @@ def parse_args(): return args, forwarded_args -def get_path_to_functions_map(wasm, sourcemap, paths, verbose): +def get_path_to_functions_map(wasm, sourcemap, paths): def is_synthesized_func(func): # TODO There can be more synthesized_names = [ @@ -186,7 +186,7 @@ def main(): if not os.path.isfile(wasm_split): exit_with_error(f"'{wasm_split}' was not found or not a file") - with open(paths_file, 'r', encoding='utf-8') as f: + with open(paths_file, encoding='utf-8') as f: paths = [normalize_path(path.strip()) for path in f if path.strip()] # To make /a/b/c and /a/b/c/ equivalent paths = [path.rstrip(os.sep) for path in paths] From 757253d7e9d9d9e971fe390ec194767e0fe49a11 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 16 Sep 2025 00:03:56 +0000 Subject: [PATCH 09/17] More ruff fix --- tools/empath-split.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/empath-split.py b/tools/empath-split.py index 0d1c10c104ce3..6712f1204def6 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -47,6 +47,7 @@ from tools import webassembly from tools.utils import exit_with_error, normalize_path + def parse_args(): parser = argparse.ArgumentParser( description='Split a wasm file based on user paths', From 272885c47c247c1f335136eeb08bce760338d2a1 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 16 Sep 2025 01:01:02 +0000 Subject: [PATCH 10/17] Add generated empath-split --- empath-split | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100755 empath-split diff --git a/empath-split b/empath-split new file mode 100755 index 0000000000000..c8fb10c8e5917 --- /dev/null +++ b/empath-split @@ -0,0 +1,35 @@ +#!/bin/sh +# Copyright 2020 The Emscripten Authors. All rights reserved. 
+# Emscripten is available under two separate licenses, the MIT license and the +# University of Illinois/NCSA Open Source License. Both these licenses can be +# found in the LICENSE file. +# +# Entry point for running python scripts on UNIX systems. +# +# Automatically generated by `create_entry_points.py`; DO NOT EDIT. +# +# To make modifications to this file, edit `tools/run_python.sh` and then run +# `tools/maint/create_entry_points.py` + +# $PYTHON -E does not ignore _PYTHON_SYSCONFIGDATA_NAME, an internal of cpython +# used in cross compilation via setup.py, so we unset it explicitly here. +unset _PYTHON_SYSCONFIGDATA_NAME + +if [ -z "$PYTHON" ]; then + PYTHON=$EMSDK_PYTHON +fi + +if [ -z "$PYTHON" ]; then + PYTHON=$(command -v python3 2> /dev/null) +fi + +if [ -z "$PYTHON" ]; then + PYTHON=$(command -v python 2> /dev/null) +fi + +if [ -z "$PYTHON" ]; then + echo 'unable to find python in $PATH' + exit 1 +fi + +exec "$PYTHON" -E "$(dirname $0)/tools/empath-split.py" "$@" From e47d6a44643a75bc02bb07783c18ec9d2afe783a Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 16 Sep 2025 01:36:33 +0000 Subject: [PATCH 11/17] Maybe I shouldn't add this after all --- empath-split | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100755 empath-split diff --git a/empath-split b/empath-split deleted file mode 100755 index c8fb10c8e5917..0000000000000 --- a/empath-split +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/sh -# Copyright 2020 The Emscripten Authors. All rights reserved. -# Emscripten is available under two separate licenses, the MIT license and the -# University of Illinois/NCSA Open Source License. Both these licenses can be -# found in the LICENSE file. -# -# Entry point for running python scripts on UNIX systems. -# -# Automatically generated by `create_entry_points.py`; DO NOT EDIT. 
-# -# To make modifications to this file, edit `tools/run_python.sh` and then run -# `tools/maint/create_entry_points.py` - -# $PYTHON -E does not ignore _PYTHON_SYSCONFIGDATA_NAME, an internal of cpython -# used in cross compilation via setup.py, so we unset it explicitly here. -unset _PYTHON_SYSCONFIGDATA_NAME - -if [ -z "$PYTHON" ]; then - PYTHON=$EMSDK_PYTHON -fi - -if [ -z "$PYTHON" ]; then - PYTHON=$(command -v python3 2> /dev/null) -fi - -if [ -z "$PYTHON" ]; then - PYTHON=$(command -v python 2> /dev/null) -fi - -if [ -z "$PYTHON" ]; then - echo 'unable to find python in $PATH' - exit 1 -fi - -exec "$PYTHON" -E "$(dirname $0)/tools/empath-split.py" "$@" From d20c87bc17274cc687db8a779609d11e942f9c7c Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 16 Sep 2025 01:38:03 +0000 Subject: [PATCH 12/17] Add the scripts to .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 6700e4172e80d..558e31c0b3b46 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ emdump emdwp emmake emnm +empath-split emprofile emranlib emrun @@ -67,6 +68,7 @@ emdump.bat emdwp.bat emmake.bat emnm.bat +empath-split.bat emprofile.bat emranlib.bat emrun.bat From e36b0f59745f9aef43d4fdc9b965fd56eb0b76bc Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 16 Sep 2025 02:36:11 +0000 Subject: [PATCH 13/17] fix --- tools/empath-split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index 6712f1204def6..383e21dcb5c00 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -195,7 +195,7 @@ def main(): paths = list(dict.fromkeys(paths)) # Compute {path: list of functions} map - path_to_funcs = get_path_to_functions_map(wasm, sourcemap, paths, args.verbose) + path_to_funcs = get_path_to_functions_map(wasm, sourcemap, paths) # Write .manifest file f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False) From 
3743aa73f948b175406ea826ca89e209309d212c Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 17 Sep 2025 01:01:06 +0000 Subject: [PATCH 14/17] Address comments --- tools/empath-split.py | 45 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index 383e21dcb5c00..d7f4cb7448769 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -40,12 +40,14 @@ __rootdir__ = os.path.dirname(__scriptdir__) sys.path.insert(0, __rootdir__) +from tools import building from tools import config from tools import diagnostics from tools import emsymbolizer from tools import shared +from tools import utils from tools import webassembly -from tools.utils import exit_with_error, normalize_path +from tools.utils import exit_with_error def parse_args(): @@ -137,7 +139,7 @@ def is_synthesized_func(func): addr -= 1 if loc and loc.source: - func_to_src[func_name] = normalize_path(loc.source) + func_to_src[func_name] = utils.normalize_path(loc.source) else: if not is_synthesized_func(func_name): diagnostics.warn(f"No source file information found in the source map for function '{func_name}'") @@ -169,33 +171,30 @@ def is_synthesized_func(func): def main(): args, forwarded_args = parse_args() - wasm = os.path.expanduser(args.wasm) - paths_file = os.path.expanduser(args.paths_file) - sourcemap = os.path.expanduser(args.sourcemap) if args.sourcemap else None if args.wasm_split: - wasm_split = os.path.expanduser(args.wasm_split) + wasm_split = args.wasm_split else: - wasm_split = os.path.join(config.BINARYEN_ROOT, 'bin', 'wasm-split') - - if not os.path.isfile(wasm): - exit_with_error(f"'{wasm}' was not found or not a file") - if not os.path.isfile(paths_file): - exit_with_error(f"'{paths_file}' was not found or not a file") - if sourcemap: - if not os.path.isfile(sourcemap): - exit_with_error(f"'{sourcemap}' was not found or not a file") + wasm_split = 
os.path.join(building.get_binaryen_bin(), 'wasm-split') + + if not os.path.isfile(args.wasm): + exit_with_error(f"'{args.wasm}' was not found or not a file") + if not os.path.isfile(args.paths_file): + exit_with_error(f"'{args.paths_file}' was not found or not a file") + if args.sourcemap: + if not os.path.isfile(args.sourcemap): + exit_with_error(f"'{args.sourcemap}' was not found or not a file") if not os.path.isfile(wasm_split): exit_with_error(f"'{wasm_split}' was not found or not a file") - with open(paths_file, encoding='utf-8') as f: - paths = [normalize_path(path.strip()) for path in f if path.strip()] - # To make /a/b/c and /a/b/c/ equivalent - paths = [path.rstrip(os.sep) for path in paths] - # Remove duplicates - paths = list(dict.fromkeys(paths)) + paths = utils.read_file(args.paths_file).splitlines() + paths = [utils.normalize_path(path.strip()) for path in paths if path.strip()] + # To make /a/b/c and /a/b/c/ equivalent + paths = [path.rstrip(os.sep) for path in paths] + # Remove duplicates + paths = list(dict.fromkeys(paths)) # Compute {path: list of functions} map - path_to_funcs = get_path_to_functions_map(wasm, sourcemap, paths) + path_to_funcs = get_path_to_functions_map(args.wasm, args.sourcemap, paths) # Write .manifest file f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False) @@ -216,7 +215,7 @@ def main(): f.write('\n') f.close() - cmd = [wasm_split, '--multi-split', wasm, '--manifest', manifest] + cmd = [wasm_split, '--multi-split', args.wasm, '--manifest', manifest] if args.verbose: # This option is used both in this script and wasm-split cmd.append('-v') From f5f246833e8f06589519248b625b2c0811dd38a5 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 17 Sep 2025 01:14:26 +0000 Subject: [PATCH 15/17] Remove try-finally --- tools/empath-split.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index d7f4cb7448769..f12705552caa9 100755 --- 
a/tools/empath-split.py +++ b/tools/empath-split.py @@ -197,8 +197,7 @@ def main(): path_to_funcs = get_path_to_functions_map(args.wasm, args.sourcemap, paths) # Write .manifest file - f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False) - try: + with tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=not args.preserve_manifest) as f: manifest = f.name for i, path in enumerate(paths): f.write(f'{i}\n') @@ -213,7 +212,7 @@ def main(): f.write(func + '\n') if i < len(paths) - 1: f.write('\n') - f.close() + f.flush() cmd = [wasm_split, '--multi-split', args.wasm, '--manifest', manifest] if args.verbose: @@ -223,9 +222,6 @@ def main(): if args.verbose: print('\n' + ' '.join(cmd)) shared.run_process(cmd) - finally: - if not args.preserve_manifest: - os.remove(manifest) if __name__ == '__main__': From be1de651eaad02524ded7fe4f65533a0bb6d3703 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Wed, 17 Sep 2025 02:34:34 +0000 Subject: [PATCH 16/17] Ruff fix --- tools/empath-split.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index f12705552caa9..9b2808dd0b77b 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -41,7 +41,6 @@ sys.path.insert(0, __rootdir__) from tools import building -from tools import config from tools import diagnostics from tools import emsymbolizer from tools import shared From a727aa1866a05e083bedd48b03db3c543bce686a Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Thu, 18 Sep 2025 12:08:19 -0700 Subject: [PATCH 17/17] Update tools/empath-split.py Co-authored-by: Derek Schuff --- tools/empath-split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/empath-split.py b/tools/empath-split.py index 9b2808dd0b77b..218aba735e3d0 100755 --- a/tools/empath-split.py +++ b/tools/empath-split.py @@ -151,7 +151,7 @@ def is_synthesized_func(func): # Visit paths in the reverse sorting order, so that we can process inner paths # first. # e.g.
If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign - # functions contained in /a/b/c to it first and assign the maining functions + # functions contained in /a/b/c to it first and assign the remaining functions # to /a/b. visited_funcs = set() path_to_funcs = {}