- 
                Notifications
    You must be signed in to change notification settings 
- Fork 3.4k
Add script to split module based on source paths #25278
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
54cc4b3
              ea300b2
              355fc60
              e6d39d6
              1246d2e
              9cb04b7
              4440ebf
              829e697
              a475a24
              757253d
              272885c
              7a99839
              e47d6a4
              d20c87b
              e36b0f5
              3743aa7
              f5f2468
              be1de65
              60c39d2
              a727aa1
              0bce350
              7957deb
              29b8a17
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,233 @@ | ||
| #!/usr/bin/env python3 | ||
| # Copyright 2025 The Emscripten Authors. All rights reserved. | ||
| # Emscripten is available under two separate licenses, the MIT license and the | ||
| # University of Illinois/NCSA Open Source License. Both these licenses can be | ||
| # found in the LICENSE file. | ||
|  | ||
| """ | ||
| Wrapper for 'wasm-split --multi-split' functionality. This script generates a | ||
| .manifest file based on the list of user source paths, using source map | ||
| information. | ||
| This assumes the name section exists in the input wasm file, and also assumes | ||
| the sourceMappingURL section exists in the input or a source map file is | ||
| separately supplied with --sourcemap. If we have two files a.c and b.c, to | ||
| generate a source map and the name section, if you compile and link within a | ||
| single command, you can do something like | ||
| $ emcc -g2 -gsource-map a.c b.c -o result.js | ||
| If you want to compile and link in separate commands, you can do | ||
| $ emcc -gsource-map a.c -o a.o | ||
| $ emcc -gsource-map b.c -o b.o | ||
| $ emcc -g2 -gsource-map a.o b.o -o result.js | ||
| See https://emscripten.org/docs/porting/Debugging.html for more details. | ||
| This takes a wasm file and a paths file, which is a text file containing a list | ||
| of paths as inputs. The paths file should contain a single path per line. A | ||
| single split module will be generated per specified path. If a specified path | ||
| contains another specified path, functions contained in the inner path will be | ||
| split as the inner path's module, and the rest of the functions will be split as | ||
| the outer path's module. Functions that do not belong to any of the specified | ||
| paths will remain in the primary module. | ||
| """ | ||
|  | ||
| import argparse | ||
| import os | ||
| import sys | ||
| import tempfile | ||
| from pathlib import PurePath | ||
|  | ||
| __scriptdir__ = os.path.dirname(os.path.abspath(__file__)) | ||
| __rootdir__ = os.path.dirname(__scriptdir__) | ||
| sys.path.insert(0, __rootdir__) | ||
|  | ||
| from tools import config | ||
| from tools import diagnostics | ||
| from tools import emsymbolizer | ||
| from tools import shared | ||
| from tools import webassembly | ||
| from tools.utils import exit_with_error, normalize_path | ||
|  | ||
|  | ||
def parse_args():
  """Parse this script's own options and collect the rest for wasm-split.

  Options that this script does not recognize are not treated as errors; they
  are returned separately so they can be forwarded to wasm-split unchanged.

  Returns:
    A tuple `(args, forwarded_args)`, where `args` is the argparse namespace
    holding this script's options and `forwarded_args` is the list of
    unrecognized command line arguments.
  """
  parser = argparse.ArgumentParser(
    description='Split a wasm file based on user paths',
    epilog="""
This is a wrapper for 'wasm-split --multi-split' functionality, so you should
add wasm-split's command line options as well. You should or may want to add
wasm-split options like -o (--output), --out-prefix, -g, and feature
enabling/disabling options. Run 'wasm-split -h' for the list of options. But you
should NOT add --manifest, because this will be generated from this script.
""")
  parser.add_argument('wasm', help='Path to the input wasm file')
  parser.add_argument('paths_file', help='Path to the input file containing paths')
  parser.add_argument('-s', '--sourcemap', help='Force source map file')
  parser.add_argument('-v', '--verbose', action='store_true',
                      help='Print verbose info for debugging this script')
  parser.add_argument('--wasm-split', help='Path to wasm-split executable')
  parser.add_argument('--preserve-manifest', action='store_true',
                      help='Preserve generated manifest file. This sets --verbose too.')
  args, forwarded_args = parser.parse_known_args()

  # The manifest is always generated by this script. Reject both the
  # '--manifest FILE' and the '--manifest=FILE' spellings; the latter arrives
  # here as a single token, so an exact-match test alone would miss it.
  if any(arg == '--manifest' or arg.startswith('--manifest=')
         for arg in forwarded_args):
    exit_with_error('manifest file will be generated by this script and should not be given')

  # A preserved manifest is only useful together with the verbose output.
  if args.preserve_manifest:
    args.verbose = True
  return args, forwarded_args
|  | ||
|  | ||
def get_path_to_functions_map(wasm, sourcemap, paths):
  """Assign the functions of a wasm module to the user-specified paths.

  Each function is attributed to a source file by looking up addresses inside
  its body in the source map. A function is then assigned to the innermost
  entry of `paths` that equals its source file or is one of its parent
  directories.

  Args:
    wasm: Path to the input wasm file. The module must have a name section.
    sourcemap: Path to a source map file. If falsy, the location stored in the
      module's sourceMappingURL section is used instead.
    paths: The paths to split by.

  Returns:
    A dict mapping every entry of `paths` to the list of function names
    assigned to it. Functions that match no path do not appear in any list.
  """
  def is_synthesized_func(func):
    # Functions matched here do not trigger the "no source file information"
    # warning below.
    # TODO There can be more
    synthesized_names = (
      'main',
      '__wasm_call_ctors',
      '__clang_call_terminate',
    )
    synthesized_prefixes = (
      'legalstub$',
      'legalfunc$',
      '__cxx_global_',
      '_GLOBAL__',
      'virtual thunk to ',
    )
    return func in synthesized_names or func.startswith(synthesized_prefixes)

  # Compute {func_name: src file} map, and invert it to get
  # {src file: list of functions} map, and construct {path: list of functions}
  # map from it
  with webassembly.Module(wasm) as module:
    if not module.has_name_section():
      exit_with_error('Name section does not exist')
    if not sourcemap and not emsymbolizer.get_sourceMappingURL_section(module):
      exit_with_error('sourceMappingURL section does not exist')

    funcs = module.get_functions()
    func_names = module.get_function_names()
    assert len(funcs) == len(func_names)

    if not sourcemap:
      sourcemap = module.get_sourceMappingURL()
    sm = emsymbolizer.WasmSourceMap()
    sm.parse(sourcemap)

    func_to_src = {}
    for func_name, func in zip(func_names, funcs):
      # From the last address, decrement the address by 1 until we find location
      # info with source file information. The reason we do this is to reduce
      # the probability of picking an address where another function is inlined
      # into, picking the inlined function's source.
      # We start from the end because it is simpler; it is harder to compute the
      # first instruction's address, because there is a gap for local types
      # between function offset and the first instruction.
      #
      # Reset 'loc' for every function so that a scan that never runs cannot
      # reuse the previous function's location.
      loc = None
      addr = func.offset + func.size - 1
      while addr > func.offset:
        loc = sm.lookup(addr, func.offset)
        # A missing 'loc' means there are no source map mappings for the entire
        # function (because we give func.offset as a lower bound), so stop.
        # Otherwise stop only once the location carries source file
        # information; if it does not, continue the search.
        if not loc or loc.source:
          break
        addr -= 1

      if loc and loc.source:
        func_to_src[func_name] = normalize_path(loc.source)
      elif not is_synthesized_func(func_name):
        diagnostics.warn(f"No source file information found in the source map for function '{func_name}'")

  src_to_funcs = {}
  for func_name, src in func_to_src.items():
    src_to_funcs.setdefault(src, []).append(func_name)

  # Parse each source file path once instead of once per user path.
  parsed_srcs = [(PurePath(src), names) for src, names in src_to_funcs.items()]

  # Visit paths in the reverse sorting order, so that we can process inner paths
  # first.
  # e.g. If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign
  # functions contained in /a/b/c to it first and assign the remaining functions
  # to /a/b.
  visited_funcs = set()
  path_to_funcs = {}
  for path in sorted(paths, reverse=True):
    ppath = PurePath(path)
    assigned = path_to_funcs[path] = []
    for psrc, names in parsed_srcs:
      if ppath != psrc and ppath not in psrc.parents:
        continue
      for name in names:
        if name not in visited_funcs:
          visited_funcs.add(name)
          assigned.append(name)
  return path_to_funcs
|  | ||
|  | ||
def main():
  """Generate a wasm-split manifest from the paths file and run wasm-split.

  Reads the list of paths from the paths file, computes which functions belong
  to each path, writes them to a temporary .manifest file (one numbered module
  per path, modules separated by a blank line), and invokes
  'wasm-split --multi-split' with that manifest plus any forwarded options.
  The manifest is deleted afterwards unless --preserve-manifest was given.
  """
  args, forwarded_args = parse_args()
  wasm = os.path.expanduser(args.wasm)
  paths_file = os.path.expanduser(args.paths_file)
  sourcemap = os.path.expanduser(args.sourcemap) if args.sourcemap else None
  if args.wasm_split:
    wasm_split = os.path.expanduser(args.wasm_split)
  else:
    wasm_split = os.path.join(config.BINARYEN_ROOT, 'bin', 'wasm-split')

  # The source map is optional; every input that was given must be a file.
  for required in (wasm, paths_file, sourcemap, wasm_split):
    if required and not os.path.isfile(required):
      exit_with_error(f"'{required}' was not found or not a file")

  with open(paths_file, encoding='utf-8') as f:
    paths = [normalize_path(path.strip()) for path in f if path.strip()]
    # To make /a/b/c and /a/b/c/ equivalent. A path consisting only of
    # separators (the root) is kept as is instead of becoming an empty string.
    paths = [path.rstrip(os.sep) or path for path in paths]
    # Remove duplicates
    paths = list(dict.fromkeys(paths))

  # Compute {path: list of functions} map
  path_to_funcs = get_path_to_functions_map(wasm, sourcemap, paths)

  # Write .manifest file. delete=False because wasm-split has to open the file
  # by name after we close it; removal is handled in the 'finally' below.
  f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False)
  manifest = f.name
  try:
    # The 'with' guarantees the handle is closed before wasm-split runs and
    # before the file is removed, even if writing fails part-way.
    with f:
      for i, path in enumerate(paths):
        module_funcs = path_to_funcs[path]
        f.write(f'{i}\n')
        if not module_funcs:
          diagnostics.warn(f'{path} does not match any functions')
        if args.verbose:
          print(path)
          for func in module_funcs:
            print(' ' + func)
          print()
        for func in module_funcs:
          f.write(func + '\n')
        if i < len(paths) - 1:
          f.write('\n')

    cmd = [wasm_split, '--multi-split', wasm, '--manifest', manifest]
    if args.verbose:
      # This option is used both in this script and wasm-split
      cmd.append('-v')
    cmd += forwarded_args
    if args.verbose:
      print('\n' + ' '.join(cmd))
    shared.run_process(cmd)
  finally:
    if not args.preserve_manifest:
      os.remove(manifest)


if __name__ == '__main__':
  sys.exit(main())
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|  | @@ -178,7 +178,7 @@ def decodeVLQ(string): | |||
| self.offsets.append(offset) | ||||
| self.offsets.sort() | ||||
|  | ||||
| def find_offset(self, offset): | ||||
| def find_offset(self, offset, lower_bound=None): | ||||
| # Find the largest mapped offset <= the search offset | ||||
| lo = 0 | ||||
| hi = len(self.offsets) | ||||
|  | @@ -189,11 +189,22 @@ def find_offset(self, offset): | |||
| hi = mid | ||||
| else: | ||||
| lo = mid + 1 | ||||
| return self.offsets[lo - 1] | ||||
| if lo == 0: | ||||
| return None | ||||
| # If lower bound is given, return the offset only if the offset is equal to | ||||
| # or greather than the lower bound | ||||
| if lower_bound: | ||||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given that there's only one caller of this (and of  There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another place is here: emscripten/tools/emsymbolizer.py Line 226 in eb6fcad 
 What do we give for lower_bound? It doesn't have the current function offset.There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh yeah, ok. | ||||
| if self.offsets[lo - 1] >= lower_bound: | ||||
| return self.offsets[lo - 1] | ||||
| else: | ||||
| return None | ||||
| else: | ||||
| return self.offsets[lo - 1] | ||||
|  | ||||
| def lookup(self, offset): | ||||
| nearest = self.find_offset(offset) | ||||
| assert nearest in self.mappings, 'Sourcemap has an offset with no mapping' | ||||
| def lookup(self, offset, lower_bound=None): | ||||
| nearest = self.find_offset(offset, lower_bound) | ||||
| if not nearest: | ||||
| return None | ||||
| info = self.mappings[nearest] | ||||
| return LocationInfo( | ||||
| self.sources[info.source] if info.source is not None else None, | ||||
|  | @@ -206,12 +217,8 @@ def symbolize_address_sourcemap(module, address, force_file): | |||
| URL = force_file | ||||
| if not URL: | ||||
| # If a sourcemap file is not forced, read it from the wasm module | ||||
| section = get_sourceMappingURL_section(module) | ||||
| assert section | ||||
| module.seek(section.offset) | ||||
| assert module.read_string() == 'sourceMappingURL' | ||||
| # TODO: support stripping/replacing a prefix from the URL | ||||
| URL = module.read_string() | ||||
| URL = module.get_sourceMappingURL() | ||||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need to add to this PR if things are working for you, but last time I tried to actually use emsymbolizer, I had to add something like probably because I was using relative paths everywhere. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't change anything for emsymbolizer (I just moved sourceMappingURL-getting code from  There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right I had to add it locally (I added it right here in emsymbolizer because that's where the code was until now). Again, this was just an FYI. Maybe I'll just try to reproduce the behavior and add a proper test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, emsymbolizer has worked for me with no change so far.. Yeah please let me know if you find the condition in which it becomes a problem. | ||||
|  | ||||
| if shared.DEBUG: | ||||
| print(f'Source Mapping URL: {URL}') | ||||
|  | ||||
Uh oh!
There was an error while loading. Please reload this page.