emscripten-core · aheejin · Sep 22, 2025 · Sep 3, 2025 · Sep 15, 2025 · Sep 15, 2025
diff --git a/.gitignore b/.gitignore
@@ -46,6 +46,7 @@ emdump
 emdwp
 emmake
 emnm
+empath-split
 emprofile
 emranlib
 emrun
@@ -67,6 +68,7 @@ emdump.bat
 emdwp.bat
 emmake.bat
 emnm.bat
+empath-split.bat
 emprofile.bat
 emranlib.bat
 emrun.bat

diff --git a/test/test_other.py b/test/test_other.py
@@ -57,6 +57,7 @@
 emmake = shared.bat_suffix(path_from_root('emmake'))
 emconfig = shared.bat_suffix(path_from_root('em-config'))
 emsize = shared.bat_suffix(path_from_root('emsize'))
+empath_split = shared.bat_suffix(path_from_root('empath-split'))
 emprofile = shared.bat_suffix(path_from_root('emprofile'))
 emstrip = shared.bat_suffix(path_from_root('emstrip'))
 emsymbolizer = shared.bat_suffix(path_from_root('emsymbolizer'))
@@ -16585,3 +16586,50 @@ def test_create_preloaded_file(self):
         return 0;
       }''')
     self.do_runf('main.c', 'done\n', cflags=['-sFORCE_FILESYSTEM', '--post-js=post.js'])
+
+  def test_empath_split(self):
+    create_file('main.cpp', r'''
+      #include <iostream>
+      void foo();
+      int main() {
+        std::cout << "main" << std::endl;
+        foo();
+        return 0;
+      }
+    ''')
+    create_file('foo.cpp', r'''
+      #include <iostream>
+      void foo() { std::cout << "foo" << std::endl; }
+    ''')
+    create_file('path_list', r'''
+      main.cpp
+      foo.cpp
+      /emsdk/emscripten/system
+      /emsdk/emscripten/system/lib/libc/musl
+      /emsdk/emscripten/system/lib/libcxx
+    ''')
+
+    self.run_process([EMCC, 'main.cpp', 'foo.cpp', '-gsource-map', '-g2', '-o', 'test.js'])
+    self.run_process([empath_split, 'test.wasm', 'path_list', '-g', '-o', 'test_primary.wasm', '--out-prefix=test_'])
+
+    # Check that each function is defined in the module of the path it belongs
+    # to; test_<i>.wasm is the module for the i-th line of path_list. When one
+    # path contains another, the inner path should take its functions first, and
+    # the rest is split with the outer path. Function names are regex fragments.
+    expected = {
+      'test_0.wasm': ['__original_main'],  # main.cpp
+      'test_1.wasm': [r'foo\\28\\29'],  # foo.cpp
+      'test_2.wasm': ['__abort_message', 'pthread_cond_wait'],  # /emsdk/emscripten/system
+      'test_3.wasm': ['strcmp'],  # /emsdk/emscripten/system/lib/libc/musl
+      'test_4.wasm': [r'std::__2::ios_base::getloc\\28\\29\\20const',  # /emsdk/emscripten/system/lib/libcxx
+                      r'std::uncaught_exceptions\\28\\29'],
+    }
+    for module, funcs in expected.items():
+      # Disassemble each module only once and search the text for all its functions
+      self.run_process([common.WASM_DIS, module, '-o', 'test.wast'])
+      with open('test.wast') as f:
+        wat = f.read()
+      for func in funcs:
+        pattern = r'^\s*\(\s*func\s+\$' + func + r'[\s\(\)]'
+        self.assertTrue(re.search(pattern, wat, flags=re.MULTILINE),
+                        f'{func} is not defined in {module}')
diff --git a/tools/empath-split.py b/tools/empath-split.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+# Copyright 2025 The Emscripten Authors.  All rights reserved.
+# Emscripten is available under two separate licenses, the MIT license and the
+# University of Illinois/NCSA Open Source License.  Both these licenses can be
+# found in the LICENSE file.
+
+"""
+Wrapper for 'wasm-split --multi-split' functionality. This script generates a
+.manifest file based on the list of user source paths, using source map
+information.
+
+This assumes the name section exists in the input wasm file, and also assumes
+the sourceMappingURL section exists in the input or a source map file is
+separately supplied with --sourcemap. If we have two files a.c and b.c, to
+generate a source map and the name section, if you compile and link within a
+single command, you can do something like
+$ emcc -g2 -gsource-map a.c b.c -o result.js
+If you want to compile and link in separate commands, you can do
+$ emcc -gsource-map a.c -o a.o
+$ emcc -gsource-map b.c -o b.o
+$ emcc -g2 -gsource-map a.o b.o -o result.js
+See https://emscripten.org/docs/porting/Debugging.html for more details.
+
+This takes a wasm file and a paths file, which is a text file containing a list
+of paths as inputs. The paths file should contain a single path per line. A
+single split module will be generated per specified path. If a specified path
+contains another specified path, functions contained in the inner path will be
+split as the inner path's module, and the rest of the functions will be split as
+the outer path's module. Functions that do not belong to any of the specified
+paths will remain in the primary module.
+"""
+
+import argparse
+import os
+import sys
+import tempfile
+from pathlib import PurePath
+
+__scriptdir__ = os.path.dirname(os.path.abspath(__file__))
+__rootdir__ = os.path.dirname(__scriptdir__)
+sys.path.insert(0, __rootdir__)
+
+from tools import config
+from tools import diagnostics
+from tools import emsymbolizer
+from tools import shared
+from tools import webassembly
+from tools.utils import exit_with_error, normalize_path
+
+
+def parse_args():
+  """Parse argv into (args, forwarded_args); unknown options are forwarded to wasm-split."""
+  parser = argparse.ArgumentParser(
+      description='Split a wasm file based on user paths',
+      epilog="""
+This is a wrapper for 'wasm-split --multi-split' functionality, so you should
+add wasm-split's command line options as well. You should or may want to add
+wasm-split options like -o (--output), --out-prefix, -g, and feature
+enabling/disabling options. Run 'wasm-split -h' for the list of options. But you
+should NOT add --manifest, because this will be generated from this script.
+""")
+  parser.add_argument('wasm', help='Path to the input wasm file')
+  parser.add_argument('paths_file', help='Path to the input file containing paths')
+  parser.add_argument('-s', '--sourcemap', help='Force source map file')
+  parser.add_argument('-v', '--verbose', action='store_true',
+                      help='Print verbose info for debugging this script')
+  parser.add_argument('--wasm-split', help='Path to wasm-split executable')
+  parser.add_argument('--preserve-manifest', action='store_true',
+                      help='Preserve generated manifest file. This sets --verbose too.')
+  args, forwarded_args = parser.parse_known_args()
+  if any(arg.split('=', 1)[0] == '--manifest' for arg in forwarded_args):  # also catches --manifest=FILE
+    exit_with_error('manifest file will be generated by this script and should not be given')
+  args.verbose = args.verbose or args.preserve_manifest  # --preserve-manifest implies --verbose
+  return args, forwarded_args
+
+
+def get_path_to_functions_map(wasm, sourcemap, paths):
+  """Return a {path: [function names]} map built from source map information.
+
+  Each function goes to the innermost path in `paths` that contains its source
+  file. Functions whose source file matches no path are left out of the map.
+  """
+  def is_synthesized_func(func):
+    # Names for which no "missing source info" warning is given. TODO There can be more
+    synthesized_names = (
+      'main',
+      '__wasm_call_ctors',
+      '__clang_call_terminate',
+    )
+    synthesized_prefixes = (
+      'legalstub$',
+      'legalfunc$',
+      '__cxx_global_',
+      '_GLOBAL__',
+      'virtual thunk to ',
+    )
+    return func in synthesized_names or func.startswith(synthesized_prefixes)
+
+  # Compute {func_name: src file} map, and invert it to get
+  # {src file: list of functions} map, and construct {path: list of functions}
+  # map from it
+  with webassembly.Module(wasm) as module:
+    if not module.has_name_section():
+      exit_with_error('Name section does not exist')
+    if not sourcemap:
+      if not emsymbolizer.get_sourceMappingURL_section(module):
+        exit_with_error('sourceMappingURL section does not exist')
+
+    funcs = module.get_functions()
+    func_names = module.get_function_names()
+    assert len(funcs) == len(func_names)
+
+    func_to_src = {}
+    src_to_funcs = {}
+
+    if not sourcemap:
+      sourcemap = module.get_sourceMappingURL()
+    sm = emsymbolizer.WasmSourceMap()
+    sm.parse(sourcemap)
+
+    for func_name, func in zip(func_names, funcs):
+      # From the last address, decrement the address by 1 until we find location
+      # info with source file information. The reason we do this is to reduce
+      # the probability of picking an address where another function is inlined
+      # into, picking the inlined function's source.
+      # We start from the end because it is simpler; it is harder to compute the
+      # first instruction's address, because there is a gap for local types
+      # between function offset and the first instruction.
+      addr = func.offset + func.size - 1
+      # Reset for every function so that a location found for the previous
+      # function is never reused if the loop below does not run.
+      loc = None
+      while addr > func.offset:
+        loc = sm.lookup(addr, func.offset)
+        # Stop if there are no source map mappings for the entire function
+        # (because we give func.offset as a lower bound), or if a location
+        # with source file information is found. If not, continue the search.
+        if not loc or loc.source:
+          break
+        addr -= 1
+
+      if loc and loc.source:
+        func_to_src[func_name] = normalize_path(loc.source)
+      elif not is_synthesized_func(func_name):
+        diagnostics.warn(f"No source file information found in the source map for function '{func_name}'")
+
+    for func_name, src in func_to_src.items():
+      src_to_funcs.setdefault(src, []).append(func_name)
+
+  # Visit paths in the reverse sorting order, so that we can process inner paths
+  # first.
+  # e.g. If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign
+  # functions contained in /a/b/c to it first and assign the remaining functions
+  # to /a/b.
+  visited_funcs = set()
+  path_to_funcs = {}
+  parsed_srcs = [(PurePath(src), funcs) for src, funcs in src_to_funcs.items()]
+  for path in sorted(paths, reverse=True):
+    ppath = PurePath(path)
+    path_to_funcs[path] = []
+    for psrc, funcs in parsed_srcs:
+      if ppath == psrc or ppath in psrc.parents:
+        for func in funcs:
+          if func not in visited_funcs:
+            visited_funcs.add(func)
+            path_to_funcs[path].append(func)
+  return path_to_funcs
+
+
+def main():
+  """Generate a manifest from the paths file and run 'wasm-split --multi-split' with it."""
+  args, forwarded_args = parse_args()
+  wasm = os.path.expanduser(args.wasm)
+  paths_file = os.path.expanduser(args.paths_file)
+  sourcemap = os.path.expanduser(args.sourcemap) if args.sourcemap else None
+  if args.wasm_split:
+    wasm_split = os.path.expanduser(args.wasm_split)
+  else:
+    wasm_split = os.path.join(config.BINARYEN_ROOT, 'bin', 'wasm-split')
+
+  # Validate every input up front; sourcemap is None unless --sourcemap was given
+  for required in (wasm, paths_file, sourcemap, wasm_split):
+    if required is not None and not os.path.isfile(required):
+      exit_with_error(f"'{required}' was not found or not a file")
+
+  with open(paths_file, encoding='utf-8') as f:
+    paths = [normalize_path(line.strip()) for line in f if line.strip()]
+  # To make /a/b/c and /a/b/c/ equivalent. Strip '/' as well as os.sep because
+  # normalize_path may already have turned the separators into '/', and keep a
+  # path made only of separators (e.g. the root) instead of emptying it.
+  paths = [path.rstrip('/' + os.sep) or path for path in paths]
+  # Remove duplicates, preserving the order given in the paths file
+  paths = list(dict.fromkeys(paths))
+
+  # Compute {path: list of functions} map
+  path_to_funcs = get_path_to_functions_map(wasm, sourcemap, paths)
+
+  # Write .manifest file: for each path, a line with the path's index followed
+  # by one line per function, with a blank line between consecutive paths.
+  manifest = None
+  try:
+    with tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', delete=False) as f:
+      manifest = f.name
+      for i, path in enumerate(paths):
+        funcs = path_to_funcs[path]
+        if not funcs:
+          diagnostics.warn(f'{path} does not match any functions')
+        if args.verbose:
+          print(path)
+          for func in funcs:
+            print('  ' + func)
+          print()
+        if i > 0:
+          f.write('\n')
+        f.write(f'{i}\n')
+        f.writelines(func + '\n' for func in funcs)
+
+    cmd = [wasm_split, '--multi-split', wasm, '--manifest', manifest]
+    if args.verbose:
+      # This option is used both in this script and wasm-split
+      cmd.append('-v')
+    cmd += forwarded_args
+    if args.verbose:
+      print('\n' + ' '.join(cmd))
+    shared.run_process(cmd)
+  finally:
+    # manifest is still None if creating the temporary file itself failed
+    if manifest is not None and not args.preserve_manifest:
+      os.remove(manifest)
+
+
+if __name__ == '__main__':
+  sys.exit(main())  # main() has no return value, so a normal run exits with status 0
diff --git a/tools/emsymbolizer.py b/tools/emsymbolizer.py
@@ -178,7 +178,7 @@ def decodeVLQ(string):
       self.offsets.append(offset)
     self.offsets.sort()
 
-  def find_offset(self, offset):
+  def find_offset(self, offset, lower_bound=None):
     # Find the largest mapped offset <= the search offset
     lo = 0
     hi = len(self.offsets)
@@ -189,11 +189,22 @@ def find_offset(self, offset):
         hi = mid
       else:
         lo = mid + 1
-    return self.offsets[lo - 1]
+    if lo == 0:
+      return None
+    # If lower bound is given, return the offset only if the offset is equal to
+    # or greater than the lower bound
+    if lower_bound:
+      if self.offsets[lo - 1] >= lower_bound:
+        return self.offsets[lo - 1]
+      else:
+        return None
+    else:
+      return self.offsets[lo - 1]
 
-  def lookup(self, offset):
-    nearest = self.find_offset(offset)
-    assert nearest in self.mappings, 'Sourcemap has an offset with no mapping'
+  def lookup(self, offset, lower_bound=None):
+    nearest = self.find_offset(offset, lower_bound)
+    if not nearest:
+      return None
     info = self.mappings[nearest]
     return LocationInfo(
         self.sources[info.source] if info.source is not None else None,
@@ -206,12 +217,8 @@ def symbolize_address_sourcemap(module, address, force_file):
   URL = force_file
   if not URL:
     # If a sourcemap file is not forced, read it from the wasm module
-    section = get_sourceMappingURL_section(module)
-    assert section
-    module.seek(section.offset)
-    assert module.read_string() == 'sourceMappingURL'
     # TODO: support stripping/replacing a prefix from the URL
-    URL = module.read_string()
+    URL = module.get_sourceMappingURL()
 
   if shared.DEBUG:
     print(f'Source Mapping URL: {URL}')

diff --git a/tools/maint/create_entry_points.py b/tools/maint/create_entry_points.py
@@ -43,6 +43,7 @@
 emstrip
 emsymbolizer
 emscan-deps
+empath-split
 tools/file_packager
 tools/webidl_binder
 test/runner
@@ -56,6 +57,7 @@
   'emdwp': 'tools/emdwp',
   'emnm': 'tools/emnm',
   'emsymbolizer': 'tools/emsymbolizer',
+  'empath-split': 'tools/empath-split',
 }