Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ emdump
emdwp
emmake
emnm
empath-split
emprofile
emranlib
emrun
Expand All @@ -67,6 +68,7 @@ emdump.bat
emdwp.bat
emmake.bat
emnm.bat
empath-split.bat
emprofile.bat
emranlib.bat
emrun.bat
Expand Down
48 changes: 48 additions & 0 deletions test/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
emmake = shared.bat_suffix(path_from_root('emmake'))
emconfig = shared.bat_suffix(path_from_root('em-config'))
emsize = shared.bat_suffix(path_from_root('emsize'))
empath_split = shared.bat_suffix(path_from_root('empath-split'))
emprofile = shared.bat_suffix(path_from_root('emprofile'))
emstrip = shared.bat_suffix(path_from_root('emstrip'))
emsymbolizer = shared.bat_suffix(path_from_root('emsymbolizer'))
Expand Down Expand Up @@ -16585,3 +16586,50 @@ def test_create_preloaded_file(self):
return 0;
}''')
self.do_runf('main.c', 'done\n', cflags=['-sFORCE_FILESYSTEM', '--post-js=post.js'])

def test_empath_split(self):
  # Build a two-file program with a source map and a name section, split it
  # with empath-split using a paths file, and verify where functions landed.
  create_file('main.cpp', r'''
#include <iostream>
void foo();
int main() {
std::cout << "main" << std::endl;
foo();
return 0;
}
''')
  create_file('foo.cpp', r'''
#include <iostream>
void foo() { std::cout << "foo" << std::endl; }
''')
  create_file('path_list', r'''
main.cpp
foo.cpp
/emsdk/emscripten/system
/emsdk/emscripten/system/lib/libc/musl
/emsdk/emscripten/system/lib/libcxx
''')

  self.run_process([EMCC, 'main.cpp', 'foo.cpp', '-gsource-map', '-g2', '-o', 'test.js'])
  self.run_process([empath_split, 'test.wasm', 'path_list', '-g', '-o', 'test_primary.wasm', '--out-prefix=test_'])

  def defines_function(wasm_file, func_regex):
    # Disassemble the module and look for a '(func $<name>' definition.
    # `func_regex` is spliced into the pattern as-is, so callers pass it
    # already regex-escaped.
    self.run_process([common.WASM_DIS, wasm_file, '-o', 'test.wast'])
    with open('test.wast') as wast:
      text = wast.read()
    found = re.search(r'^\s*\(\s*func\s+\$' + func_regex + r'[\s\(\)]', text, flags=re.MULTILINE)
    return found is not None

  # Functions should be assigned to and split by the specified paths. When one
  # path contains another, the inner path takes its functions first and the
  # outer path gets whatever is left.
  expectations = [
    # main.cpp
    ('test_0.wasm', '__original_main'),
    # foo.cpp
    ('test_1.wasm', r'foo\\28\\29'),
    # /emsdk/emscripten/system
    ('test_2.wasm', '__abort_message'),
    ('test_2.wasm', 'pthread_cond_wait'),
    # /emsdk/emscripten/system/lib/libc/musl
    ('test_3.wasm', 'strcmp'),
    # /emsdk/emscripten/system/lib/libcxx
    ('test_4.wasm', r'std::__2::ios_base::getloc\\28\\29\\20const'),
    ('test_4.wasm', r'std::uncaught_exceptions\\28\\29'),
  ]
  for split_module, func_regex in expectations:
    self.assertTrue(defines_function(split_module, func_regex))
233 changes: 233 additions & 0 deletions tools/empath-split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
#!/usr/bin/env python3
# Copyright 2025 The Emscripten Authors. All rights reserved.
# Emscripten is available under two separate licenses, the MIT license and the
# University of Illinois/NCSA Open Source License. Both these licenses can be
# found in the LICENSE file.

"""
Wrapper for 'wasm-split --multi-split' functionality. This script generates a
.manifest file based on the list of user source paths, using source map
information.
This assumes the name section exists in the input wasm file, and also assumes
the sourceMappingURL section exists in the input or a source map file is
separately supplied with --sourcemap. If we have two files a.c and b.c, to
generate a source map and the name section, if you compile and link within a
single command, you can do something like
$ emcc -g2 -gsource-map a.c b.c -o result.js
If you want to compile and link in separate commands, you can do
$ emcc -gsource-map a.c -o a.o
$ emcc -gsource-map b.c -o b.o
$ emcc -g2 -gsource-map a.o b.o -o result.js
See https://emscripten.org/docs/porting/Debugging.html for more details.
This takes a wasm file and a paths file, which is a text file containing a list
of paths as inputs. The paths file should contain a single path per line. A
single split module will be generated per specified path. If a specified path
contains another specified path, functions contained in the inner path will be
split as the inner path's module, and the rest of the functions will be split as
the outer path's module. Functions that do not belong to any of the specified
paths will remain in the primary module.
"""

import argparse
import os
import sys
import tempfile
from pathlib import PurePath

__scriptdir__ = os.path.dirname(os.path.abspath(__file__))
__rootdir__ = os.path.dirname(__scriptdir__)
sys.path.insert(0, __rootdir__)

from tools import config
from tools import diagnostics
from tools import emsymbolizer
from tools import shared
from tools import webassembly
from tools.utils import exit_with_error, normalize_path


def parse_args():
  """Parse the command line.

  Returns an (args, forwarded_args) tuple. `args` holds the options this
  script understands; `forwarded_args` is the list of every argument argparse
  did not recognize, which the caller passes through to wasm-split.

  Exits with an error if the user supplies --manifest in either the
  '--manifest FILE' or the '--manifest=FILE' form, because the manifest is
  generated by this script.
  """
  parser = argparse.ArgumentParser(
    description='Split a wasm file based on user paths',
    epilog="""
This is a wrapper for 'wasm-split --multi-split' functionality, so you should
add wasm-split's command line options as well. You should or may want to add
wasm-split options like -o (--output), --out-prefix, -g, and feature
enabling/disabling options. Run 'wasm-split -h' for the list of options. But you
should NOT add --manifest, because this will be generated from this script.
""")
  parser.add_argument('wasm', help='Path to the input wasm file')
  parser.add_argument('paths_file', help='Path to the input file containing paths')
  parser.add_argument('-s', '--sourcemap', help='Force source map file')
  parser.add_argument('-v', '--verbose', action='store_true',
                      help='Print verbose info for debugging this script')
  parser.add_argument('--wasm-split', help='Path to wasm-split executable')
  parser.add_argument('--preserve-manifest', action='store_true',
                      help='Preserve generated manifest file. This sets --verbose too.')
  args, forwarded_args = parser.parse_known_args()
  # Options are forwarded verbatim, so '--manifest=FILE' must be rejected as
  # well as the bare '--manifest' token.
  if any(arg == '--manifest' or arg.startswith('--manifest=') for arg in forwarded_args):
    exit_with_error('manifest file will be generated by this script and should not be given')
  if args.preserve_manifest:
    # A preserved manifest is only useful if its path is printed, which
    # happens in verbose mode.
    args.verbose = True
  return args, forwarded_args


def get_path_to_functions_map(wasm, sourcemap, paths):
  """Compute a {path: list of function names} map for the given paths.

  Args:
    wasm: Path to the input wasm file. It must contain a name section.
    sourcemap: Path to a source map file, or a falsy value to use the URL
      recorded in the wasm file's sourceMappingURL section.
    paths: List of user-specified paths.

  Returns:
    A dict with one entry per element of `paths` (the list may be empty).
    Each function is assigned to at most one path; when specified paths are
    nested, the innermost matching path gets the function.

  Exits with an error if the name section is missing, or if no source map is
  given and the wasm file has no sourceMappingURL section.
  """
  def is_synthesized_func(func):
    # Functions for which missing source info is expected, so no warning is
    # emitted for them.
    # TODO There can be more
    synthesized_names = [
      'main',
      '__wasm_call_ctors',
      '__clang_call_terminate',
    ]
    synthesized_prefixes = [
      'legalstub$',
      'legalfunc$',
      '__cxx_global_',
      '_GLOBAL__',
      'virtual thunk to ',
    ]
    if func in synthesized_names:
      return True
    return func.startswith(tuple(synthesized_prefixes))

  # Compute {func_name: src file} map, and invert it to get
  # {src file: list of functions} map, and construct {path: list of functions}
  # map from it
  with webassembly.Module(wasm) as module:
    if not module.has_name_section():
      exit_with_error('Name section does not exist')
    if not sourcemap:
      if not emsymbolizer.get_sourceMappingURL_section(module):
        exit_with_error('sourceMappingURL section does not exist')

    funcs = module.get_functions()
    func_names = module.get_function_names()
    assert len(funcs) == len(func_names)

    func_to_src = {}
    src_to_funcs = {}

    if not sourcemap:
      sourcemap = module.get_sourceMappingURL()
    sm = emsymbolizer.WasmSourceMap()
    sm.parse(sourcemap)

    for func_name, func in zip(func_names, funcs):
      # From the last address, decrement the address by 1 until we find location
      # info with source file information. The reason we do this is to reduce
      # the probability of picking an address where another function is inlined
      # into, picking the inlined function's source.
      # We start from the end because it is simpler; it is harder to compute the
      # first instruction's address, because there is a gap for local types
      # between function offset and the first instruction.
      #
      # Reset per function: if the loop below does not run at all, 'loc' must
      # not be unbound or carry over the previous function's location.
      loc = None
      addr = func.offset + func.size - 1
      while addr > func.offset:
        loc = sm.lookup(addr, func.offset)
        # This means there is no source map mappings for the entire function
        # (because we give func.offset as a lower bound). Exit the loop.
        if not loc:
          break
        # Exit the loop only if a location info with source file information is
        # found. If not, continue the search.
        if loc.source:
          break
        addr -= 1

      if loc and loc.source:
        func_to_src[func_name] = normalize_path(loc.source)
      elif not is_synthesized_func(func_name):
        diagnostics.warn(f"No source file information found in the source map for function '{func_name}'")

  for func_name, src in func_to_src.items():
    src_to_funcs.setdefault(src, []).append(func_name)

  # Visit paths in the reverse sorting order, so that we can process inner paths
  # first.
  # e.g. If we have /a/b and /a/b/c, /a/b/c will come first, so we can assign
  # functions contained in /a/b/c to it first and assign the remaining functions
  # to /a/b.
  visited_funcs = set()
  path_to_funcs = {}
  for path in sorted(paths, reverse=True):
    ppath = PurePath(path)
    path_to_funcs[path] = []
    for src, funcs in src_to_funcs.items():
      psrc = PurePath(src)
      # A source file matches if it is the path itself or lies underneath it.
      if ppath == psrc or ppath in psrc.parents:
        for func in funcs:
          if func not in visited_funcs:
            visited_funcs.add(func)
            path_to_funcs[path].append(func)
  return path_to_funcs


def main():
  """Generate a wasm-split manifest from the paths file and run wasm-split.

  Reads the list of paths, maps each path to the functions whose source file
  lies under it, writes a temporary .manifest file, and invokes
  'wasm-split --multi-split' with it plus all forwarded arguments.

  In the manifest, each path becomes one module named by its index in the
  paths list: a line with the index, then one function name per line, with a
  blank line between modules.

  The manifest is removed afterwards unless --preserve-manifest was given.
  """
  args, forwarded_args = parse_args()
  wasm = os.path.expanduser(args.wasm)
  paths_file = os.path.expanduser(args.paths_file)
  sourcemap = os.path.expanduser(args.sourcemap) if args.sourcemap else None
  if args.wasm_split:
    wasm_split = os.path.expanduser(args.wasm_split)
  else:
    wasm_split = os.path.join(config.BINARYEN_ROOT, 'bin', 'wasm-split')

  if not os.path.isfile(wasm):
    exit_with_error(f"'{wasm}' was not found or not a file")
  if not os.path.isfile(paths_file):
    exit_with_error(f"'{paths_file}' was not found or not a file")
  if sourcemap:
    if not os.path.isfile(sourcemap):
      exit_with_error(f"'{sourcemap}' was not found or not a file")
  if not os.path.isfile(wasm_split):
    exit_with_error(f"'{wasm_split}' was not found or not a file")

  with open(paths_file, encoding='utf-8') as f:
    paths = [normalize_path(path.strip()) for path in f if path.strip()]
  # To make /a/b/c and /a/b/c/ equivalent
  # NOTE(review): if normalize_path() emits '/' separators on every platform,
  # os.sep ('\\' on Windows) would not strip a trailing '/' there - confirm.
  paths = [path.rstrip(os.sep) for path in paths]
  # Remove duplicates
  paths = list(dict.fromkeys(paths))

  # Compute {path: list of functions} map
  path_to_funcs = get_path_to_functions_map(wasm, sourcemap, paths)

  # Write .manifest file. It is created with delete=False because wasm-split
  # has to open it by name after we close it. UTF-8 is used explicitly, like
  # for the paths file, instead of the platform default encoding.
  f = tempfile.NamedTemporaryFile(suffix=".manifest", mode='w+', encoding='utf-8', delete=False)
  manifest = f.name
  try:
    # The 'with' guarantees the handle is closed even if writing fails, so the
    # cleanup below never tries to remove a file that is still open.
    with f:
      for i, path in enumerate(paths):
        f.write(f'{i}\n')
        if not path_to_funcs[path]:
          diagnostics.warn(f'{path} does not match any functions')
        if args.verbose:
          print(path)
          for func in path_to_funcs[path]:
            print('  ' + func)
          print()
        for func in path_to_funcs[path]:
          f.write(func + '\n')
        if i < len(paths) - 1:
          f.write('\n')

    cmd = [wasm_split, '--multi-split', wasm, '--manifest', manifest]
    if args.verbose:
      # This option is used both in this script and wasm-split
      cmd.append('-v')
    cmd += forwarded_args
    if args.verbose:
      print('\n' + ' '.join(cmd))
    shared.run_process(cmd)
  finally:
    if not args.preserve_manifest:
      os.remove(manifest)


# Script entry point. main() has no return statement, so a run that completes
# normally exits with status 0.
if __name__ == '__main__':
  sys.exit(main())
27 changes: 17 additions & 10 deletions tools/emsymbolizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def decodeVLQ(string):
self.offsets.append(offset)
self.offsets.sort()

def find_offset(self, offset):
def find_offset(self, offset, lower_bound=None):
# Find the largest mapped offset <= the search offset
lo = 0
hi = len(self.offsets)
Expand All @@ -189,11 +189,22 @@ def find_offset(self, offset):
hi = mid
else:
lo = mid + 1
return self.offsets[lo - 1]
if lo == 0:
return None
# If lower bound is given, return the offset only if the offset is equal to
# or greater than the lower bound
if lower_bound:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that there's only one caller of this (and of lookup) and we don't anticipate any different use cases, maybe we should just simplify this by requiring lower_bound.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another place is here:

return sm.lookup(address)

What do we give for lower_bound? It doesn't have the current function offset.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh yeah, ok.
Maybe for the original "just symbolize a random address" emsymbolizer use case we can eventually do better than we are now (e.g. give some kind of warning if we end up finding a location that corresponds to a different function from the given address, because odds are good it's not what the user actually wanted). But that doesn't have to be for this PR.

if self.offsets[lo - 1] >= lower_bound:
return self.offsets[lo - 1]
else:
return None
else:
return self.offsets[lo - 1]

def lookup(self, offset):
nearest = self.find_offset(offset)
assert nearest in self.mappings, 'Sourcemap has an offset with no mapping'
def lookup(self, offset, lower_bound=None):
nearest = self.find_offset(offset, lower_bound)
if not nearest:
return None
info = self.mappings[nearest]
return LocationInfo(
self.sources[info.source] if info.source is not None else None,
Expand All @@ -206,12 +217,8 @@ def symbolize_address_sourcemap(module, address, force_file):
URL = force_file
if not URL:
# If a sourcemap file is not forced, read it from the wasm module
section = get_sourceMappingURL_section(module)
assert section
module.seek(section.offset)
assert module.read_string() == 'sourceMappingURL'
# TODO: support stripping/replacing a prefix from the URL
URL = module.read_string()
URL = module.get_sourceMappingURL()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no need to add to this PR if things are working for you, but last time I tried to actually use emsymbolizer, I had to add something like

if not os.path.isfile(URL):
      URL = os.path.join(os.path.dirname(module.filename), URL)

probably because I was using relative paths everywhere.

Copy link
Member Author

@aheejin aheejin Sep 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't change anything for emsymbolizer (I just moved sourceMappingURL-getting code from emsymbolizer.py to webassembly.py) and there was no os.path.join(os.path.dirname, ...) in emsymbolizer.py. Where am I supposed to add it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right I had to add it locally (I added it right here in emsymbolizer because that's where the code was until now). Again, this was just an FYI. Maybe I'll just try to reproduce the behavior and add a proper test.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, emsymbolizer has worked for me with no change so far.. Yeah please let me know if you find the condition in which it becomes a problem.


if shared.DEBUG:
print(f'Source Mapping URL: {URL}')
Expand Down
2 changes: 2 additions & 0 deletions tools/maint/create_entry_points.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
emstrip
emsymbolizer
emscan-deps
empath-split
tools/file_packager
tools/webidl_binder
test/runner
Expand All @@ -56,6 +57,7 @@
'emdwp': 'tools/emdwp',
'emnm': 'tools/emnm',
'emsymbolizer': 'tools/emsymbolizer',
'empath-split': 'tools/empath-split',
}


Expand Down
Loading