Merge #289: type inference: fix bug in union-bound algorithm

uncomputable · uncomputable · commit 5c3ff26cd631 · 2025-04-14T21:49:32.000+02:00
f821e22 fuzz: add ability to generate Simplicity construct nodes, test for encode/decode roundtrip (Andrew Poelstra) bd3fe4e test: add unit tests which demonstrate problems fixed by the last commit (Andrew Poelstra) 75dd206 types: fix bug in "halving" variant of the union-bound algorithm (Andrew Poelstra) 78288bc types: cut off display after 10000 nodes (Andrew Poelstra) Pull request description: We have implemented the "halving" variant of the union-bound algorithm, but my code to do so was wrong in a way that sometimes causes types to "come ununified", i.e. a single type bound is split into two, with different instances in the program more-or-less randomly assigned to each side of the split. This causes all sorts of weird behavior. Fixes #286 ACKs for top commit: uncomputable: utACK f821e22 Tree-SHA512: aa6c1f36e9c26170111bae6222cc8de2e468f1f06752739a89897d807fd212995f69b68a4d31c5c1373b344df9bbec23967e447350cb8ce75656f868d9e73a6c
diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
@@ -23,6 +23,7 @@ c_rust_merkle,
 decode_natural,
 decode_program,
 parse_human,
+regression_286,
 regression_value,
         ]
     steps:
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -67,6 +67,13 @@ test = false
 doc = false
 bench = false
 
+[[bin]]
+name = "regression_286"
+path = "fuzz_targets/regression_286.rs"
+test = false
+doc = false
+bench = false
+
 [[bin]]
 name = "regression_value"
 path = "fuzz_targets/regression_value.rs"
diff --git a/fuzz/fuzz_lib/lib.rs b/fuzz/fuzz_lib/lib.rs
@@ -7,6 +7,10 @@ use simplicity::types::Final as FinalTy;
 use simplicity::{BitIter, Value};
 use std::sync::Arc;
 
+mod program;
+
+pub use program::ProgramControl;
+
 /// A wrapper around a buffer which has utilities for extracting various
 /// Simplicity types.
 #[derive(Clone)]
@@ -16,6 +20,7 @@ pub struct Extractor<'f> {
     bit_len: usize,
 }
 
+// More impls in the other modules.
 impl<'f> Extractor<'f> {
     /// Wrap the buffer in an extractor.
     pub fn new(data: &'f [u8]) -> Self {
@@ -37,6 +42,13 @@ impl<'f> Extractor<'f> {
         }
     }
 
+    /// Attempt to yield a u16 from the fuzzer.
+    ///
+    /// Internally, extracts in big-endian.
+    pub fn extract_u16(&mut self) -> Option<u16> {
+        Some((u16::from(self.extract_u8()?) << 8) + u16::from(self.extract_u8()?))
+    }
+
     /// Attempt to yield a single bit from the fuzzer.
     pub fn extract_bit(&mut self) -> Option<bool> {
         if self.bit_len == 0 {
diff --git a/fuzz/fuzz_lib/program.rs b/fuzz/fuzz_lib/program.rs
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: CC0-1.0
+
+use std::sync::Arc;
+
+use super::Extractor;
+use simplicity::node::{
+    CoreConstructible as _, DisconnectConstructible as _, JetConstructible as _,
+    WitnessConstructible as _,
+};
+use simplicity::types;
+use simplicity::{jet::Core, Cmr, ConstructNode, FailEntropy};
+
+/// Structure controlling the kind of programs generated by the fuzzer.
+pub struct ProgramControl {
+    /// Whether to attempt to insert "type bombs" into the generated program.
+    ///
+    /// Importantly, type bombs may have 2^n nodes for fairly large n, and will
+    /// not respect `max_nodes`. So if you are trying to generate small programs
+    /// you should not enable this.
+    pub enable_type_bomb: bool,
+    /// Whether to attempt to insert disconnect nodes into the generated program.
+    pub enable_disconnect: bool,
+    /// Whether to attempt to insert witness nodes into the generated program.
+    pub enable_witness: bool,
+    /// Whether to attempt to insert fail nodes into the generated program.
+    pub enable_fail: bool,
+    /// Whether to attempt to insert assertl and assertr nodes into the generated program.
+    pub enable_asserts: bool,
+    /// Maximum number of nodes a generated program may have. This limit may not
+    /// be exactly enforced. If it is `None`, no limit is enforced.
+    pub max_nodes: Option<usize>,
+}
+
+impl ProgramControl {
+    fn from_u16(u: u16) -> Self {
+        ProgramControl {
+            enable_type_bomb: u & 0x8000 == 0x8000,
+            enable_disconnect: u & 0x4000 == 0x4000,
+            enable_witness: u & 0x2000 == 0x2000,
+            enable_fail: u & 0x1000 == 0x1000,
+            enable_asserts: u & 0x0800 == 0x0800,
+            max_nodes: Some(5 * usize::from(u & 0x07ff)),
+        }
+    }
+}
+
+impl Extractor<'_> {
+    pub fn extract_core_construct_node(
+        &mut self,
+        force_control: Option<ProgramControl>,
+    ) -> Option<Arc<ConstructNode<Core>>> {
+        type ArcNode = Arc<ConstructNode<Core>>;
+
+        let ctx = types::Context::new();
+        let mut stack: Vec<ArcNode> = vec![];
+
+        let program_control =
+            force_control.unwrap_or(ProgramControl::from_u16(self.extract_u16()?));
+
+        let mut count = 0usize;
+        for _ in 0..program_control.max_nodes.unwrap_or(usize::MAX) {
+            let control = self.extract_u8()?;
+            if program_control.enable_type_bomb && control & 0x80 == 0x80 {
+                let mut ret = stack.pop()?;
+                // Special-case: type bomb. Iterate x -> pair(x, x) on the top stack
+                // item up to 128 times. Its CPU cost and target type will blow up
+                // by a factor 2^128. If its target type has nonzero size this should
+                // fail to construct; if it's 0 we should be able to construct it but
+                // the bit machine should reject it.
+                for _ in 0..control & 0x7f {
+                    // FIXME should we refuse to make the type-bomb if `ret` contains any
+                    //  witness or disconnect nodes? In this case the encoding of our
+                    //  CommitNode won't round-trip, since we're force-sharing both children
+                    //  of this `pair` but when decoding `CommitNode` we reject anything that
+                    //  shares witnesses or disconnects, which at commit-time we treat as
+                    //  being unique and never shared.
+                    ret = ArcNode::pair(&ret, &ret).unwrap();
+                }
+                stack.push(ret);
+            } else {
+                match control {
+                    // Return whatever we've got (note that this will "waste" everything else
+                    // on the stack)
+                    0 => {
+                        if stack.len() == 1 {
+                            return stack.pop();
+                        } else {
+                            return None;
+                        }
+                    }
+                    // 1 through 63
+                    1 => stack.push(ArcNode::unit(&ctx)),
+                    2 => stack.push(ArcNode::iden(&ctx)),
+                    3 => {
+                        use simplicity::dag::DagLike as _;
+
+                        let val = self.extract_value_direct()?;
+                        if program_control.max_nodes.is_some() {
+                            for _ in val.as_ref().pre_order_iter::<simplicity::dag::NoSharing>() {
+                                count = count.checked_add(1)?;
+                            }
+                        }
+                        if let Some(max) = program_control.max_nodes {
+                            if val.compact_len() > max {
+                                return None;
+                            }
+                        }
+                        stack.push(ArcNode::scribe(&ctx, &val));
+                    }
+                    4 if program_control.enable_witness => stack.push(ArcNode::witness(&ctx, None)),
+                    5 => {
+                        let child = stack.pop()?;
+                        stack.push(ArcNode::injl(&child));
+                    }
+                    6 => {
+                        let child = stack.pop()?;
+                        stack.push(ArcNode::injr(&child));
+                    }
+                    7 => {
+                        let child = stack.pop()?;
+                        stack.push(ArcNode::drop_(&child));
+                    }
+                    8 => {
+                        let child = stack.pop()?;
+                        stack.push(ArcNode::take(&child));
+                    }
+                    9 => {
+                        let child = stack.pop()?;
+                        let cmr_u8 = self.extract_u8()?;
+                        let cmr = Cmr::from_byte_array([cmr_u8; 32]);
+                        stack.push(ArcNode::assertl(&child, cmr).ok()?);
+                    }
+                    10 => {
+                        let child = stack.pop()?;
+                        let cmr_u8 = self.extract_u8()?;
+                        let cmr = Cmr::from_byte_array([cmr_u8; 32]);
+                        stack.push(ArcNode::assertr(cmr, &child).ok()?);
+                    }
+                    11 if program_control.enable_fail => {
+                        let fail_u8 = self.extract_u8()?;
+                        let fail = FailEntropy::from_byte_array([fail_u8; 64]);
+                        stack.push(ArcNode::fail(&ctx, fail));
+                    }
+                    12 => {
+                        let rchild = stack.pop()?;
+                        let lchild = stack.pop()?;
+                        stack.push(ArcNode::pair(&lchild, &rchild).ok()?);
+                    }
+                    13 => {
+                        let rchild = stack.pop()?;
+                        let lchild = stack.pop()?;
+                        stack.push(ArcNode::case(&lchild, &rchild).ok()?);
+                    }
+                    14 => {
+                        let rchild = stack.pop()?;
+                        let lchild = stack.pop()?;
+                        stack.push(ArcNode::comp(&lchild, &rchild).ok()?);
+                    }
+                    15 if program_control.enable_disconnect => {
+                        let child = stack.pop()?;
+                        stack.push(ArcNode::disconnect(&child, &None).ok()?);
+                    }
+                    // We assume that the above cases did not cover 64-255, so that if we
+                    // right-shift by 6 we can get all 4 values.
+                    _ => {
+                        let extra_bits = usize::from(control >> 6);
+                        let idx = (extra_bits << 8) + usize::from(self.extract_u8()?);
+                        stack.push(ArcNode::jet(&ctx, Core::ALL[idx % Core::ALL.len()]));
+                    }
+                }
+            }
+
+            if let Some(max) = program_control.max_nodes {
+                count = count.checked_add(1)?;
+                if count > max {
+                    return None;
+                }
+            }
+        }
+
+        None
+    }
+}
diff --git a/fuzz/fuzz_targets/regression_286.rs b/fuzz/fuzz_targets/regression_286.rs
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: CC0-1.0
+
+#![cfg_attr(fuzzing, no_main)]
+
+#[cfg(any(fuzzing, test))]
+fn do_test(data: &[u8]) {
+    use simplicity::{jet::Core, BitIter, CommitNode};
+
+    let mut extractor = simplicity_fuzz::Extractor::new(data);
+
+    let construct =
+        match extractor.extract_core_construct_node(Some(simplicity_fuzz::ProgramControl {
+            enable_type_bomb: false,
+            enable_disconnect: false,
+            enable_witness: false,
+            enable_fail: false,
+            enable_asserts: true,
+            max_nodes: Some(25),
+        })) {
+            Some(x) => x,
+            None => return,
+        };
+    //println!("constructed {construct}");
+    let finalized = match construct.finalize_types() {
+        Ok(x) => x,
+        Err(_) => return,
+    };
+    //println!("finalized {finalized}");
+    let prog = finalized.encode_to_vec();
+    //println!("{}", simplicity::bitcoin::hex::DisplayHex::as_hex(&prog));
+    let prog = BitIter::from(prog);
+    let decode = CommitNode::<Core>::decode(prog).unwrap();
+    assert_eq!(
+        finalized, decode,
+        "Constructed committed LHS; encoded and decoded to get RHS",
+    );
+}
+
+#[cfg(fuzzing)]
+libfuzzer_sys::fuzz_target!(|data| do_test(data));
+
+#[cfg(not(fuzzing))]
+fn main() {}
+
+#[cfg(test)]
+mod tests {
+    use base64::Engine;
+
+    #[test]
+    fn duplicate_crash() {
+        let data = base64::prelude::BASE64_STANDARD
+            .decode("Cg==")
+            .expect("base64 should be valid");
+        super::do_test(&data);
+    }
+}
diff --git a/src/node/construct.rs b/src/node/construct.rs
@@ -517,4 +517,74 @@ mod tests {
                 .cmr()
         );
     }
+
+    #[test]
+    fn regression_286_1() {
+        // This is the smallest pure Simplicity program I was able to find that exhibits the bad
+        // behavior seen in https://github.com/BlockstreamResearch/rust-simplicity/issues/286
+        let ctx = types::Context::new();
+        let cmr = Cmr::from_byte_array([0xde; 32]);
+
+        let u0 = Arc::<ConstructNode<Core>>::unit(&ctx);
+        let i1 = Arc::<ConstructNode<Core>>::injl(&u0);
+        let i2 = Arc::<ConstructNode<Core>>::injr(&i1);
+        let i3 = Arc::<ConstructNode<Core>>::injr(&i2);
+        let i4 = Arc::<ConstructNode<Core>>::injl(&i3);
+        let u5 = Arc::<ConstructNode<Core>>::unit(&ctx);
+        let i6 = Arc::<ConstructNode<Core>>::injl(&u5);
+        let i7 = Arc::<ConstructNode<Core>>::injr(&i6);
+        let p8 = Arc::<ConstructNode<Core>>::pair(&i4, &i7).unwrap();
+        let u9 = Arc::<ConstructNode<Core>>::unit(&ctx);
+        let a10 = Arc::<ConstructNode<Core>>::assertr(cmr, &u9).unwrap();
+        let u11 = Arc::<ConstructNode<Core>>::unit(&ctx);
+        let a12 = Arc::<ConstructNode<Core>>::assertr(cmr, &u11).unwrap();
+        let a13 = Arc::<ConstructNode<Core>>::assertl(&a12, cmr).unwrap();
+        let c14 = Arc::<ConstructNode<Core>>::case(&a10, &a13).unwrap();
+        let c15 = Arc::<ConstructNode<Core>>::comp(&p8, &c14).unwrap();
+
+        let finalized: Arc<CommitNode<_>> = c15.finalize_types().unwrap();
+        let prog = finalized.encode_to_vec();
+        // In #286 we are encoding correctly...
+        assert_eq!(
+            hex::DisplayHex::as_hex(&prog).to_string(),
+            "dc920a28812b6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f6f243090e00b10e00680",
+        );
+
+        let prog = BitIter::from(prog);
+        let decode = CommitNode::<Core>::decode(prog).unwrap();
+
+        // ...but then during decoding we read the program incorrectly and this assertion fails.
+        assert_eq!(finalized, decode);
+    }
+
+    #[test]
+    fn regression_286_2() {
+        use crate::types::Error as TypeError;
+        use crate::Error;
+
+        // This one is smaller because it starts with a witness node which has a large type.
+        // This is a bit easier to grok but can't be serialized as a complete/valid program
+        // without providing the witness data, which limits its ability to share with the
+        // other libraries.
+        //
+        // It also exhibits the bug earlier than the other one -- it *should* just fail to
+        // typecheck and not be constructible. So we can't get an encoding of it.
+        let ctx = types::Context::new();
+
+        let w0 = Arc::<ConstructNode<Core>>::witness(&ctx, None);
+        let i1 = Arc::<ConstructNode<Core>>::iden(&ctx);
+        let d2 = Arc::<ConstructNode<Core>>::drop_(&i1);
+        let i3 = Arc::<ConstructNode<Core>>::iden(&ctx);
+        let i4 = Arc::<ConstructNode<Core>>::iden(&ctx);
+        let t5 = Arc::<ConstructNode<Core>>::take(&i4);
+        let ca6 = Arc::<ConstructNode<Core>>::case(&i3, &t5).unwrap();
+        let ca7 = Arc::<ConstructNode<Core>>::case(&d2, &ca6).unwrap();
+        let c8 = Arc::<ConstructNode<Core>>::comp(&w0, &ca7).unwrap();
+        let u9 = Arc::<ConstructNode<Core>>::unit(&ctx);
+        let c10 = Arc::<ConstructNode<Core>>::comp(&c8, &u9).unwrap();
+
+        // In #286 we incorrectly succeed finalizing the types, and then encode a bad program.
+        let err = c10.finalize_types().unwrap_err();
+        assert!(matches!(err, Error::Type(TypeError::OccursCheck { .. })));
+    }
 }
diff --git a/src/types/mod.rs b/src/types/mod.rs
@@ -368,12 +368,17 @@ impl Type {
 }
 
 const MAX_DISPLAY_DEPTH: usize = 64;
+const MAX_DISPLAY_LENGTH: usize = 10000;
 
 impl fmt::Debug for Type {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         for data in (&self.ctx, self.inner.bound.root())
             .verbose_pre_order_iter::<NoSharing>(Some(MAX_DISPLAY_DEPTH))
         {
+            if data.index > MAX_DISPLAY_LENGTH {
+                write!(f, "... [truncated type after {} nodes]", MAX_DISPLAY_LENGTH)?;
+                return Ok(());
+            }
             if data.depth == MAX_DISPLAY_DEPTH {
                 if data.n_children_yielded == 0 {
                     f.write_str("...")?;
@@ -407,6 +412,10 @@ impl fmt::Display for Type {
         for data in (&self.ctx, self.inner.bound.root())
             .verbose_pre_order_iter::<NoSharing>(Some(MAX_DISPLAY_DEPTH))
         {
+            if data.index > MAX_DISPLAY_LENGTH {
+                write!(f, "... [truncated type after {} nodes]", MAX_DISPLAY_LENGTH)?;
+                return Ok(());
+            }
             if data.depth == MAX_DISPLAY_DEPTH {
                 if data.n_children_yielded == 0 {
                     f.write_str("...")?;
diff --git a/src/types/union_bound.rs b/src/types/union_bound.rs

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,7 @@ c_rust_merkle,`
`23`	`23`	`decode_natural,`
`24`	`24`	`decode_program,`
`25`	`25`	`parse_human,`
	`26`	`+regression_286,`
`26`	`27`	`regression_value,`
`27`	`28`	`]`
`28`	`29`	`steps:`