Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulKinlan committed Nov 9, 2023
2 parents 060b124 + 18c0ec3 commit 57cae64
Show file tree
Hide file tree
Showing 17 changed files with 438 additions and 35 deletions.
1 change: 1 addition & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions seeds/breadboard-web/tests/async-gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,9 @@
import { expect, test } from "vitest";
import {
LastMessageKeeper,
PatchedReadableStream,
asyncGen,
streamFromAsyncGen,
} from "../src/async-gen";
import { Readable } from "stream";

test("async-gen", async () => {
const results = [];
Expand Down
5 changes: 5 additions & 0 deletions seeds/breadboard/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,8 @@ export { toMermaid } from "./mermaid.js";
export type { Schema } from "jsonschema";
export { callHandler } from "./handler.js";
export { asRuntimeKit } from "./kits/ctors.js";
export {
StreamCapability,
isStreamCapability,
type StreamCapabilityType,
} from "./stream.js";
57 changes: 57 additions & 0 deletions seeds/breadboard/src/stream.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/**
 * @license
 * Copyright 2023 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import type { Capability, NodeValue } from "./types.js";

// Discriminant value that marks a Capability as carrying a ReadableStream.
const STREAM_KIND = "stream" as const;

/**
 * A Capability whose payload is a ReadableStream of `ChunkType` chunks.
 */
export interface StreamCapabilityType<ChunkType = object> extends Capability {
  kind: typeof STREAM_KIND;
  stream: ReadableStream<ChunkType>;
}

/**
 * Concrete StreamCapabilityType implementation: wraps a ReadableStream so it
 * can travel through node values as a Capability.
 */
export class StreamCapability<ChunkType>
  implements StreamCapabilityType<ChunkType>
{
  kind = STREAM_KIND;
  stream: ReadableStream<ChunkType>;

  constructor(stream: ReadableStream<ChunkType>) {
    this.stream = stream;
  }
}

/**
 * Type guard for StreamCapabilityType.
 *
 * Returns true only when `object` is a non-null object whose `kind` is
 * "stream" and whose `stream` is an actual ReadableStream instance.
 */
export const isStreamCapability = (
  object: unknown
): object is StreamCapabilityType => {
  // Guard primitives and null/undefined first: the unguarded property access
  // in the previous version threw a TypeError on them.
  if (typeof object !== "object" || object === null) return false;
  const maybeStream = object as StreamCapabilityType;
  return (
    maybeStream.kind === STREAM_KIND &&
    maybeStream.stream instanceof ReadableStream
  );
};

/**
 * Recursively walks `value` (descending into arrays and plain objects) and
 * appends the stream of every stream-kind capability to `foundStreams`.
 */
const findStreams = (value: NodeValue, foundStreams: ReadableStream[]) => {
  if (Array.isArray(value)) {
    for (const item of value) {
      findStreams(item, foundStreams);
    }
  } else if (typeof value === "object" && value !== null) {
    // `typeof null === "object"`, so the explicit null check above is what
    // prevents a crash on null-valued properties.
    const maybeCapability = value as StreamCapabilityType;
    if (maybeCapability.kind === STREAM_KIND) {
      foundStreams.push(maybeCapability.stream);
    } else {
      for (const item of Object.values(value)) {
        findStreams(item, foundStreams);
      }
    }
  }
};

/**
 * Returns all ReadableStreams reachable from `value`, e.g. to build the
 * transfer list when posting the value across a worker boundary.
 */
export const getStreams = (value: NodeValue): ReadableStream[] => {
  const foundStreams: ReadableStream[] = [];
  findStreams(value, foundStreams);
  return foundStreams;
};
21 changes: 21 additions & 0 deletions seeds/breadboard/src/ui/output.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*/

import { type Schema } from "jsonschema";
import { StreamCapabilityType } from "../stream.js";

export type OutputArgs = Record<string, unknown> & {
schema: Schema;
Expand All @@ -30,9 +31,29 @@ export class Output extends HTMLElement {
return;
}
Object.entries(schema.properties).forEach(([key, property]) => {
if (property.type === "object" && property.format === "stream") {
this.appendStream(
property,
(values[key] as StreamCapabilityType).stream
);
return;
}
const html = document.createElement("pre");
html.innerHTML = `${values[key]}`;
root.append(`${property.title}: `, html, "\n");
});
}

// Renders a streaming output property: prints the property's title once,
// then appends each decoded text chunk to the shadow root as it arrives.
appendStream(property: Schema, stream: ReadableStream) {
const root = this.shadowRoot;
// Shadow root may not be attached; bail out rather than throw.
if (!root) return;
root.append(`${property.title}: `);
// assumes the stream carries UTF-8 encoded bytes (TextDecoderStream's
// default encoding) — TODO confirm against the producer.
stream.pipeThrough(new TextDecoderStream()).pipeTo(
new WritableStream({
write(chunk) {
root.append(chunk);
},
})
);
// NOTE(review): the pipeTo() promise is neither awaited nor caught; a
// stream error would surface as an unhandled rejection — confirm intended.
}
}
8 changes: 6 additions & 2 deletions seeds/breadboard/src/worker/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* SPDX-License-Identifier: Apache-2.0
*/

import { getStreams } from "../stream.js";
import { InputValues } from "../types.js";
import {
type ControllerMessage,
type RoundTripControllerMessage,
Expand Down Expand Up @@ -40,11 +42,13 @@ export class WorkerTransport implements MessageControllerTransport {
}

// Posts a round-trip message to the worker. Any ReadableStreams found in the
// payload are passed in postMessage's transfer list so they move across the
// worker boundary instead of being structured-cloned.
sendRoundTripMessage<T extends RoundTripControllerMessage>(message: T) {
  // Bug fix: the message was previously posted twice — once without the
  // transfer list and once with it. Post exactly once, with the streams.
  const streams = getStreams(message.data as InputValues);
  this.worker.postMessage(message, streams);
}

// Posts a one-way message to the worker, transferring any ReadableStreams
// contained in the payload via postMessage's transfer list.
sendMessage<T extends ControllerMessage>(message: T) {
  // Bug fix: the message was previously posted twice — once without the
  // transfer list and once with it. Post exactly once, with the streams.
  const streams = getStreams(message.data as InputValues);
  this.worker.postMessage(message, streams);
}

#onMessage(e: MessageEvent) {
Expand Down
21 changes: 20 additions & 1 deletion seeds/chunker-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ aggregated into passages under `max_words_per_aggregate_passage` words. If
cannot be combined into a single passage under
`max_words_per_aggregate_passage` words.

`html_tags_to_exclude`: Text within any of the tags in this set will not be
included in the output passages. Defaults to `{"noscript", "script", "style"}`.

If you find your passages are too disjointed (insufficient context in a single
passage for your application), consider increasing
`max_words_per_aggregate_passage` and/or setting
Expand Down Expand Up @@ -124,4 +127,20 @@ passages = chunker.chunk(html)
The sibling children of the `<p>` node are greedily aggregated while the total
is <=4 words:

passages: ["Heading", "Text before link", "and after."]
passages: ["Heading", "Text before link", "and after."]


### Example 5

```
chunker = HtmlChunker(
max_words_per_aggregate_passage=4,
greedily_aggregate_sibling_nodes=False,
html_tags_to_exclude={"p"}
)
passages = chunker.chunk(html)
```

All text within the `<p>` tag is excluded from the output:

passages: ["Heading"]
2 changes: 1 addition & 1 deletion seeds/chunker-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "google_labs_html_chunker"
version = "0.0.3"
version = "0.0.5"
authors = [
{ name="Google Labs", email="[email protected]" },
]
Expand Down
28 changes: 19 additions & 9 deletions seeds/chunker-python/src/google_labs_html_chunker/html_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from bs4 import BeautifulSoup, NavigableString, Comment

# Html tags for non-content text. Text within these tags will be excluded from
# passages.
_NON_CONTENT_HTML_TAGS = frozenset({"noscript", "script", "style"})
import bs4

# Text within these html tags will be excluded from passages by default.
_DEFAULT_HTML_TAGS_TO_EXCLUDE = frozenset({"noscript", "script", "style"})

# Html tags that indicate a section break. Sibling nodes will not be
# greedily-aggregated into a chunk across one of these tags.
Expand Down Expand Up @@ -53,15 +53,22 @@ class HtmlChunker:
false, each sibling node is output as a separate passage if they cannot
all be combined into a single passage under
max_words_per_aggregate_passage words.
html_tags_to_exclude: Text within any of the tags in this set will not be
included in the output passages. Defaults to {"noscript", "script",
"style"}.
"""

def __init__(
    self,
    max_words_per_aggregate_passage: int,
    greedily_aggregate_sibling_nodes: bool,
    html_tags_to_exclude: frozenset[str] = _DEFAULT_HTML_TAGS_TO_EXCLUDE,
) -> None:
    """Stores chunking limits and the normalized set of tags to exclude."""
    self.max_words_per_aggregate_passage = max_words_per_aggregate_passage
    self.greedily_aggregate_sibling_nodes = greedily_aggregate_sibling_nodes
    # Lowercase and trim each tag so later comparisons against parser tag
    # names (which are lowercase) succeed regardless of caller formatting.
    normalized = set()
    for tag in html_tags_to_exclude:
        normalized.add(tag.strip().lower())
    self.html_tags_to_exclude = normalized

class PassageList:
"""A list of text passages."""
Expand Down Expand Up @@ -127,13 +134,16 @@ def _process_node(self, node) -> AggregateNode:
current_node = self.AggregateNode()
if node.name:
current_node.html_tag = node.name
if node.name in _NON_CONTENT_HTML_TAGS or isinstance(node, Comment):
if node.name in self.html_tags_to_exclude or isinstance(node, bs4.Comment):
# Exclude text within these tags.
return current_node

if isinstance(node, NavigableString):
current_node.num_words = len(node.split())
current_node.segments.append(node.strip())
if isinstance(node, bs4.NavigableString):
# Store the text for this leaf node (skipping text directly under the
# top-level BeautifulSoup object, e.g. "html" from <!DOCTYPE html>).
if node.parent.name != "[document]":
current_node.num_words = len(node.split())
current_node.segments.append(node.strip())
return current_node

# Will hold the aggregate of this node and all its unchunked descendants
Expand Down Expand Up @@ -201,7 +211,7 @@ def chunk(self, html: str) -> list[str]:
Returns:
A list of text passages from the html.
"""
tree = BeautifulSoup(html, "html5lib")
tree = bs4.BeautifulSoup(html, "html5lib")
root_agg_node = self._process_node(tree)
if not root_agg_node.get_passages():
root_agg_node.passage_list.add_passage_for_node(root_agg_node)
Expand Down
2 changes: 2 additions & 0 deletions seeds/chunker-python/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
arg_parser.add_argument("-o", "--outfile", help="Output passages file path.", required=True)
arg_parser.add_argument("--maxwords", type=int, default=200, help="Max words per aggregate passage.")
arg_parser.add_argument("--greedyagg", action=argparse.BooleanOptionalAction, help="Whether to greedily aggregate sibling nodes.")
arg_parser.add_argument("--excludetags", type=str, default="noscript,script,style", help="Comma-separated HTML tags from which to exclude text.")
args = arg_parser.parse_args()

html_file = open(args.infile, "r")
Expand All @@ -34,6 +35,7 @@
chunker = HtmlChunker(
max_words_per_aggregate_passage=args.maxwords,
greedily_aggregate_sibling_nodes=args.greedyagg,
html_tags_to_exclude={tag for tag in args.excludetags.split(',')},
)
passages = chunker.chunk(html)

Expand Down
82 changes: 73 additions & 9 deletions seeds/chunker-python/tests/test_html_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,39 @@ def test_handles_escape_codes(self):
["Here's a paragraph."],
)

def test_handles_unicode_characters(self):
"""Non-ASCII text (letterlike math symbols) passes through chunking intact."""
html = (
"<p>Here is a"
" \u2119\u212b\u213e\u212b\u210A\u213e\u212b\u2119\u210F.</p>"
)

chunker = HtmlChunker(
max_words_per_aggregate_passage=10,
greedily_aggregate_sibling_nodes=False,
)

# The \uXXXX escapes above decode to the same characters as this literal.
self.assertEqual(
chunker.chunk(html),
["Here is a ℙÅℾÅℊℾÅℙℏ."],
)

def test_handles_byte_string(self):
"""UTF-8 byte input chunks correctly once the caller decodes it to str."""
html_bytes = (
b"<p>Here is a"
b" \xe2\x84\x99\xe2\x84\xab\xe2\x84\xbe\xe2\x84\xab\xe2\x84\x8a\xe2\x84\xbe\xe2\x84\xab\xe2\x84\x99\xe2\x84\x8f.</p>"
)

chunker = HtmlChunker(
max_words_per_aggregate_passage=10,
greedily_aggregate_sibling_nodes=False,
)

# When using bytes, we must provide the decoding, in this case utf-8.
self.assertEqual(
chunker.chunk(html_bytes.decode("utf-8")),
["Here is a ℙÅℾÅℊℾÅℙℏ."],
)

def test_strips_whitespace_around_node_text(self):
html = """
<div>
Expand Down Expand Up @@ -189,16 +222,19 @@ def test_does_not_join_split_text_nodes_within_p_tag_when_over_max(self):
],
)

def test_skips_non_content_text(self):
def test_excludes_text_from_default_html_tags(self):
html = """
<head>
<title>Title</title>
<style>.my-tag{display:none}</style>
<head>
<body>
<script type="application/json">{"@context":"https://schema.org"}</script>
<p><!-- A comment -->Paragraph</p>
</body>
<!DOCTYPE html>
<html>
<head>
<title>Title</title>
<style>.my-tag{display:none}</style>
<head>
<body>
<script type="application/json">{"@context":"https://schema.org"}</script>
<p><!-- A comment -->Paragraph</p>
</body>
</html>
"""

chunker = HtmlChunker(
Expand All @@ -213,6 +249,34 @@ def test_skips_non_content_text(self):
],
)

def test_excludes_text_from_given_html_tags(self):
"""A caller-supplied html_tags_to_exclude overrides the defaults.

Excluding <head> drops the <title> and <style> text; excluding <p> drops the
paragraph. <script> is NOT excluded here, so its JSON text is kept.
"""
html = """
<!DOCTYPE html>
<html>
<head>
<title>Title</title>
<style>.my-tag{display:none}</style>
<head>
<body>
<script type="application/json">{"@context":"https://schema.org"}</script>
<p><!-- A comment -->Paragraph</p>
</body>
</html>
"""

# " HEAD " also exercises the strip()/lower() normalization performed by
# HtmlChunker.__init__ on the exclusion set.
chunker = HtmlChunker(
max_words_per_aggregate_passage=10,
greedily_aggregate_sibling_nodes=False,
html_tags_to_exclude={" HEAD ", "p"},
)

# Only the <script> text survives the exclusions.
self.assertEqual(
chunker.chunk(html),
[
'{"@context":"https://schema.org"}',
],
)

def test_greedily_aggregates_sibling_nodes(self):
html = """
<div>
Expand Down
Loading

0 comments on commit 57cae64

Please sign in to comment.