Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulKinlan committed Nov 9, 2023
2 parents 060b124 + 18c0ec3 commit 57cae64
Show file tree
Hide file tree
Showing 17 changed files with 438 additions and 35 deletions.
1 change: 1 addition & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions seeds/breadboard-web/tests/async-gen.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,9 @@
import { expect, test } from "vitest";
import {
LastMessageKeeper,
PatchedReadableStream,
asyncGen,
streamFromAsyncGen,
} from "../src/async-gen";
import { Readable } from "stream";

test("async-gen", async () => {
const results = [];
Expand Down
5 changes: 5 additions & 0 deletions seeds/breadboard/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,8 @@ export { toMermaid } from "./mermaid.js";
export type { Schema } from "jsonschema";
export { callHandler } from "./handler.js";
export { asRuntimeKit } from "./kits/ctors.js";
export {
StreamCapability,
isStreamCapability,
type StreamCapabilityType,
} from "./stream.js";
57 changes: 57 additions & 0 deletions seeds/breadboard/src/stream.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/**
 * @license
 * Copyright 2023 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import type { Capability, NodeValue } from "./types.js";

// Discriminant value that marks a Capability as carrying a ReadableStream.
const STREAM_KIND = "stream" as const;

/**
 * A Capability whose payload is a ReadableStream of `ChunkType` chunks.
 */
export interface StreamCapabilityType<ChunkType = object> extends Capability {
  kind: typeof STREAM_KIND;
  stream: ReadableStream<ChunkType>;
}

/**
 * Concrete StreamCapabilityType implementation: wraps a ReadableStream so it
 * can travel through node values as a Capability.
 */
export class StreamCapability<ChunkType>
  implements StreamCapabilityType<ChunkType>
{
  kind = STREAM_KIND;
  stream: ReadableStream<ChunkType>;

  constructor(stream: ReadableStream<ChunkType>) {
    this.stream = stream;
  }
}

/**
 * Type guard for StreamCapabilityType.
 *
 * Returns true only when `object` is a non-null object whose `kind` is
 * "stream" and whose `stream` is an actual ReadableStream instance.
 */
export const isStreamCapability = (
  object: unknown
): object is StreamCapabilityType => {
  // Guard primitives and null/undefined first: the unguarded property access
  // in the previous version threw a TypeError on them.
  if (typeof object !== "object" || object === null) return false;
  const maybeStream = object as StreamCapabilityType;
  return (
    maybeStream.kind === STREAM_KIND &&
    maybeStream.stream instanceof ReadableStream
  );
};

/**
 * Recursively walks `value` (descending into arrays and plain objects) and
 * appends the stream of every stream-kind capability to `foundStreams`.
 */
const findStreams = (value: NodeValue, foundStreams: ReadableStream[]) => {
  if (Array.isArray(value)) {
    for (const item of value) {
      findStreams(item, foundStreams);
    }
  } else if (typeof value === "object" && value !== null) {
    // `typeof null === "object"`, so the explicit null check above is what
    // prevents a crash on null-valued properties.
    const maybeCapability = value as StreamCapabilityType;
    if (maybeCapability.kind === STREAM_KIND) {
      foundStreams.push(maybeCapability.stream);
    } else {
      for (const item of Object.values(value)) {
        findStreams(item, foundStreams);
      }
    }
  }
};

/**
 * Returns all ReadableStreams reachable from `value`, e.g. to build the
 * transfer list when posting the value across a worker boundary.
 */
export const getStreams = (value: NodeValue): ReadableStream[] => {
  const foundStreams: ReadableStream[] = [];
  findStreams(value, foundStreams);
  return foundStreams;
};
21 changes: 21 additions & 0 deletions seeds/breadboard/src/ui/output.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
*/

import { type Schema } from "jsonschema";
import { StreamCapabilityType } from "../stream.js";

export type OutputArgs = Record<string, unknown> & {
schema: Schema;
Expand All @@ -30,9 +31,29 @@ export class Output extends HTMLElement {
return;
}
Object.entries(schema.properties).forEach(([key, property]) => {
if (property.type === "object" && property.format === "stream") {
this.appendStream(
property,
(values[key] as StreamCapabilityType).stream
);
return;
}
const html = document.createElement("pre");
html.innerHTML = `${values[key]}`;
root.append(`${property.title}: `, html, "\n");
});
}

// Renders a streaming output property: prints the property's title once,
// then appends each decoded text chunk to the shadow root as it arrives.
appendStream(property: Schema, stream: ReadableStream) {
const root = this.shadowRoot;
// Shadow root may not be attached; bail out rather than throw.
if (!root) return;
root.append(`${property.title}: `);
// assumes the stream carries UTF-8 encoded bytes (TextDecoderStream's
// default encoding) — TODO confirm against the producer.
stream.pipeThrough(new TextDecoderStream()).pipeTo(
new WritableStream({
write(chunk) {
root.append(chunk);
},
})
);
// NOTE(review): the pipeTo() promise is neither awaited nor caught; a
// stream error would surface as an unhandled rejection — confirm intended.
}
}
8 changes: 6 additions & 2 deletions seeds/breadboard/src/worker/controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* SPDX-License-Identifier: Apache-2.0
*/

import { getStreams } from "../stream.js";
import { InputValues } from "../types.js";
import {
type ControllerMessage,
type RoundTripControllerMessage,
Expand Down Expand Up @@ -40,11 +42,13 @@ export class WorkerTransport implements MessageControllerTransport {
}

// Posts a round-trip message to the worker. Any ReadableStreams found in the
// payload are passed in postMessage's transfer list so they move across the
// worker boundary instead of being structured-cloned.
sendRoundTripMessage<T extends RoundTripControllerMessage>(message: T) {
  // Bug fix: the message was previously posted twice — once without the
  // transfer list and once with it. Post exactly once, with the streams.
  const streams = getStreams(message.data as InputValues);
  this.worker.postMessage(message, streams);
}

// Posts a one-way message to the worker, transferring any ReadableStreams
// contained in the payload via postMessage's transfer list.
sendMessage<T extends ControllerMessage>(message: T) {
  // Bug fix: the message was previously posted twice — once without the
  // transfer list and once with it. Post exactly once, with the streams.
  const streams = getStreams(message.data as InputValues);
  this.worker.postMessage(message, streams);
}

#onMessage(e: MessageEvent) {
Expand Down
21 changes: 20 additions & 1 deletion seeds/chunker-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ aggregated into passages under `max_words_per_aggregate_passage` words. If
cannot be combined into a single passage under
`max_words_per_aggregate_passage` words.

`html_tags_to_exclude`: Text within any of the tags in this set will not be
included in the output passages. Defaults to `{"noscript", "script", "style"}`.

If you find your passages are too disjointed (insufficient context in a single
passage for your application), consider increasing
`max_words_per_aggregate_passage` and/or setting
Expand Down Expand Up @@ -124,4 +127,20 @@ passages = chunker.chunk(html)
The sibling children of the `<p>` node are greedily aggregated while the total
is <=4 words:

passages: ["Heading", "Text before link", "and after."]
passages: ["Heading", "Text before link", "and after."]


### Example 5

```
chunker = HtmlChunker(
max_words_per_aggregate_passage=4,
greedily_aggregate_sibling_nodes=False,
html_tags_to_exclude={"p"}
)
passages = chunker.chunk(html)
```

All text within the `<p>` tag is excluded from the output:

passages: ["Heading"]
2 changes: 1 addition & 1 deletion seeds/chunker-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "google_labs_html_chunker"
version = "0.0.3"
version = "0.0.5"
authors = [
{ name="Google Labs", email="[email protected]" },
]
Expand Down
28 changes: 19 additions & 9 deletions seeds/chunker-python/src/google_labs_html_chunker/html_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from bs4 import BeautifulSoup, NavigableString, Comment

# Html tags for non-content text. Text within these tags will be excluded from
# passages.
_NON_CONTENT_HTML_TAGS = frozenset({"noscript", "script", "style"})
import bs4

# Text within these html tags will be excluded from passages by default.
_DEFAULT_HTML_TAGS_TO_EXCLUDE = frozenset({"noscript", "script", "style"})

# Html tags that indicate a section break. Sibling nodes will not be
# greedily-aggregated into a chunk across one of these tags.
Expand Down Expand Up @@ -53,15 +53,22 @@ class HtmlChunker:
false, each sibling node is output as a separate passage if they cannot
all be combined into a single passage under
max_words_per_aggregate_passage words.
html_tags_to_exclude: Text within any of the tags in this set will not be
included in the output passages. Defaults to {"noscript", "script",
"style"}.
"""

def __init__(
    self,
    max_words_per_aggregate_passage: int,
    greedily_aggregate_sibling_nodes: bool,
    html_tags_to_exclude: frozenset[str] = _DEFAULT_HTML_TAGS_TO_EXCLUDE,
) -> None:
    """Stores chunking limits and the normalized set of tags to exclude."""
    self.max_words_per_aggregate_passage = max_words_per_aggregate_passage
    self.greedily_aggregate_sibling_nodes = greedily_aggregate_sibling_nodes
    # Lowercase and trim each tag so later comparisons against parser tag
    # names (which are lowercase) succeed regardless of caller formatting.
    normalized = set()
    for tag in html_tags_to_exclude:
        normalized.add(tag.strip().lower())
    self.html_tags_to_exclude = normalized

class PassageList:
"""A list of text passages."""
Expand Down Expand Up @@ -127,13 +134,16 @@ def _process_node(self, node) -> AggregateNode:
current_node = self.AggregateNode()
if node.name:
current_node.html_tag = node.name
if node.name in _NON_CONTENT_HTML_TAGS or isinstance(node, Comment):
if node.name in self.html_tags_to_exclude or isinstance(node, bs4.Comment):
# Exclude text within these tags.
return current_node

if isinstance(node, NavigableString):
current_node.num_words = len(node.split())
current_node.segments.append(node.strip())
if isinstance(node, bs4.NavigableString):
# Store the text for this leaf node (skipping text directly under the
# top-level BeautifulSoup object, e.g. "html" from <!DOCTYPE html>).
if node.parent.name != "[document]":
current_node.num_words = len(node.split())
current_node.segments.append(node.strip())
return current_node

# Will hold the aggregate of this node and all its unchunked descendants
Expand Down Expand Up @@ -201,7 +211,7 @@ def chunk(self, html: str) -> list[str]:
Returns:
A list of text passages from the html.
"""
tree = BeautifulSoup(html, "html5lib")
tree = bs4.BeautifulSoup(html, "html5lib")
root_agg_node = self._process_node(tree)
if not root_agg_node.get_passages():
root_agg_node.passage_list.add_passage_for_node(root_agg_node)
Expand Down
2 changes: 2 additions & 0 deletions seeds/chunker-python/src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
arg_parser.add_argument("-o", "--outfile", help="Output passages file path.", required=True)
arg_parser.add_argument("--maxwords", type=int, default=200, help="Max words per aggregate passage.")
arg_parser.add_argument("--greedyagg", action=argparse.BooleanOptionalAction, help="Whether to greedily aggregate sibling nodes.")
arg_parser.add_argument("--excludetags", type=str, default="noscript,script,style", help="Comma-separated HTML tags from which to exclude text.")
args = arg_parser.parse_args()

html_file = open(args.infile, "r")
Expand All @@ -34,6 +35,7 @@
chunker = HtmlChunker(
max_words_per_aggregate_passage=args.maxwords,
greedily_aggregate_sibling_nodes=args.greedyagg,
html_tags_to_exclude={tag for tag in args.excludetags.split(',')},
)
passages = chunker.chunk(html)

Expand Down
82 changes: 73 additions & 9 deletions seeds/chunker-python/tests/test_html_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,39 @@ def test_handles_escape_codes(self):
["Here's a paragraph."],
)

def test_handles_unicode_characters(self):
"""Non-ASCII text (letterlike math symbols) passes through chunking intact."""
html = (
"<p>Here is a"
" \u2119\u212b\u213e\u212b\u210A\u213e\u212b\u2119\u210F.</p>"
)

chunker = HtmlChunker(
max_words_per_aggregate_passage=10,
greedily_aggregate_sibling_nodes=False,
)

# The \uXXXX escapes above decode to the same characters as this literal.
self.assertEqual(
chunker.chunk(html),
["Here is a ℙÅℾÅℊℾÅℙℏ."],
)

def test_handles_byte_string(self):
"""UTF-8 byte input chunks correctly once the caller decodes it to str."""
html_bytes = (
b"<p>Here is a"
b" \xe2\x84\x99\xe2\x84\xab\xe2\x84\xbe\xe2\x84\xab\xe2\x84\x8a\xe2\x84\xbe\xe2\x84\xab\xe2\x84\x99\xe2\x84\x8f.</p>"
)

chunker = HtmlChunker(
max_words_per_aggregate_passage=10,
greedily_aggregate_sibling_nodes=False,
)

# When using bytes, we must provide the decoding, in this case utf-8.
self.assertEqual(
chunker.chunk(html_bytes.decode("utf-8")),
["Here is a ℙÅℾÅℊℾÅℙℏ."],
)

def test_strips_whitespace_around_node_text(self):
html = """
<div>
Expand Down Expand Up @@ -189,16 +222,19 @@ def test_does_not_join_split_text_nodes_within_p_tag_when_over_max(self):
],
)

def test_skips_non_content_text(self):
def test_excludes_text_from_default_html_tags(self):
html = """
<head>
<title>Title</title>
<style>.my-tag{display:none}</style>
<head>
<body>
<script type="application/json">{"@context":"https://schema.org"}</script>
<p><!-- A comment -->Paragraph</p>
</body>
<!DOCTYPE html>
<html>
<head>
<title>Title</title>
<style>.my-tag{display:none}</style>
<head>
<body>
<script type="application/json">{"@context":"https://schema.org"}</script>
<p><!-- A comment -->Paragraph</p>
</body>
</html>
"""

chunker = HtmlChunker(
Expand All @@ -213,6 +249,34 @@ def test_skips_non_content_text(self):
],
)

def test_excludes_text_from_given_html_tags(self):
"""A caller-supplied html_tags_to_exclude overrides the defaults.

Excluding <head> drops the <title> and <style> text; excluding <p> drops the
paragraph. <script> is NOT excluded here, so its JSON text is kept.
"""
html = """
<!DOCTYPE html>
<html>
<head>
<title>Title</title>
<style>.my-tag{display:none}</style>
<head>
<body>
<script type="application/json">{"@context":"https://schema.org"}</script>
<p><!-- A comment -->Paragraph</p>
</body>
</html>
"""

# " HEAD " also exercises the strip()/lower() normalization performed by
# HtmlChunker.__init__ on the exclusion set.
chunker = HtmlChunker(
max_words_per_aggregate_passage=10,
greedily_aggregate_sibling_nodes=False,
html_tags_to_exclude={" HEAD ", "p"},
)

# Only the <script> text survives the exclusions.
self.assertEqual(
chunker.chunk(html),
[
'{"@context":"https://schema.org"}',
],
)

def test_greedily_aggregates_sibling_nodes(self):
html = """
<div>
Expand Down
Loading

0 comments on commit 57cae64

Please sign in to comment.