Skip to content

Commit 5da361b

Browse files
authored
Merge pull request #666 from benjeffery/top-level-metadata-python
Add high-level python API for tree-sequence metadata
2 parents 5958f0e + f96a4db commit 5da361b

File tree

7 files changed

+194
-14
lines changed

7 files changed

+194
-14
lines changed

docs/data-model.rst

+2
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,8 @@ interchanged, each row is `base 64 encoded <https://en.wikipedia.org/wiki/Base64
501501
Thus, binary information can be safely printed and exchanged, but may not be
502502
human readable.
503503

504+
The tree sequence itself also has metadata stored as a byte array.
505+
504506
.. _sec_valid_tree_sequence_requirements:
505507

506508
Valid tree sequence requirements

docs/metadata.rst

+9-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
Metadata
55
========
66

7-
Every entity (nodes, mutations, edges, etc.) in a tskit tree sequence can have
7+
The tree-sequence and every entity within it (nodes, mutations, edges, etc.) can have
88
metadata associated with it. This is intended for storing and passing on information
99
that tskit itself does not use or interpret. For example, information derived from a VCF
1010
INFO field, or administrative information (such as unique identifiers) relating to
@@ -170,11 +170,17 @@ attribute (e.g. :attr:`tskit.IndividualTable.metadata_schema`). The schemas
170170
for all tables can be retrieved from a :class:`tskit.TreeSequence` by the
171171
:attr:`tskit.TreeSequence.table_metadata_schemas` attribute.
172172

173+
The top-level tree sequence metadata schema is set via
174+
:attr:`tskit.TableCollection.metadata_schema` and can be accessed via
175+
:attr:`tskit.TreeSequence.metadata_schema`.
176+
173177
Each table's ``add_row`` method (e.g. :meth:`tskit.IndividualTable.add_row`) will
174-
validate and encode the metadata using the schema.
178+
validate and encode the metadata using the schema. This encoding will also happen when
179+
tree sequence metadata is set (e.g. ``table_collection.metadata = {...}``).
175180

176181
Metadata will be lazily decoded if accessed via
177-
``tables.individuals[0].metadata`` or ``tree_sequence.individual(0).metadata``.
182+
``tables.individuals[0].metadata``, ``tree_sequence.individual(0).metadata`` or
183+
``tree_sequence.metadata``.
178184

179185
In the interests of efficiency the bulk methods of ``set_columns``
180186
(e.g. :meth:`tskit.IndividualTable.set_columns`)

python/CHANGELOG.rst

+3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ In development
2727
on calls to ``table[j]`` and e.g. ``tree_sequence.node(j)``. See :ref:`sec_metadata`.
2828
(:user:`benjeffery`, :pr:`491`, :pr:`542`, :pr:`543`, :pr:`601`)
2929

30+
- The tree-sequence now has top-level metadata with a schema.
31+
(:user:`benjeffery`, :pr:`666`, :pr:`644`, :pr:`642`)
32+
3033
- Add classes to SVG drawings to allow easy adjustment and styling, and document the new
3134
``tskit.Tree.draw_svg()`` and ``tskit.TreeSequence.draw_svg()`` methods. This also fixes
3235
:issue:`467` for duplicate SVG entity ``id`` s in Jupyter notebooks.

python/tests/test_highlevel.py

+32-2
Original file line numberDiff line numberDiff line change
@@ -1368,7 +1368,37 @@ class TestTreeSequenceMetadata(unittest.TestCase):
13681368
},
13691369
)
13701370

1371-
def test_metadata_schemas(self):
1371+
def test_tree_sequence_metadata_schema(self):
1372+
tc = tskit.TableCollection(1)
1373+
ts = tc.tree_sequence()
1374+
self.assertEqual(str(ts.metadata_schema), str(tskit.MetadataSchema(None)))
1375+
tc.metadata_schema = self.metadata_schema
1376+
ts = tc.tree_sequence()
1377+
self.assertEqual(str(ts.metadata_schema), str(self.metadata_schema))
1378+
with self.assertRaises(AttributeError):
1379+
del ts.metadata_schema
1380+
with self.assertRaises(AttributeError):
1381+
ts.metadata_schema = tskit.MetadataSchema(None)
1382+
1383+
def test_tree_sequence_metadata(self):
1384+
tc = tskit.TableCollection(1)
1385+
ts = tc.tree_sequence()
1386+
self.assertEqual(ts.metadata, b"")
1387+
tc.metadata_schema = self.metadata_schema
1388+
data = {
1389+
"table": "tree-sequence",
1390+
"string_prop": "stringy",
1391+
"num_prop": 42,
1392+
}
1393+
tc.metadata = data
1394+
ts = tc.tree_sequence()
1395+
self.assertEqual(ts.metadata, data)
1396+
with self.assertRaises(AttributeError):
1397+
ts.metadata = {"should": "fail"}
1398+
with self.assertRaises(AttributeError):
1399+
del ts.metadata
1400+
1401+
def test_table_metadata_schemas(self):
13721402
ts = msprime.simulate(5)
13731403
for table in self.metadata_tables:
13741404
tables = ts.dump_tables()
@@ -1405,7 +1435,7 @@ def test_metadata_schemas(self):
14051435
tskit.MetadataSchema({"codec": "json"}),
14061436
)
14071437

1408-
def test_metadata_round_trip_via_row_getters(self):
1438+
def test_table_metadata_round_trip_via_row_getters(self):
14091439
# A tree sequence with all entities
14101440
pop_configs = [msprime.PopulationConfiguration(5) for _ in range(2)]
14111441
migration_matrix = [[0, 1], [1, 0]]

python/tests/test_tables.py

+97
Original file line numberDiff line numberDiff line change
@@ -2109,6 +2109,103 @@ def test_sequence_length_longer_than_edges(self):
21092109
self.assertEqual(len(tree.parent_dict), 0)
21102110

21112111

2112+
class TestTableCollectionMetadata(unittest.TestCase):
2113+
2114+
metadata_schema = metadata.MetadataSchema(
2115+
{
2116+
"codec": "json",
2117+
"title": "Example Metadata",
2118+
"type": "object",
2119+
"properties": {
2120+
"one": {"type": "string"},
2121+
"two": {"type": "number"},
2122+
"three": {"type": "array"},
2123+
"four": {"type": "boolean"},
2124+
},
2125+
"required": ["one", "two", "three", "four"],
2126+
"additionalProperties": False,
2127+
},
2128+
)
2129+
2130+
def metadata_example_data(self, val=0):
2131+
return {
2132+
"one": "val one",
2133+
"two": val,
2134+
"three": list(range(val, val + 10)),
2135+
"four": True,
2136+
}
2137+
2138+
def test_set_metadata_schema(self):
2139+
tc = tskit.TableCollection(1)
2140+
metadata_schema2 = metadata.MetadataSchema({"codec": "json"})
2141+
# Default is no-op metadata codec
2142+
self.assertEqual(str(tc.metadata_schema), str(metadata.MetadataSchema(None)))
2143+
# Set
2144+
tc.metadata_schema = self.metadata_schema
2145+
self.assertEqual(str(tc.metadata_schema), str(self.metadata_schema))
2146+
# Overwrite
2147+
tc.metadata_schema = metadata_schema2
2148+
self.assertEqual(str(tc.metadata_schema), str(metadata_schema2))
2149+
# Remove
2150+
tc.metadata_schema = ""
2151+
self.assertEqual(str(tc.metadata_schema), str(metadata.MetadataSchema(None)))
2152+
# Set after remove
2153+
tc.metadata_schema = self.metadata_schema
2154+
self.assertEqual(str(tc.metadata_schema), str(self.metadata_schema))
2155+
# Del should fail
2156+
with self.assertRaises(AttributeError):
2157+
del tc.metadata_schema
2158+
# None should fail
2159+
with self.assertRaises(ValueError):
2160+
tc.metadata_schema = None
2161+
2162+
def test_set_metadata(self):
2163+
tc = tskit.TableCollection(1)
2164+
# Default is empty bytes
2165+
self.assertEqual(tc.metadata, b"")
2166+
2167+
tc.metadata_schema = self.metadata_schema
2168+
md1 = self.metadata_example_data()
2169+
md2 = self.metadata_example_data(val=2)
2170+
# Set
2171+
tc.metadata = md1
2172+
self.assertEqual(tc.metadata, md1)
2173+
# Overwrite
2174+
tc.metadata = md2
2175+
self.assertEqual(tc.metadata, md2)
2176+
# Del should fail
2177+
with self.assertRaises(AttributeError):
2178+
del tc.metadata
2179+
# None should fail
2180+
with self.assertRaises(exceptions.MetadataValidationError):
2181+
tc.metadata = None
2182+
2183+
def test_default_metadata_schema(self):
2184+
# Default should allow bytes
2185+
tc = tskit.TableCollection(1)
2186+
tc.metadata = b"acceptable bytes"
2187+
self.assertEqual(tc.metadata, b"acceptable bytes")
2188+
# Adding non-bytes metadata should error
2189+
with self.assertRaises(TypeError):
2190+
tc.metadata = self.metadata_example_data()
2191+
2192+
def test_round_trip_metadata(self):
2193+
data = self.metadata_example_data()
2194+
tc = tskit.TableCollection(1)
2195+
tc.metadata_schema = self.metadata_schema
2196+
tc.metadata = data
2197+
self.assertDictEqual(tc.metadata, data)
2198+
2199+
def test_bad_metadata(self):
2200+
metadata = self.metadata_example_data()
2201+
metadata["I really shouldn't be here"] = 6
2202+
tc = tskit.TableCollection(1)
2203+
tc.metadata_schema = self.metadata_schema
2204+
with self.assertRaises(exceptions.MetadataValidationError):
2205+
tc.metadata = metadata
2206+
self.assertEqual(tc.ll_tables.metadata, b"")
2207+
2208+
21122209
class TestTableCollectionPickle(unittest.TestCase):
21132210
"""
21142211
Tests that we can round-trip table collections through pickle.

python/tskit/tables.py

+24
Original file line numberDiff line numberDiff line change
@@ -1884,6 +1884,30 @@ def sequence_length(self, sequence_length):
18841884
def file_uuid(self):
18851885
return self.ll_tables.file_uuid
18861886

1887+
@property
1888+
def metadata_schema(self) -> metadata.MetadataSchema:
1889+
"""
1890+
The :class:`tskit.MetadataSchema` for this TableCollection.
1891+
"""
1892+
return metadata.parse_metadata_schema(self.ll_tables.metadata_schema)
1893+
1894+
@metadata_schema.setter
1895+
def metadata_schema(self, schema: metadata.MetadataSchema) -> None:
1896+
# Check the schema is a valid schema instance by roundtripping it.
1897+
metadata.parse_metadata_schema(str(schema))
1898+
self.ll_tables.metadata_schema = str(schema)
1899+
1900+
@property
1901+
def metadata(self) -> Any:
1902+
"""
1903+
The decoded metadata for this TableCollection.
1904+
"""
1905+
return self.metadata_schema.decode_row(self.ll_tables.metadata)
1906+
1907+
@metadata.setter
1908+
def metadata(self, metadata: Any) -> None:
1909+
self.ll_tables.metadata = self.metadata_schema.validate_and_encode_row(metadata)
1910+
18871911
def asdict(self):
18881912
"""
18891913
Returns a dictionary representation of this TableCollection.

python/tskit/trees.py

+27-9
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
import tskit.drawing as drawing
4444
import tskit.exceptions as exceptions
4545
import tskit.formats as formats
46-
import tskit.metadata as metadata
46+
import tskit.metadata as metadata_module
4747
import tskit.provenance as provenance
4848
import tskit.tables as tables
4949
import tskit.util as util
@@ -2829,19 +2829,21 @@ class _TableMetadataSchemas:
28292829
Convenience class for returning schemas
28302830
"""
28312831

2832-
node: metadata.MetadataSchema
2833-
edge: metadata.MetadataSchema
2834-
site: metadata.MetadataSchema
2835-
mutation: metadata.MetadataSchema
2836-
migration: metadata.MetadataSchema
2837-
individual: metadata.MetadataSchema
2838-
population: metadata.MetadataSchema
2832+
node: metadata_module.MetadataSchema
2833+
edge: metadata_module.MetadataSchema
2834+
site: metadata_module.MetadataSchema
2835+
mutation: metadata_module.MetadataSchema
2836+
migration: metadata_module.MetadataSchema
2837+
individual: metadata_module.MetadataSchema
2838+
population: metadata_module.MetadataSchema
28392839

28402840
def __init__(self, ll_tree_sequence):
28412841
self._ll_tree_sequence = ll_tree_sequence
28422842
metadata_schema_strings = self._ll_tree_sequence.get_table_metadata_schemas()
28432843
metadata_schema_instances = {
2844-
name: metadata.parse_metadata_schema(getattr(metadata_schema_strings, name))
2844+
name: metadata_module.parse_metadata_schema(
2845+
getattr(metadata_schema_strings, name)
2846+
)
28452847
for name in vars(self._TableMetadataSchemas)
28462848
if not name.startswith("_")
28472849
}
@@ -3155,6 +3157,22 @@ def sequence_length(self):
31553157
def get_sequence_length(self):
31563158
return self._ll_tree_sequence.get_sequence_length()
31573159

3160+
@property
3161+
def metadata(self) -> Any:
3162+
"""
3163+
The decoded metadata for this TreeSequence.
3164+
"""
3165+
return self.metadata_schema.decode_row(self._ll_tree_sequence.get_metadata())
3166+
3167+
@property
3168+
def metadata_schema(self) -> metadata_module.MetadataSchema:
3169+
"""
3170+
The :class:`tskit.MetadataSchema` for this TreeSequence.
3171+
"""
3172+
return metadata_module.parse_metadata_schema(
3173+
self._ll_tree_sequence.get_metadata_schema()
3174+
)
3175+
31583176
@property
31593177
def num_edges(self):
31603178
"""

0 commit comments

Comments
 (0)