Skip to content

Commit 42517e7

Browse files
Merge pull request #122 from neherlab/description-refactor
Description refactor
2 parents 038209f + 7d54066 commit 42517e7

File tree

10 files changed

+50
-23
lines changed

10 files changed

+50
-23
lines changed

docs/docs/pypangraph/tutorial4.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,10 @@ For these cases, pypangraph provides a method to quickly survey all changes in c
9292

9393
![minimal synteny units](../assets/pp_t4_minimal_synteny_units.png)
9494

95-
For this part of the tutorial we will analyze the `ecoli_graph.json.gz` graph, containing 10 _E. coli_ chromosomes. The minimal sinteny units for this graph can be extracted with the function:
95+
For this part of the tutorial we will analyze the `graph.json` file created [in the first tutorial](../tutorial/tutorial_1.md#building-the-pangraph), containing 10 _E. coli_ chromosomes. The minimal synteny units for this graph can be extracted with the function:
9696

9797
```python
98-
graph = pp.Pangraph.from_json("ecoli_graph.json.gz")
98+
graph = pp.Pangraph.from_json("graph.json")
9999

100100
# find MSUs
101101
threshold_len = 100 # minimal length of core blocks to consider
@@ -142,7 +142,7 @@ Similarly to what done for plasmids, we can visualize these units on Bandage. We
142142
pangraph export gfa \
143143
--no-duplicated \
144144
--minimum-depth 10 \
145-
ecoli_graph.json.gz > ecoli.gfa
145+
graph.json > ecoli.gfa
146146
```
147147

148148
And then we can export the dictionary of core-block colors with:

docs/docs/tutorial/tutorial_2.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ A path object has the following structure:
2929
"name": "NZ_CP010150",
3030
"nodes": [10429785587629589393, 10765941013351965021, 7771937209474314297, ...],
3131
"circular": true,
32-
"tot_len": 4827779
32+
"tot_len": 4827779,
33+
"desc": null
3334
},
3435
```
3536

@@ -43,6 +44,7 @@ Here is a complete list containing a description of every entry in the path obje
4344
- `nodes` : the ordered list of node ids that make up the path.
4445
- `circular` : indicates whether the considered sequence is circular (e.g. plasmid) or not. This is controlled by the `--circular` flag of the [`build` command](../reference#pangraph-build).
4546
- `tot_len` : the total length of the path, in nucleotides.
47+
- `desc` : (optional) path description string, corresponding to the description field in the input fasta file. For example `>NZ_CP010150 Escherichia coli strain 1303 chromosome, complete genome` would be split into `name="NZ_CP010150"` and `desc="Escherichia coli strain 1303 chromosome, complete genome"`.
4648

4749

4850
## Nodes

packages/pangraph/src/pangraph/pangraph_block.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,10 @@ impl PangraphBlock {
167167
)
168168
.unwrap();
169169

170-
let id = format!("{node_id} {meta}");
171-
(id, None)
170+
let id = node_id.to_string();
171+
let descr = Some(meta);
172+
173+
(id, descr)
172174
}
173175
RecordNaming::Path => {
174176
let path_id = graph.nodes[node_id].path_id();

packages/pypangraph/pypangraph/class_graph.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,12 @@ def core_genome_alignment(self, guide_strain=None):
200200
# convert to biopython alignment
201201
records = []
202202
for strain, seq in alignment.items():
203-
record = SeqRecord.SeqRecord(Seq.Seq(seq), id=strain, description="")
203+
desc = self.paths[strain].desc
204+
if desc is None:
205+
desc = ""
206+
record = SeqRecord.SeqRecord(
207+
Seq.Seq(seq), id=strain, name="", description=desc
208+
)
204209
records.append(record)
205210
alignment = AlignIO.MultipleSeqAlignment(records)
206211

packages/pypangraph/pypangraph/class_path.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33

44
class Path:
55
"""Pangraph path object. It has attributes:
6-
- name (str): strain name
6+
- name (str): strain name (fasta id in pangraph input file)
7+
- desc (str): strain description (fasta description in pangraph input file), or None if absent
78
- circular (bool): whether the path object is circular
89
- nodes (list): list of node ids in the path
910
- nuc_len (int): total length of the path in base-pairs
@@ -15,6 +16,7 @@ def __init__(self, pan_path):
1516
self.circular = pan_path["circular"]
1617
self.nodes = pan_path["nodes"]
1718
self.nuc_len = pan_path["tot_len"]
19+
self.desc = pan_path.get("desc", None)
1820

1921
def __len__(self):
2022
"""Returns the number of nodes in the path"""

packages/pypangraph/pypangraph/pangraph_schema.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"tot_len": {"type": "integer", "format": "uint", "minimum": 0},
3333
"circular": {"type": "boolean"},
3434
"name": {"type": ["string", "null"]},
35+
"desc": {"type": ["string", "null"]},
3536
},
3637
"required": ["id", "nodes", "tot_len", "circular"],
3738
},
-5.62 MB
Binary file not shown.
453 KB
Binary file not shown.

packages/pypangraph/tests/data/plasmids.json

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@
6666
],
6767
"tot_len": 80596,
6868
"circular": true,
69-
"name": "RCS48_p1"
69+
"name": "RCS48_p1",
70+
"desc": null
7071
},
7172
"1": {
7273
"id": 1,
@@ -135,7 +136,8 @@
135136
],
136137
"tot_len": 80790,
137138
"circular": true,
138-
"name": "RCS49_p1"
139+
"name": "RCS49_p1",
140+
"desc": null
139141
},
140142
"2": {
141143
"id": 2,
@@ -209,7 +211,8 @@
209211
],
210212
"tot_len": 91307,
211213
"circular": true,
212-
"name": "RCS64_p2"
214+
"name": "RCS64_p2",
215+
"desc": null
213216
},
214217
"3": {
215218
"id": 3,
@@ -279,7 +282,8 @@
279282
],
280283
"tot_len": 87487,
281284
"circular": true,
282-
"name": "RCS80_p1"
285+
"name": "RCS80_p1",
286+
"desc": null
283287
},
284288
"4": {
285289
"id": 4,
@@ -357,7 +361,8 @@
357361
],
358362
"tot_len": 106000,
359363
"circular": true,
360-
"name": "RCS100_p1"
364+
"name": "RCS100_p1",
365+
"desc": null
361366
},
362367
"5": {
363368
"id": 5,
@@ -443,7 +448,8 @@
443448
],
444449
"tot_len": 108661,
445450
"circular": true,
446-
"name": "RCS72_p1"
451+
"name": "RCS72_p1",
452+
"desc": null
447453
},
448454
"6": {
449455
"id": 6,
@@ -531,7 +537,8 @@
531537
],
532538
"tot_len": 111333,
533539
"circular": true,
534-
"name": "RCS34_p1"
540+
"name": "RCS34_p1",
541+
"desc": null
535542
},
536543
"7": {
537544
"id": 7,
@@ -617,7 +624,8 @@
617624
],
618625
"tot_len": 108379,
619626
"circular": true,
620-
"name": "RCS75_p1"
627+
"name": "RCS75_p1",
628+
"desc": null
621629
},
622630
"8": {
623631
"id": 8,
@@ -704,7 +712,8 @@
704712
],
705713
"tot_len": 110066,
706714
"circular": true,
707-
"name": "RCS76_p1"
715+
"name": "RCS76_p1",
716+
"desc": null
708717
},
709718
"9": {
710719
"id": 9,
@@ -772,7 +781,8 @@
772781
],
773782
"tot_len": 93239,
774783
"circular": true,
775-
"name": "RCS73_p1"
784+
"name": "RCS73_p1",
785+
"desc": null
776786
},
777787
"10": {
778788
"id": 10,
@@ -849,7 +859,8 @@
849859
],
850860
"tot_len": 89006,
851861
"circular": true,
852-
"name": "RCS29_p1"
862+
"name": "RCS29_p1",
863+
"desc": null
853864
},
854865
"11": {
855866
"id": 11,
@@ -925,7 +936,8 @@
925936
],
926937
"tot_len": 88922,
927938
"circular": true,
928-
"name": "RCS33_p1"
939+
"name": "RCS33_p1",
940+
"desc": null
929941
},
930942
"12": {
931943
"id": 12,
@@ -1004,7 +1016,8 @@
10041016
],
10051017
"tot_len": 91634,
10061018
"circular": true,
1007-
"name": "RCS71_p1"
1019+
"name": "RCS71_p1",
1020+
"desc": null
10081021
},
10091022
"13": {
10101023
"id": 13,
@@ -1077,7 +1090,8 @@
10771090
],
10781091
"tot_len": 94836,
10791092
"circular": true,
1080-
"name": "RCS74_p1"
1093+
"name": "RCS74_p1",
1094+
"desc": null
10811095
},
10821096
"14": {
10831097
"id": 14,
@@ -1160,7 +1174,8 @@
11601174
],
11611175
"tot_len": 117655,
11621176
"circular": true,
1163-
"name": "RCS58_p1"
1177+
"name": "RCS58_p1",
1178+
"desc": null
11641179
}
11651180
},
11661181
"blocks": {
23.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)