diff --git a/.Rbuildignore b/.Rbuildignore old mode 100644 new mode 100755 index 66a530c..6c26863 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,4 +1,20 @@ -^pipeline_files$ -^tests/testthat/results$ -^tests/testthat/pf_rrbs$ +^renv$ +^renv\.lock$ +^\.Rproj$ +^\.Rproj\.user$ +^LICENSE\.md$ ^README\.Rmd$ +^data/.+$ +^docs/.+$ +^fork/.+$ +^tests/.+$ +\.Rprofile +\.renvignore +\.here +\.gitignore +\.DS_Store +^_pkgdown\.yml$ +^\.github$ +^test_.*$ +^vignettes/.+$ +^refs/.+$ diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..da0de83 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +docs/** linguist-vendored diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 2b8b259..62f64ca --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,48 @@ -.lintr -.vscode +*.pdf +*.Rhistory +.Rhistory +.RData* +.Rproj.user +archive/ +data/ +docs/example_* +docs/wgbs.nf +*.tar.gz pipeline_files/ -test_result/ -tests/testthat/pf_rrbs -tests/testthat/results -tests/testthat/result* +renv/ +.drake/ +TEST/ +*.RData +pipeline_files/* +slurm/ +results/ +transfer.zip +*./vanloo-main/ +code/**/*.png +code/**/*.stats +code/**/*.bigWig +code/*/* +camdac_paper/ +*.drawio +*.docx +.DS_Store +.here +.Rprofile man/ -CAMDAC_manual/manual_figures/*.svg \ No newline at end of file +inst/doc +*.json +temp/ +wgbs_test/ +test_*.R +.lintr +*.code-workspace +*dev.md +tests/testthat/result* +local/ +wgbs_asm/ +test_wgbs_asm/ +result* +CAMDAC_manual/ +*.log.lintr +data/ +bin/ diff --git a/DESCRIPTION b/DESCRIPTION index a04b6b9..ad1c057 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: CAMDAC Title: Copy-number Aware Methylation Deconvolution and Analysis of Cancers -Version: 0.1.0 +Version: 0.2.0 Authors@R: c( person(given = "Elizabeth", @@ -10,6 +10,9 @@ Authors@R: family = "Mensah", role = c("aut", "cre"), email="nana.mensah@crick.ac.uk"), + person(given = "Siqi", + family = "Lai", + role = c("aut")), person(given = "Carla", family = "Castignani", role 
= c("aut")), @@ -25,44 +28,52 @@ License: MIT + file LICENSE Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.2 +RoxygenNote: 7.3.1 Suggests: - testthat (>= 3.0.0), - devtools, + testthat, pryr, + devtools, tictoc, mockr, tidyr, knitr, rmarkdown, - pkgdown, - usethis + rcmdcheck biocViews: Imports: + Rsamtools, + fs, + qs, + fst, + parallel, + doParallel, + gtable, + stats, optparse, - logger -Depends: - R (>= 3.6.2), - ASCAT, - stringr, + readr, + scales, + rtracklayer, + R.utils, + Battenberg, + data.table (>= 1.14.6), + foreach, GenomicRanges, GenomicAlignments, - Rsamtools, - scales, + GenomeInfoDb, + IRanges, + S4Vectors, + logging, ggplot2, - gridExtra, - gtable, - data.table, - readr, dplyr, - parallel, - doParallel, - devtools, - fst, - fs, + gridExtra, + grid, + stringr, png, - grid -Remotes: VanLoo-Lab/ascat/ASCAT@v3.2.0, -VignetteBuilder: knitr -Config/testthat/edition: 3 -Config/testthat/start-first: test_download* + MASS, + ASCAT (>= 3.0.0) +Depends: + R (>= 3.6.2) +Remotes: + Battenberg=NMNS93/battenberg@dev-ascat3, + ASCAT=Vanloo-Lab/ascat/ASCAT/@44ddd3080723a2c3640d1cfead13437a093c21d1 +VignetteBuilder: rmarkdown diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..73f0389 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,49 @@ +# Use a lightweight R base image +FROM rocker/r-ver:4.4.0 +# Set the working directory +WORKDIR /app + +# Install system dependencies (if needed) +# For example, if your package needs external libraries, install them here +RUN chmod 1777 /tmp +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + && apt-get install -y libxml2 libodbc1 \ + && apt-get install -y libz-dev libbz2-dev bzip2-doc zlib1g-dev \ + && apt-get install -y liblzma-dev libcurl4-openssl-dev wget \ + && apt-get install -y default-jre git curl bzip2 pandoc + +# Install alleleCounter +RUN git clone https://github.com/cancerit/alleleCount.git \ + && cd alleleCount && bash ./setup.sh /usr/local +RUN 
echo 'export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH' >> ~/.bashrc +RUN echo 'export PATH=/usr/local/bin:$PATH' >> ~/.bashrc + +# Set WORKDIR +WORKDIR /app + +# Install CAMDAC +RUN R -q -e 'install.packages("remotes")' +RUN R -q -e 'install.packages("devtools")' + +# Create ~/.R directory and configure Makevars file for R-specific flags. +# Required for bioconductor packages used in Battenberg install. +RUN mkdir -p /root/.R && \ + echo 'CXXFLAGS=-Wall -Wno-format-security' >> /root/.R/Makevars && \ + echo 'CFLAGS=-Wall -Wno-format-security' >> /root/.R/Makevars + +# Copy only DESCRIPTION and (optionally) NAMESPACE to install deps first +COPY DESCRIPTION NAMESPACE* ./ +RUN R -q -e 'remotes::install_deps(".", dependencies = TRUE, upgrade = "never", lib = "/usr/local/lib/R/site-library")' + +# Generate docs +RUN R -q -e 'devtools::document()' + +# Install CAMDAC from local directories +COPY R ./R/ +COPY vignettes ./vignettes/ +COPY man ./man/ +RUN Rscript -e 'remotes::install_local(".", dependencies = TRUE, upgrade = "never", lib = "/usr/local/lib/R/site-library")' + +# Set the command to be used +CMD ["/usr/local/bin/Rscript"] diff --git a/LICENSE b/LICENSE index f288702..f3c1119 100755 --- a/LICENSE +++ b/LICENSE @@ -1,674 +1,21 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users.
We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. 
The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. 
Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. 
A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. 
You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. 
This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. 
For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. 
Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. 
Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. 
- - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <https://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<https://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<https://www.gnu.org/licenses/why-not-lgpl.html>.
+# MIT License + +Copyright (c) 2020 CAMDAC + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/NAMESPACE b/NAMESPACE old mode 100644 new mode 100755 index 8748a19..c7f9fcc --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,68 @@ # Generated by roxygen2: do not edit by hand -export(ascat.m.plotRawData) +export(CamConfig) +export(CamSample) export(ascat.m.plotSegmentedData) +export(asm_pipeline) +export(attach_output) +export(cmain_bind_snps) +export(cmain_call_cna) +export(cmain_call_dmps) +export(cmain_call_dmrs) +export(cmain_count_alleles) +export(cmain_deconvolve_methylation) +export(cmain_make_methylation_profile) +export(cmain_make_snps) +export(cmain_run_ascat) +export(cmain_run_battenberg) +export(cwrap_get_allele_counts) export(download_pipeline_files) -export(pipeline_tumor_normal) -export(plot_methylation_info) +export(get_fpath) +export(get_reference_files) +export(load_cna_data) +export(load_loci_for_segment) +export(panel_meth_from_beta) +export(panel_meth_from_counts) +export(pipeline) +export(preprocess_asm) +export(preprocess_wgbs) +export(sort_genomic_dt) +import(GenomeInfoDb) +import(GenomicAlignments) +import(GenomicRanges) +import(MASS) +import(Rsamtools) +import(S4Vectors) +import(data.table) +import(doParallel) +import(dplyr) +import(foreach) +import(ggplot2) +import(gridExtra) +import(logging) +import(png) +import(stringr) +importFrom(IRanges,IRanges) +importFrom(grDevices,adjustcolor) +importFrom(grDevices,dev.off) +importFrom(grDevices,png) +importFrom(grDevices,rgb) +importFrom(graphics,abline) +importFrom(graphics,axis) +importFrom(graphics,par) +importFrom(graphics,plot) +importFrom(graphics,points) +importFrom(graphics,rect) +importFrom(graphics,text) +importFrom(stats,cor) +importFrom(stats,frequency) +importFrom(stats,lm) +importFrom(stats,median) +importFrom(stats,na.omit) +importFrom(stats,optimize) +importFrom(stats,qbeta) +importFrom(stats,rbeta) +importFrom(stats,runif) +importFrom(stats,setNames) +importFrom(utils,read.table) +importFrom(utils,write.table) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 
0000000..50b40aa --- /dev/null +++ b/NEWS.md @@ -0,0 +1,9 @@ +# CAMDAC 0.2.0 + +* Integrated RRBS and WGBS analysis under a single call to the `pipeline()` function. +* Added option for RRBS paired end analysis +* Upgraded ASCAT to version 3.2.0 (commit: 44ddd3080723a2c3640d1cfead13437a093c21d1) + +# CAMDAC 0.1.0 + +* Minor documentation updates. diff --git a/R/.Rhistory b/R/.Rhistory deleted file mode 100644 index e69de29..0000000 diff --git a/R/allele_counts.R b/R/allele_counts.R new file mode 100755 index 0000000..41ddc9d --- /dev/null +++ b/R/allele_counts.R @@ -0,0 +1,927 @@ +detect_genome_build <- function(bam_file) { + # Get genome build from BAM header for opening reference files + + # Get reference sequence lengths from BAM header + ref_lengths <- Rsamtools::scanBamHeader(bam_file)[[1]][[1]] + # Test whether chromosome X length is expected for hg19 + # X used as can be selected regardless of reference + x_is_hg19 <- ref_lengths[grepl("X$", names(ref_lengths))] == "155270560" + # Set build + build <- ifelse(unname(x_is_hg19), "hg19", "hg38") + + return(build) +} + +get_reads_in_segments <- function(bam_file, segments, min_mapq, paired_end = FALSE) { + # Get reads from pre-determined regions (segments) + + # Set BAM flags required for paired end reads + if (paired_end) { + # For paired-end sequencing, we use the settings below to limit to following flags to ensure we + # are only left with Bismark informative paired reads: + # R1 R2 + # OT 99 147 + # OB 83 163 + flag <- Rsamtools::scanBamFlag( + isPaired = TRUE, isProperPair = TRUE, + isSecondaryAlignment = FALSE, isSupplementaryAlignment = FALSE, + isUnmappedQuery = FALSE, isDuplicate = FALSE + ) + # Adding flag to determine PE OT/OB downstream + what <- c("qname", "rname", "flag", "strand", "pos", "qwidth", "mapq", "seq", "qual", "groupid", "cigar", "mate_status") + asMates <- TRUE # Adds additional mates column to for filtering. BamFile object only. 
+ } else { + flag <- Rsamtools::scanBamFlag() + what <- c("qname", "rname", "strand", "pos", "qwidth", "mapq", "seq", "qual", "cigar") + asMates <- FALSE + } + + # Set parameters and read BamFile. + # ScanBamParam "whats" options are available via scanBamWhat() + param <- Rsamtools::ScanBamParam(which = segments, what = what, flag = flag, mapqFilter = min_mapq) + bam_object <- Rsamtools::BamFile(bam_file, index = paste0(bam_file, ".bai"), asMates = asMates) + # Read BAM file as a list of lists + bam <- Rsamtools::scanBam(bam_object, param = param) + + # Convert BAM to data.table with one row per read + # The lapply operation takes bam from a list-of-lists to a list-of-dataframes, + # which rbindlist combines into a single datatable. + bam_dt <- data.table::rbindlist( + lapply(bam, as.data.frame.list) + ) + + # Format BAM to ucsc format if not already + is_ucsc <- startsWith(seqnames(seqinfo(BamFile(bam_file)))[[1]], "chr") + if (!is_ucsc & nrow(bam_dt) > 0) { + bam_dt$rname <- paste0("chr", bam_dt$rname) + } + + return(bam_dt) + +} + +format_bam_for_loci_overlap <- function(bam_dt, paired_end = FALSE) { + setnames(bam_dt, "rname", "chrom") + setnames(bam_dt, "pos", "start") + bam_dt[, end := (start + (qwidth - 1))] + + # Ensure chrom column is UCSC format + # CAMDAC-RRBS performs Ensembl/UCSC format checks on seqnames. + # CAMDAC-WGBS will use UCSC as default (e.g. 
chr1) except for ASCAT where Ensembl format is required + chr_entry <- as.character(bam_dt[["chrom"]][[1]]) + if (!startsWith(chr_entry, "chr")) { + bam_dt[, chrom := paste0("chr", chrom)] + } + + keep_columns <- c("qname", "chrom", "strand", "start", "end", "mapq", "seq", "qual", "cigar") + + if (paired_end) { + keep_columns <- c(keep_columns, "flag", "groupid", "mate_status") + } + + # Return data table with columns filtered + return(bam_dt[, ..keep_columns]) +} + +load_loci_as_data_table <- function(loci_file, drop_ccgg = TRUE) { + base::load(loci_file) # Brings loci_subset into environment + loci_dt <- data.table::data.table(data.frame(loci_subset)) + # Filter any CCGG sites out for WGBS analysis + if (drop_ccgg) { + loci_dt <- loci_dt[width != 4] + } + setnames(loci_dt, "seqnames", "chrom") + return(loci_dt) +} + +annotate_bam_with_loci <- function(bam_dt, loci_subset, drop_ccgg = FALSE, paired_end = FALSE) { + loci_subset$chrom <- as.character(loci_subset$chrom) + data.table::setkey(loci_subset, chrom, start, end) + bam_dt$chrom <- as.character(bam_dt$chrom) + data.table::setkey(bam_dt, chrom, start, end) + # Depreciated on 210513 as loci already subset to relevant regions upstream + # First limit loci_subset to regions in BAM to speed up later overlap by ~5x + # lcgr = reduce(GRanges(seqnames=bam_dt$chrom, ranges=IRanges(bam_dt$start, bam_dt$end))) + # lcgr = data.table(data.frame(lcgr)); names(lcgr)[1] = "chrom"; setkey(lcgr, chrom, start, end) + # loci_subset = foverlaps(loci_subset, lcgr) + # loci_subset = loci_subset[!is.na(start)] + # loci_subset[,`:=`(start=NULL, end=NULL, width=NULL, strand=NULL)] + # names(loci_subset) = gsub("^i.","",names(loci_subset)) + # setkey(loci_subset, chrom,start,end) + + # Filter CCGG loci if WGBS + if (drop_ccgg) { + loci_subset <- loci_subset[width != 4] + } + + bam_loci_overlap <- data.table::foverlaps(bam_dt, loci_subset) + + # Filter overlap to expected columns and rename columns to those used in CAMDAC-RRBS + 
setnames(bam_loci_overlap, "i.start", "read.start") + setnames(bam_loci_overlap, "i.end", "read.end") + setnames(bam_loci_overlap, "mapq", "mq") + # Replace the i.strand column with the strand column + # The strand column is "*" as it derives from loci_subset, + # while i.strand derives from the BAM and contains true strand orientation for each read + bam_loci_overlap[, strand := i.strand] + bam_cols <- c( + "qname", "strand", "chrom", "read.start", "read.end", "POS", "width", + "start", "end", "ref", "alt", "seq", "qual", "mq", "cigar" + ) + if (paired_end) { + bam_cols <- c(bam_cols, "flag", "groupid", "mate_status") + } + + # Filter out rows with no loci data, set expected columns and return + bam_loci_overlap <- bam_loci_overlap[!is.na(width), ..bam_cols] + + return(bam_loci_overlap) +} + +drop_positions_outside_segments <- function(bam_dt, segments) { + # Assuming segments will be an arbitrary number of ranges for a single chrom, + # bind segments in gr with range() and get the start and end + segs_range <- range(segments) + segs_start <- start(segs_range) + segs_end <- end(segs_range) + + # Drop bam_dt positions outside of these ranges + bam_dt <- bam_dt[(start >= segs_start) & (start <= segs_end)] + + return(bam_dt) +} + +infer_ot_ob_strand_from_flag <- function(flag) { + # Check for OT/OB patterns using bitwise logic + # Bismark OT/OB flags: + # OT R1: 64 (R1) + 32 (mate_rev) + 2 (proper) + 1 (paired) = 99 + # OT R2: 128 (R2) + 16 (self_rev) + 2 + 1 = 147 + # OB R1: 64 (R1) + 16 (self_rev) + 2 + 1 = 83 + # OB R2: 128 (R2) + 32 (mate_rev) + 2 + 1 = 163 + + # Bit definitions (SAM spec) + is_paired <- bitwAnd(flag, 0x1) != 0 # 1 + is_proper_pair <- bitwAnd(flag, 0x2) != 0 # 2 + is_reverse <- bitwAnd(flag, 0x10) != 0 # 16 + mate_reverse <- bitwAnd(flag, 0x20) != 0 # 32 + is_R1 <- bitwAnd(flag, 0x40) != 0 # 64 + is_R2 <- bitwAnd(flag, 0x80) != 0 # 128 + + is_OT <- (is_R1 & !is_reverse & mate_reverse & is_proper_pair & is_paired) | + (is_R2 & is_reverse & 
!mate_reverse & is_proper_pair & is_paired) + + is_OB <- (is_R1 & is_reverse & !mate_reverse & is_proper_pair & is_paired) | + (is_R2 & !is_reverse & mate_reverse & is_proper_pair & is_paired) + + return( + ifelse(is_OT, "+", + ifelse(is_OB, "-", NA_character_)) + ) +} + +fix_pe_strand_with_flags <- function(bam_dt, paired_end = T) { + # Convert "strand" column to CAMDAC-expected strand using Bismark flags for OT/OB + # + # | R1 R2 | CAMDAC strand column + # Bismark flag OT | 99(+) 147(-) | OT = "+" + # Bismark flag OB | 83(-) 163(+) | OB = "-" + # Note, this is the same as viewing in IGV as "first of pair strand". + if (paired_end) { + setkey(bam_dt, groupid) + bam_dt[, strand := infer_ot_ob_strand_from_flag(flag)] + } + return(bam_dt) + +} + +fix_pe_overlap_at_loci <- function(bam_dt) { + # Filter mate pairs that overlap loci. + # We prefentially keep R2 as for Swift Accel-MethylSeq library, the tail of +ve R1 may + # contain adapter contaminant sequences typically trimmed off 5' +ve R2. + # See: https://swiftbiosci.com/wp-content/uploads/2019/02/16-0853-Tail-Trim-Final-442019.pdf + # 210401 - I found that the R1/"first-in-pair" flags actually have better per-base quality + # than their R2 counterparts. 
As we aren't using PE overlaps, filter R2 to see if it improves score + + # Set flag for R1 status + bam_dt[, is_r1:=(bitwAnd(flag, 0x40)!=0)] + + # Get data table of duplicated reads, + dups <- bam_dt[, .N, by = .(chrom, width, start, groupid)][N > 1, .(start, groupid)] + bam_dt <- bam_dt[ + # Using an anti-join, remove non-R1 reads from loci with pe overlap + !bam_dt[dups, on = .(start, groupid)][is_r1==FALSE], + on = .(chrom, width, start, groupid, flag) # Note: must include flag in join + ] + + # Clean R1 flag field + bam_dt[, is_r1:=NULL] + + return(bam_dt) +} + +add_loci_read_position_skipCIGAR <- function(bam_dt) { + # Take only reads without indels or clipping + bam_dt <- bam_dt[grepl("^\\d+M$", cigar)] + + # Set rstart and rend by parsing relative to genome + # This depends on strand orientation simply because read start and end are reversed + bam_dt[, `:=`( + rstart = (start - read.start + 1), + rend = (end - read.start + 1) + )] + + # Set ccgg sites to 0 for now + # TODO: Process PE CCGG (RRBS) in CAMDAC? 
+ bam_dt[width == 4, `:=`(rstart = 0, rend = 0)] + + return(bam_dt) +} + +add_loci_read_position <- function(bam_dt) { + # Setup data as GRanges and aln object + aln <- GAlignments( + seqnames = as.character(bam_dt$chrom), pos = bam_dt$read.start, + cigar = as.character(bam_dt$cigar), strand = GenomicAlignments::strand(bam_dt$strand), names = as.character(bam_dt$qname) + ) + gr <- GRanges(seqnames = bam_dt$chrom, ranges = IRanges(bam_dt$start, bam_dt$end)) + # Get loci position in read + res <- pmapToAlignments(gr, aln) + + # Get loci by mapping alignments + bam_dt[, rstart := start(res)] + bam_dt[, rend := end(res)] + bam_dt[, rwidth := width(res)] + + # Remove any loci where CIGAR operation indicates no read at position + bam_dt <- bam_dt[width == rwidth][, rwidth := NULL] + + # Return result + return(bam_dt) +} + +add_loci_read_position_legacy <- function(bam_dt, skip_cigar = T) { + # Quick-parse to only take reads without indels or clipping + if (skip_cigar) { + # Note updated CIGAR format may use X to denote mismatched bases but this is not the case + # in our TCGA or PGP data. + bam_dt <- bam_dt[grepl("^\\d+M$", cigar)] + + # Set rstart and rend by parsing relative to genome + # This depends on strand orientation simply because read start and end are reversed, + # Therefore do not apply camdac `fix_pe_strand_with_flags` before this function. 
+ bam_dt[, `:=`( + rstart = (start - read.start + 1), + rend = (end - read.start + 1) + )] + + # Set ccgg sites to 0 for now + bam_dt[width == 4, `:=`(rstart = 0, rend = 0)] + + return(bam_dt) + } + + ccgg_dt <- bam_dt[width == 4] + ccgg_dt[, `:=`(rstart = 0, rend = 0)] # Add extra columns for downstream rbind + + bam_dt <- bam_dt[width != 4] + # Get unique reads as GAlignment object, using data.table for fast unique call + aln <- unique( + data.table( + seqnames = as.character(bam_dt$chrom), pos = bam_dt$read.start, cigar = as.character(bam_dt$cigar), + strand = GenomicAlignments::strand(bam_dt$strand), names = as.character(bam_dt$qname) + ) + ) + + aln <- GAlignments( + seqnames = aln$seqnames, pos = aln$pos, + cigar = aln$cigar, strand = Rle(aln$strand), + names = aln$names + ) + + # Load SNP loci as GRanges. GenomicAlignments CIGAR parser requirement. + gr <- GRanges(seqnames = bam_dt$chrom, ranges = IRanges(bam_dt$start, bam_dt$end)) + + # Run GenomicAlignments CIGAR parser `mapToAlignments` + res <- mapToAlignments(gr, aln) + + # Format the GRanges hits and the alignment hits as a single data table + gh <- data.table(data.frame(gr[res$xHits])) + names(gh) <- c("chrom", "snp_start", "snp_end", "snp_width", "snp_strand") + ah <- data.table(data.frame(aln[res$alignmentsHits])) + names(ah) <- gsub("^", "ah_", names(ah)) + res <- cbind( + gh, ah, data.frame(IRanges::ranges(res)), + data.table(qname = as.character(seqnames(res))) + ) + rm(gh, ah, gr, aln) + + # Filter loci where the nucleotide could not be detected. These loci have + # widths that don't match the input, which only occurs where there has been an + # indel or soft-clipping. Propagating will only lead to NAs downstream. + res <- res[width == snp_width] + # Subset and rename columns + res <- res[, .(qname, ah_strand, start, end, chrom, snp_start, snp_width)] + names(res) <- c("qname", "strand", "rstart", "rend", "chrom", "start", "width") + # Remove any duplicate records due to multiple hits in `aln`. 
+ # Failure to do so leads to bam_dt annotated multiple timesres + res <- unique(res) + + # Merge with BAM and return. Adds new rstart/rend columns for nucleotide + # start and end relative to read string index + setkey(res, qname, strand, chrom, start, width) + setkey(bam_dt, qname, strand, chrom, start, width) + bam_dt <- merge(bam_dt, res) + + # Return CCGG sites + bam_dt <- rbind(bam_dt, ccgg_dt) + + return(bam_dt) +} + +drop_pe_fields <- function(bam_dt) { + pe_fields <- c("flag", "groupid", "mate_status", "cigar", "rstart", "rend") + # Drop paired-end fields from bam_dt + return(bam_dt[, !pe_fields, with = FALSE]) +} + +get_dinucs_from_seq <- function(start, read.start, seq, strand, offset = 1) { + # Returns a vector of dinucleotide strings. + # Offset is required as counting and indexing differ: + # For example, assume the refence sequence "ATCGG" and a read aligned at "CGG", + # A to C is 2 steps away (start-read.start), + # but indexing C from the string requires position 3 (start-read.start+1) + # Offset differs for RRBS CCGG sites (=2) so added as argument. 
+ dinuc_starts <- start - read.start + offset + dinuc_ends <- dinuc_starts + 1 + # Get dinucleotide from sequence with strand appended + dinucs <- paste0(substr(seq, dinuc_starts, dinuc_ends), strand) + return(dinucs) +} + +get_snps_from_seq <- function(POS, read.start, seq, strand, offset = 1) { + # Returns a vector of SNP nucleotides + snp_position_in_seq <- POS - read.start + offset + snp_allele <- paste0(substr(seq, snp_position_in_seq, snp_position_in_seq), strand) + return(snp_allele) +} + +get_qual_dinucs <- function(start, read.start, qual, offset = 1) { + # Returns a vector of qual scores for the corresponding dinucleotide + dinuc_starts <- start - read.start + offset + dinuc_ends <- dinuc_starts + 1 + qual_dinucs <- substr(qual, dinuc_starts, dinuc_ends) + return(qual_dinucs) +} + +get_qual_snps <- function(POS, read.start, qual, offset = 1) { + # Returns a vector of qual scores for the corresponding SNP + snp_position_in_qual <- POS - read.start + offset + qual_snps <- substr(qual, snp_position_in_qual, snp_position_in_qual) + return(qual_snps) +} + +get_alleles_and_qual <- function(bam_dt) { + # Set CCGG columns (for column name continuity with RRBS version) + bam_dt[width == 4, CCGG := TRUE] + bam_dt[width == 4, alleles.CCGG := paste0(substr(seq, rstart, rend), strand)] + # Set dinucleotides at CG sites for methylation rate calculation + bam_dt[width == 2, alleles.dinucs := paste0(substr(seq, rstart, rend), strand)] + bam_dt[width == 2, qual.dinucs := paste0(substr(qual, rstart, rend), strand)] + + # Set nucleotides at SNP sites, including CG-SNPs + # As POS always gives the SNP position, we need to determine how far this is from the + # feature position (SNP/CG/CCGG, given by `start`) and adjust the read string index `rstart` accordingly. 
+ # Hence, snp_pos = rstart + (POS-start) + bam_dt[!is.na(POS), alleles.SNP := paste0(substr(seq, rstart + (POS - start), rstart + (POS - start)), strand)] + bam_dt[!is.na(POS), qual.SNP := paste0(substr(qual, rstart + (POS - start), rstart + (POS - start)), strand)] + return(bam_dt) +} + +filter_bam_by_quality <- function(bam_dt, min_mapq) { + # Set boolean flags for either SNP or dinucs above base quality 20 + # Q score encoding reference: https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/QualityScoreEncoding_swBS.htm + # In the regular expression, ":-@" captures all Q score ASCII symbols between ":" and "@" + # This is capturing base quality scores at q20 and above + hi_qual_dinucs <- data.table::like(bam_dt$qual.dinucs, "([5-9A-K:-@])([5-9A-K:-@])") + hi_qual_snps <- data.table::like(bam_dt$qual.SNP, "([5-9A-K:-@])") + + # Set dinuc data to NA at positions where only SNP passed the quality filter + # This keeps the SNP data for downstream BAF/LogR but excludes the site from + # methylation rate calculations + bam_dt[ + width >= 2 & !hi_qual_dinucs, + c("alleles.dinucs", "qual.dinucs") := NA + ] + + # Filter BAM for high quality dinucleotides and SNPs + bam_dt <- bam_dt[ + (width >= 2 & hi_qual_dinucs) | # Hi quality dinucleotide filter + (!is.na(POS) & hi_qual_snps) # Hi quality SNP filter + ] + + # Filter records for minimum mapping quality + # Note: mq filtering can also be applied by ScanBam + bam_dt <- bam_dt[mq >= min_mapq] + + return(bam_dt) +} + +# Remove reads that do not cover entire dinucleotide at one +filter_clipped_dinucleotides <- function(bam_dt) { + bam_dt[!(width >= 2 & read.start == end)] +} + +annotate_nucleotide_counts <- function(bam_dt, rrbs = FALSE) { + # Creates columns with binary flags for nucleotides on forward and reverse reads + bam_dt[, Af := fifelse(alleles.SNP == "A+", 1, 0)] + bam_dt[, Ar := fifelse(alleles.SNP == "A-", 1, 0)] + bam_dt[, Cf := fifelse(alleles.SNP == "C+", 1, 0)] + bam_dt[, Cr 
:= fifelse(alleles.SNP == "C-", 1, 0)] + bam_dt[, Gf := fifelse(alleles.SNP == "G+", 1, 0)] + bam_dt[, Gr := fifelse(alleles.SNP == "G-", 1, 0)] + bam_dt[, Tf := fifelse(alleles.SNP == "T+", 1, 0)] + bam_dt[, Tr := fifelse(alleles.SNP == "T-", 1, 0)] + # Set dinucleotdies expected at CpG sites + bam_dt[, CGr := fifelse(alleles.dinucs == "CG-", 1, 0)] + bam_dt[, CGf := fifelse(alleles.dinucs == "CG+", 1, 0)] + bam_dt[, CAr := fifelse(alleles.dinucs == "CA-", 1, 0)] + bam_dt[, TGf := fifelse(alleles.dinucs == "TG+", 1, 0)] + + if (rrbs) { + # Assign fragments breakpoint for RRBS + bam_dt[, CCGG := fifelse(CCGG == "5pCCGG", 1, 0)] + } else { + # Set CCGG to 0 for WGBS + bam_dt[, CCGG := 0] + } + return(bam_dt) +} + +flatten_pileup_to_counts <- function(bam_dt) { + allele_counts <- bam_dt[, .( + "Af" = sum(Af, na.rm = TRUE), + "Ar" = sum(Ar, na.rm = TRUE), + "Cf" = sum(Cf, na.rm = TRUE), + "Cr" = sum(Cr, na.rm = TRUE), + "Gf" = sum(Gf, na.rm = TRUE), + "Gr" = sum(Gr, na.rm = TRUE), + "Tf" = sum(Tf, na.rm = TRUE), + "Tr" = sum(Tr, na.rm = TRUE), + "CGf" = sum(CGf, na.rm = TRUE), + "CGr" = sum(CGr, na.rm = TRUE), + "TGf" = sum(TGf, na.rm = TRUE), + "CAr" = sum(CAr, na.rm = TRUE), + # total_depth counts reads contributing to position defined by "keyby" field, + # hence Af selection is arbitrary and any field could be used. + "total_depth" = length(Af), + "CCGG" = sum(CCGG, na.rm = TRUE), + "mq" = median(as.numeric(mq), na.rm = TRUE) + ), + keyby = .(chrom, start, end, width, POS, ref, alt) + ] + return(allele_counts) # df_summary in CAMDAC_RRBS +} + +get_snp_allele_counts <- function(pileup_summary) { + # Set SNP, a string of ref-alt combined from loci annotation + pileup_summary[!is.na(ref), SNP := paste0(ref, alt)] + # Count REF alleles. 
Any missing loci are set to NA + pileup_summary[, ref_counts := data.table::fcase( + SNP == "AC", Af + Ar, + SNP == "CA", Tf + Cr + Cf, + SNP == "AG", Af, + SNP == "GA", Gf, + SNP == "AT", Af + Ar, + SNP == "TA", Tf + Tr, + SNP == "GT", Gf + Ar + Gr, + SNP == "TG", Tf + Tr, + SNP == "CG", Tf + Cr + Cf, + SNP == "GC", Gf + Ar + Gr, + SNP == "CT", Cr, + SNP == "TC", Tr + )] + + # Count ALT alleles. Any loci not present are set to NA + pileup_summary[, alt_counts := data.table::fcase( + SNP == "AC", Tf + Cr + Cf, + SNP == "CA", Af + Ar, + SNP == "AG", Gf, + SNP == "GA", Af, + SNP == "AT", Tf + Tr, + SNP == "TA", Af + Ar, + SNP == "GT", Tf + Tr, + SNP == "TG", Gf + Ar + Gr, + SNP == "CG", Gf + Ar + Gr, + SNP == "GC", Tf + Cr + Cf, + SNP == "CT", Tr, + SNP == "TC", Cr + )] + # Count total reads contributing to SNP ref/alt counts + pileup_summary[, total_counts := ref_counts + alt_counts] + + # Count all reads with nucleotides expected by CAMDAC rules. This includes + # positions where we couldn't distinguish between bisulfite conversion and SNPs. + # This will later be subtracted from total depth to determine unexpected nucleotide count. + pileup_summary[, all_counts := data.table::fcase( + is.na(ref), TGf + CAr + CGf + CGr, + # For CT/AG SNPs, expected nucleotides are not in total_counts because they + # confound bisulfite conversion, however we add them here for all_counts. + SNP %like% "([GA][AG])", Af + Gf + Ar + Gr, + SNP %like% "([CT][TC])", Cr + Tr + Tf + Cf, + !is.na(ref), total_counts # All other positions get ref/alt counts + )] + + # Count reads that do not have expected bases at SNP positions (i.e. SNV) + pileup_summary[, other_counts := total_depth - all_counts] + + return(pileup_summary) +} + +get_naive_snp_allele_counts <- function(pileup_summary) { + logging::logdebug("Using naive", logger="CAMDAC") + # Set SNP, a string of ref-alt combined from loci annotation + pileup_summary[!is.na(ref), SNP := paste0(ref, alt)] + # Count REF alleles. 
# TODO: Set min_cov in CAMDAC user config
get_methylation_counts <- function(pileup_summary, min_cov) {
  # Derive methylated (M) and unmethylated (UM) read counts for CpG loci.
  #
  # Only rows with width > 1 (CpG dinucleotides) are assigned; other rows keep
  # NA. Requires the SNP column created by the SNP allele counting step.
  # Columns M, UM and total_counts_m are added by reference and the table is
  # returned.
  # Note: In RRBS version, CCGGs must be matched to SNP positions with a +1 offset
  pileup_summary[
    width > 1,
    c("M", "UM") := {
      no_snp <- is.na(SNP)
      # CpG starts at a C/T SNP: the SNP cannot be told apart from bisulfite
      # conversion, so only the reverse strand is counted
      ct_snp_at_start <- start == POS & SNP %like% "([CT][TC])"
      # CpG ends at an A/G SNP: same confounding, so only the forward strand
      # is counted
      ag_snp_at_end <- end == POS & SNP %like% "([AG][GA])"
      # fcase walks the conditions in order, so the final `!no_snp` branch
      # only catches SNP loci not handled by the two strand-specific rules
      list(
        data.table::fcase(
          no_snp, CGf + CGr,
          ct_snp_at_start, CGr,
          ag_snp_at_end, CGf,
          !no_snp, CGf + CGr
        ),
        data.table::fcase(
          no_snp, TGf + CAr,
          ct_snp_at_start, CAr,
          ag_snp_at_end, TGf,
          !no_snp, TGf + CAr
        )
      )
    }
  ]

  # Blank out methylation counts where the dinucleotide depth is too low, so
  # that CpGs where only the SNP has informative reads are not carried into
  # downstream methylation analysis.
  # NOTE(review): `<=` also blanks loci with exactly min_cov reads, whereas
  # filter_bad_allele_count_rows keeps `total_counts_m >= min_cov` - confirm
  # which threshold is intended.
  pileup_summary[width > 1 & (M + UM <= min_cov), c("M", "UM") := NA]

  # Number of reads reporting the CG methylation state
  pileup_summary[width > 1, total_counts_m := M + UM]

  return(pileup_summary)
}
filter_bad_allele_count_rows <- function(pileup_summary, min_cov) {
  # Drop loci that cannot be used for either SNP or methylation analysis.
  # Three filters are applied in sequence; rows where a filter condition
  # evaluates to NA are dropped, as data.table treats NA in `i` as FALSE.
  #
  # 1. Keep rows with a value for total_counts (NA when not a SNP) or
  #    total_counts_m (NA when the methylation read filter was not met).
  # 2. Keep rows where unexpected bases (other_counts, i.e. SNV) are at most
  #    one read or at most 5% of total depth.
  #    TODO: Report CpG sites filtered for logfile
  # 3. Keep rows with at least min_cov reads for ref/alt alleles or for
  #    methylation.
  #    Note: minimum reads filter could be implemented elsewhere in pipeline
  filtered <- pileup_summary[
    (!is.na(total_counts) | !is.na(total_counts_m))
  ][
    (other_counts <= 1) | other_counts <= 0.05 * total_depth
  ][
    total_counts_m >= min_cov | total_counts >= min_cov
  ]

  return(filtered)
}
format_naive_get_reads_result <- function(dt) {
  # Format the naive allele-count table for output.
  #
  # Adds a `CHR` column holding the original (chr-prefixed) chromosome name,
  # converts `chrom` to an ordered factor without the prefix (1-22, X, Y) for
  # compatibility with ASCAT, and returns the columns in a fixed order.
  # An empty input returns an empty table with the same column layout.
  naive_cols <- c(
    "CHR", "chrom", "start", "end", "width", "POS", "ref", "alt", "alt_counts", "ref_counts", "total_counts", "BAF", "total_depth", "other_counts",
    "all_counts", "M", "UM", "M_snuc", "UM_snuc", "total_counts_m", "total_counts_m_snuc", "m", "Af", "Ar", "Cf", "Cr", "Tf", "Tr", "Gf", "Gr", "CAr", "TGf", "CGf", "CGr", "CCGG", "mq"
  )

  if (nrow(dt) == 0) {
    # empty_count_alleles_result() only knows the non-naive columns; add the
    # single-nucleotide columns so empty and non-empty results share a schema
    empty <- empty_count_alleles_result()
    missing_cols <- setdiff(naive_cols, names(empty))
    if (length(missing_cols) > 0) {
      empty[, (missing_cols) := NA]
    }
    return(empty[, naive_cols, with = FALSE])
  }

  # Set a CHR column
  dt <- dt[, CHR := chrom]
  # This should always contain the CHR prefix and no sites should be NA.
  stopifnot(startsWith(as.character(dt$CHR[[1]]), "chr"))
  stopifnot(all(!is.na(dt$CHR)))

  # Format 'chrom' in Ensembl integer format for compatibility with ASCAT
  dt[, chrom := factor(
    sub("chr", "", chrom),
    levels = c(1:22, "X", "Y"),
    ordered = TRUE
  )]

  # Ensure rownames reflect row order
  rownames(dt) <- seq_len(nrow(dt))

  # Reorder columns
  dt <- dt[, naive_cols, with = FALSE]

  return(dt)
}
filter_multi_snp_loci <- function(pileup_summary) {
  # Reference files may carry potential SNPs on both CG nucleotides, which
  # gives two rows for the same (chrom, start, end) locus.
  # CAMDAC currently only handles one CG-SNP pair, therefore keep the more
  # informative row per locus:
  # - If exactly one of the pair looks like a SNP (BAF within 0.1-0.9), or
  #   exactly one has the highest CpG coverage, that selects the row
  # - Otherwise, take the first row (the cytosine site)
  # FUTURE: A tool like bis-SNP to determine SNPs per sample
  #
  # Helper columns are added by reference while filtering and are always
  # removed again before returning.
  locus_key <- c("chrom", "start", "end")

  # Label multi_snp_loci (msl): every row whose locus occurs more than once
  pileup_summary[, msl := duplicated(pileup_summary, by = locus_key, fromLast = FALSE) |
    duplicated(pileup_summary, by = locus_key, fromLast = TRUE)]

  # Nothing to resolve: return the input unchanged, without the helper column
  if (!any(pileup_summary$msl)) {
    pileup_summary[, msl := NULL]
    return(pileup_summary)
  }

  # Label loci as SNP based on BAF
  pileup_summary[msl == TRUE, is_snp := dplyr::between(BAF, 0.1, 0.9)]

  # Label loci with maximum coverage (could be both)
  pileup_summary[msl == TRUE, max_cov := total_counts_m == max(total_counts_m), by = locus_key]
  pileup_summary[msl == TRUE, max_cov := ifelse(is.na(max_cov), FALSE, max_cov)]

  # Any remaining duplicates are due to equivalence in the criteria.
  # We can therefore take the first (C) element:
  first_in_locus <- !duplicated(pileup_summary[msl == TRUE], by = locus_key)
  pileup_summary[msl == TRUE, is_first := first_in_locus]

  # Apply filters by position to determine which to keep
  pileup_summary[msl == TRUE,
    keep := (
      (!all(is_snp) & is_snp) | # One is a SNP
        (!all(max_cov) & max_cov) # Max coverage
    ),
    by = locus_key
  ]
  # If the criteria single out exactly one row keep it, otherwise fall back to
  # the first row of the locus
  pileup_summary[msl == TRUE,
    keep_t := (
      (!(sum(keep) == 1) & is_first) |
        ((sum(keep) == 1) & keep)
    ),
    by = locus_key
  ]

  # Filter out duplicates
  pileup_summary <- pileup_summary[msl == FALSE | (msl == TRUE & !is.na(keep_t) & keep_t == TRUE)]
  # Remove any remaining duplicates (potential >2 multi-SNP loci. Not currently in references.)
  pileup_summary <- pileup_summary[!duplicated(pileup_summary, by = locus_key)]

  # Remove extra columns
  pileup_summary[, `:=`(
    msl = NULL, is_snp = NULL, is_first = NULL, max_cov = NULL, keep = NULL, keep_t = NULL
  )]

  return(pileup_summary)
}
+ if (all(class(loci_dt) == "logical")) { + if (is.na(loci_dt)) { + return(empty_count_alleles_result()) + } + } + + # Limit segments to chromosomes covered in BAM file + seg <- clean_and_limit_segments(seg, bam_file) + if(length(seg) == 0) { + return(empty_count_alleles_result()) + } + + # Read BAM and annotate SNP and CPG loci + bam_dt <- get_reads_in_segments(bam_file, seg, min_mapq, paired_end = paired_end) + if (nrow(bam_dt) == 0) { + return(empty_count_alleles_result()) + } + + # Overlap with loci + bam_dt <- format_bam_for_loci_overlap(bam_dt, paired_end = paired_end) + bam_dt <- annotate_bam_with_loci(bam_dt, loci_dt, drop_ccgg = drop_ccgg, paired_end = paired_end) + bam_dt <- drop_positions_outside_segments(bam_dt, seg) + if (nrow(bam_dt) == 0) { + return(empty_count_alleles_result()) + } + + if (paired_end) { + # For paired end samples, we must select a single read at overlapping and then fix the + # strand column to reflect read orientation (as per single-end CAMDAC) + bam_dt <- fix_pe_overlap_at_loci(bam_dt) + bam_dt <- add_loci_read_position(bam_dt) + bam_dt <- fix_pe_strand_with_flags(bam_dt) + bam_dt <- get_alleles_and_qual(bam_dt) + bam_dt <- drop_pe_fields(bam_dt) + } else { + bam_dt <- add_loci_read_position(bam_dt) + bam_dt <- get_alleles_and_qual(bam_dt) + } + + # Additional filtering + bam_dt <- filter_clipped_dinucleotides(bam_dt) + bam_dt <- filter_bam_by_quality(bam_dt, min_mapq = min_mapq) + if (nrow(bam_dt) == 0) { + return(empty_count_alleles_result()) + } + + # Get nucleotide counts and flatten pileup + bam_dt <- annotate_nucleotide_counts(bam_dt) + pileup_summary <- flatten_pileup_to_counts(bam_dt) + rm(bam_dt) + + # Apply CADMAC rules to get allele counts, methylation rates and BAFs + pileup_summary <- get_snp_allele_counts(pileup_summary) + pileup_summary <- get_methylation_counts(pileup_summary, min_cov) + pileup_summary <- filter_bad_allele_count_rows(pileup_summary, min_cov) + pileup_summary <- 
# Sort genomic loci
#' @title sort_genomic_dt
#' @keywords internal
sort_genomic_dt <- function(dt, with_chr = FALSE) {
  # Karyotype order: autosomes 1-22 followed by X and Y, optionally with the
  # "chr" prefix. Chromosomes outside this list become NA in the factor.
  chrom_order <- c(1:22, "X", "Y")
  if (with_chr) {
    chrom_order <- paste0("chr", chrom_order)
  }

  # `chrom` is converted to a factor by reference, so the caller's table is
  # modified too; the sorted rows are returned as a new table.
  dt[, chrom := factor(chrom, levels = chrom_order)]
  return(dt[order(chrom, POS)])
}
annotate_normal <- function(tsnps, nsnps, min_cov) {
  # Join normal-sample SNP data onto the tumour SNPs.
  #
  # tsnps:   tumour SNP table with at least `chrom` and `POS`.
  # nsnps:   normal SNP table with chrom, POS, total_depth, total_counts, BAF.
  #          BAFr and LogR are added to it by reference (caller's table is
  #          modified).
  # min_cov: minimum normal `total_counts` for a SNP to be kept.
  #
  # Returns the tumour SNPs present in the normal (inner join on chrom/POS),
  # with the normal columns carrying an `_n` suffix.

  # Ensure normal BAF randomised and LogR is 0
  nsnps[, BAFr := randomise_BAF(BAF)]
  nsnps[, LogR := 0]

  # Add suffix to normal snps, subset columns and join to tumour snps data
  nsnps <- nsnps[, .(chrom, POS, total_depth, total_counts, BAF, BAFr, LogR)]
  to_suffix <- c("total_depth", "BAF", "BAFr", "LogR", "total_counts")
  setnames(nsnps, old = to_suffix, new = paste0(to_suffix, "_n"))
  setkey(nsnps, chrom, POS)

  # merge() for data.table takes `by`, not `on`; name the join columns
  # explicitly so they do not depend on keys or on which names are shared
  tsnps <- merge(tsnps, nsnps, by = c("chrom", "POS"))

  tsnps <- tsnps[total_counts_n >= min_cov]

  return(tsnps)
}
# Anscombe transform the logr to stabilise variance for tumor-only LogR
anscombe_transform <- function(x) {
  # Vectorised over x: twice the square root of x shifted by 3/8
  shifted <- x + 3 / 8
  return(sqrt(shifted) * 2)
}
annotate_gc <- function(tsample, gc_refs, min_window = 100, max_window = 10000, n_cores = 1) {
  # Annotate SNPs with the GC content from the reference window size whose GC
  # content correlates most strongly (absolute Pearson) with LogR.
  #
  # tsample:    table with chrom, POS and LogR columns.
  # gc_refs:    paths to GC reference files named "<window>.csv.gz" holding
  #             seqnames, start, end and GC columns.
  # min_window, max_window: only window sizes within these bounds are used.
  #             Note: min_window included due to memory constraints
  # n_cores:    workers registered with doParallel.
  #
  # Returns tsample with GC, GC_window and GC_corr columns bound on.
  # Stops if no reference file yields a usable correlation.
  #
  # NOTE(review): `dt` is re-sorted by setkey() and rows without a complete
  # overlap are dropped, so the GC vector bound on at the end may not line up
  # row-for-row with tsample - confirm against callers.
  chrom <- as.character(tsample$chrom)
  start <- tsample$POS
  LogR <- tsample$LogR

  # Fix chromosome to UCSC format
  chrom <- data.table::fifelse(startsWith(chrom, "chr"), chrom, paste0("chr", chrom))

  # Create data table with LogR values
  dt <- data.table::data.table(seqnames = chrom, start = start, end = start, LogR = LogR)
  data.table::setkey(dt, seqnames, start, end)

  # Get GC-LogR correlations for all window sizes within bounds
  logging::loginfo("Running GC correlation check", logger="CAMDAC")
  doParallel::registerDoParallel(cores = n_cores)
  # Release the workers even if a reference file fails to load
  on.exit(doParallel::stopImplicitCluster(), add = TRUE)

  gc_correlations <- foreach::foreach(gc_file = gc_refs) %dopar% {
    # Get window size from filename
    size_match <- regmatches(gc_file, regexpr("(\\d+)\\.csv\\.gz", gc_file))
    window_size <- as.numeric(sub("\\.csv\\.gz$", "", size_match))

    # Skip files without a parseable window size and windows out of bounds.
    # Here we correct for GC bias at the level of insert size
    if (length(window_size) != 1 || is.na(window_size) ||
      window_size > max_window || window_size < min_window) {
      list(gc_corr = NA, GC = NA, window = NA)
    } else {
      # Load data
      gcdf <- data.table::fread(gc_file, showProgress = FALSE)[, .(seqnames, start, end, GC)]
      data.table::setkey(gcdf, seqnames, start, end)
      overlap <- data.table::foverlaps(dt, gcdf)
      overlap <- overlap[complete.cases(overlap)]
      gc_corr <- abs(cor(overlap$GC, overlap$LogR))

      list(gc_corr = gc_corr, GC = overlap$GC, window = window_size)
    }
  }

  # Select best GC correlation window (skipped windows carry NA)
  gcc_vec <- vapply(
    gc_correlations,
    function(res) as.numeric(res[["gc_corr"]])[1],
    numeric(1)
  )
  best_corr_index <- which.max(gcc_vec)
  if (length(best_corr_index) == 0) {
    stop(
      "No GC reference file gave a usable GC-LogR correlation ",
      "(check gc_refs, min_window and max_window)",
      call. = FALSE
    )
  }
  best_corr <- gc_correlations[[best_corr_index]]

  logging::loginfo("GC correlation check complete", logger="CAMDAC")
  return(cbind(
    tsample,
    data.table::data.table(GC = best_corr$GC, GC_window = best_corr$window, GC_corr = best_corr$gc_corr)
  ))
}
spline_regress_logr <- function(LogR, GC, repli) {
  # Regress LogR on natural-spline expansions (5 df each) of GC content and
  # replication timing and return the residuals, i.e. the corrected LogR.
  #
  # LogR, GC, repli: numeric vectors of equal length.
  # Returns a vector of the same length as the inputs; observations with a
  # missing value in any input are NA in the result.
  model <- lm(
    LogR ~ splines::ns(x = GC, df = 5, intercept = TRUE) +
      splines::ns(x = repli, df = 5, intercept = TRUE),
    y = FALSE, model = FALSE, na.action = "na.exclude"
  )
  # residuals() honours na.exclude by padding dropped rows with NA, whereas
  # model$residuals silently omits them and shortens the result
  return(stats::residuals(model))
}
assign_genotypes <- function(BAF, as_logical = FALSE) {
  # Call a genotype per SNP from its BAF: homozygous when BAF is below 0.1 or
  # above 0.9, heterozygous otherwise (a missing BAF falls through to the
  # default and is therefore called heterozygous).
  #
  # Returns a logical vector (TRUE = homozygous) when as_logical is TRUE,
  # otherwise a factor with levels "Homozygous", "Heterozygous".
  is_hom <- data.table::fcase(
    BAF < 0.1 | BAF > 0.9, TRUE,
    default = FALSE
  )

  if (as_logical) {
    return(is_hom)
  }

  genotype_levels <- c("Homozygous", "Heterozygous")
  labels <- data.table::fifelse(is_hom, genotype_levels[1], genotype_levels[2])
  return(factor(labels, levels = genotype_levels))
}
#' @title bseq_bool
#' A helper function for filtering out reference and alternate
#' SNPs where bisulfite conversion cannot be distinguished.
#' @param ref Reference allele
#' @param alt Alternate allele
#' @return Logical vector: FALSE for C/T and A/G SNPs (either direction),
#'   TRUE for all other ref/alt pairs.
#' @noRd
bseq_bool <- function(ref, alt) {
  # The four ref>alt pairs that look identical to bisulfite conversion
  c_to_t <- ref == "C" & alt == "T"
  t_to_c <- ref == "T" & alt == "C"
  a_to_g <- ref == "A" & alt == "G"
  g_to_a <- ref == "G" & alt == "A"

  return(!c_to_t & !t_to_c & !a_to_g & !g_to_a)
}
# This function is ported from CAMDAC-RRBS
rm_low_cov_singletons <- function(dt_sample_SNPs, min = 3) {
  # Remove low coverage singletons: SNPs with total_counts <= min that are
  # either isolated or sit among better-covered neighbours.
  #
  # dt_sample_SNPs: table with chrom, POS and total_counts, in genomic order.
  # min:            coverage threshold; SNPs at or below it are candidates.
  #
  # For every low-coverage SNP the window of row indexes -5..+5 around it is
  # inspected (the window includes the SNP itself). Neighbours more than 1Mb
  # away are ignored. The SNP is removed when no window coverage is available
  # or when the mean window coverage is above `min`. Low-coverage SNPs whose
  # window is also low coverage are kept.
  #
  # NOTE(review): neighbours are taken by row index only, so a window can
  # cross a chromosome boundary - confirm whether that is acceptable.

  # subselect relevant columns
  dt <- dt_sample_SNPs[, c("chrom", "POS", "total_counts")]

  # flag all SNPs with coverage at or below the threshold
  low_cov <- dt$total_counts <= min

  # get neighbouring SNP index: one row per low-cov SNP, one column per offset
  ranges <- -5:5
  idxs <- outer(which(low_cov == TRUE), ranges, `+`)
  # Indexes below 1 (windows running off the start of the table) are clamped
  # to the first row; indexes past the end simply yield NA when used below
  idxs <- pmax(idxs, 1)

  # get neighbouring SNP genomic coordinates
  POSS <- suppressWarnings(matrix(dt$POS[idxs], ncol = length(ranges)))
  # Distance of each neighbour from the low coverage SNP
  POSS <- abs(POSS - dt[low_cov == TRUE]$POS)
  # Ignore neighbours more than 1Mb away
  idxs[POSS > 1E6] <- NA

  # Get the coverage at these loci (low cov snps and neighbours)
  covs <- matrix(dt$total_counts[idxs], ncol = length(ranges))
  # Mean coverage across the window; vector is the length of the original
  # data and stays NA for SNPs that are not low coverage
  mean_covs <- rep(as.numeric(NA), length = nrow(dt))
  mean_covs[low_cov == TRUE] <- rowMeans(covs, na.rm = TRUE)

  # Flag low coverage singletons: low coverage AND (no window coverage
  # available OR window mean above the threshold)
  low_cov_singleton <- low_cov & (is.na(mean_covs) | mean_covs > min)

  dt_sample_SNPs <- dt_sample_SNPs[low_cov_singleton == FALSE, ]

  # WARNING: NAs filtered during this function may include genome gaps as we are searching for
  # SNPs at a large distance from other SNPs.
  return(dt_sample_SNPs)
}
write_acf_and_ploidy_file <- function(tsnps, ascat.output, ascat.frag, sample_prefix, outdir) {
  # Write a one-row, tab-separated summary of the ASCAT solution to
  # "<outdir>/<sample_prefix>.ACF.and.ploidy.txt".
  #
  # The row holds ploidy and aberrant cell fraction from ascat.output, the
  # row counts of the segmented LogR and BAF tables in ascat.frag, het/hom
  # SNP counts called from tsnps$BAFr, and median tumour and normal depth.
  out_path <- fs::path(outdir, paste0(sample_prefix, ".ACF.and.ploidy.txt"))

  # TRUE for homozygous and FALSE for heterozygous SNPs
  is_hom <- assign_genotypes(tsnps$BAFr, as_logical = TRUE)

  summary_row <- data.frame(
    ploidy = ascat.output$ploidy,
    ACF = ascat.output$aberrantcellfraction,
    num_het_SNPs_seg = nrow(ascat.frag$Tumor_LogR_segmented),
    num_hom_SNPs_seg = nrow(ascat.frag$Tumor_BAF_segmented[[1]]),
    num_het_SNPs_camdac = sum(!is_hom),
    num_hom_SNPs_camdac = sum(is_hom),
    median_depth_camdac = median(tsnps$total_depth),
    median_n_depth_camdac = median(tsnps$total_depth_n)
  )
  rownames(summary_row) <- NULL

  # Write to output
  write.table(
    summary_row,
    file = out_path,
    sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE
  )
}
# Winsorize extreme values in tumour BAF.
#
# A running median of `BAF` is computed, and each value's deviation from it
# is clipped to +/- `tau` times the MAD (as returned by `stats::mad()`) of
# those deviations. Values that were clipped are reported as outliers.
#
# Args:
#   BAF: numeric vector of BAF values, in genomic order.
#   tau: clipping threshold, as a multiple of the MAD of the deviations.
#   k:   half-width of the running-median window (the window is 2 * k + 1
#        points, shrunk to fit when the input is shorter than that).
#
# Returns:
#   A list with
#     ywin:     winsorized values (running median + clipped deviation),
#     sdev:     MAD of the deviations from the running median,
#     outliers: logical vector, TRUE where a value was clipped.
winsorize <- function(BAF, tau = 2.5, k = 40) {
  n <- length(BAF)

  # stats::runmed() needs an odd window that is no longer than the data
  window <- 2 * k + 1
  if (window > n) {
    if (n == 0) {
      window <- 1
    } else if (n %% 2 == 0) {
      window <- n - 1
    } else {
      window <- n
    }
  }
  running_median <- stats::runmed(BAF, k = window, endrule = "median")

  # Deviation of each point from its local median, and the clip threshold
  deviation <- BAF - running_median
  SD <- stats::mad(deviation)
  z <- tau * SD

  # Clip deviations to [-z, z]; values inside the band are left untouched,
  # so adding them back to the running median reproduces the input there.
  clipped <- deviation
  clipped[deviation < -z] <- -z
  clipped[deviation > z] <- z

  # Outliers are exactly the points the clipping rule changed
  outliers <- (deviation < -z) | (deviation > z)

  list(ywin = running_median + clipped, sdev = SD, outliers = outliers)
}
# Build the germline genotype object passed to ASCAT's ASPCF step.
#
# Args:
#   ascat.bc: ASCAT object; `Germline_BAF` and `chr` are read from it.
#
# Returns:
#   A list with one element, `germlinegenotypes`: a one-column matrix built
#   from `assign_genotypes(..., as_logical = TRUE)`.
get_germline_geno <- function(ascat.bc) {
  gg <- list(
    germlinegenotypes = matrix(
      assign_genotypes(ascat.bc$Germline_BAF, as_logical = TRUE)
    )
  )

  # ASCAT will not run if no germline homozygous stretches are found. This is
  # a problem for small test data, so a dummy stretch is inserted at the start
  # of the SNP table to avoid that error.
  ghs <- ASCAT:::predictGermlineHomozygousStretches(
    ascat.bc$chr, gg$germlinegenotypes
  )

  if (length(ghs) == 0) {
    logging::logwarn(
      "No shared SNPs identified. Adding false genotypes for test data.",
      logger = "CAMDAC"
    )
    # Flag at most the first 1000 SNPs. Indexing past the end of the matrix
    # would extend it into a plain vector and drop its dimensions.
    n_dummy <- min(1000L, nrow(gg$germlinegenotypes))
    gg$germlinegenotypes[seq_len(n_dummy)] <- TRUE
  }

  return(gg)
}
(chrk_tot_len + chrk_tot_len_prev) / 2 + text(tpos, 2, ASCATobj$chrs[j], pos = 1, cex = 2) + abline(v = vpos + 0.5, lty = 1, col = "lightgrey") + } + # Note: no corrected data currently passed NM edit + # plot(c(1,dim(ASCATobj$Tumor_LogR)[1]), c(-lim_logR ,lim_logR ), + # type = "n", xaxt = "n", main = paste(ASCATobj$samples[i], ", tumor data, corrected LogR", sep = ""), + # xlab = "", ylab = "") + # points(ASCATobj$Tumor_LogR[,i],col="red") + # #points(ASCATobj$Tumor_LogR[,i],col=rainbow(24)[ASCATobj$SNPpos$Chr]) + # abline(v=0.5,lty=1,col="lightgrey") + # chrk_tot_len = 0 + # for (j in 1:length(ASCATobj$ch)) { + # chrk = ASCATobj$ch[[j]]; + # chrk_tot_len_prev = chrk_tot_len + # chrk_tot_len = chrk_tot_len + length(chrk) + # vpos = chrk_tot_len; + # tpos = (chrk_tot_len+chrk_tot_len_prev)/2; + # text(tpos,2,ASCATobj$chrs[j], pos = 1, cex = 2) + # abline(v=vpos+0.5,lty=1,col="lightgrey") + # } + plot(c(1, dim(ASCATobj$Tumor_BAF)[1]), c(0, 1), + type = "n", xaxt = "n", + main = paste(ASCATobj$samples[i], ", tumor data, BAF", sep = ""), xlab = "", ylab = "" + ) + points(ASCATobj$Tumor_BAF[, i], col = colls, pch = pch, cex = 0.2) + abline(v = 0.5, lty = 1, col = "lightgrey") + chrk_tot_len <- 0 + for (j in 1:length(ASCATobj$ch)) { + chrk <- ASCATobj$ch[[j]] + chrk_tot_len_prev <- chrk_tot_len + chrk_tot_len <- chrk_tot_len + length(chrk) + vpos <- chrk_tot_len + tpos <- (chrk_tot_len + chrk_tot_len_prev) / 2 + text(tpos, 1, ASCATobj$chrs[j], pos = 1, cex = 2) + abline(v = vpos + 0.5, lty = 1, col = "lightgrey") + } + dev.off() + } + + if (!is.null(ASCATobj$Germline_LogR)) { + print.noquote("Plotting germline data") + for (i in 1:dim(ASCATobj$Germline_LogR)[2]) { + outfile_g <- fs::path(outdir, paste(ASCATobj$samples[i], ".germline.png", sep = "")) + png(filename = outfile_g, width = 2000, height = 750, res = 200) + par( + mar = c(0.5, 5, 5, 0.5), mfrow = c(2, 1), cex = 0.4, cex.main = 3, cex.axis = 2, + pch = ifelse(dim(ASCATobj$Tumor_LogR)[1] > 100000, ".", 20) + ) 
+ plot(c(1, dim(ASCATobj$Germline_LogR)[1]), c(-1, 1), + type = "n", xaxt = "n", + main = paste(ASCATobj$samples[i], ", germline data, LogR", sep = ""), xlab = "", ylab = "" + ) + points(ASCATobj$Germline_LogR[, i], col = "red", cex = 0.2) + abline(v = 0.5, lty = 1, col = "lightgrey") + chrk_tot_len <- 0 + for (j in 1:length(ASCATobj$ch)) { + chrk <- ASCATobj$ch[[j]] + chrk_tot_len_prev <- chrk_tot_len + chrk_tot_len <- chrk_tot_len + length(chrk) + vpos <- chrk_tot_len + tpos <- (chrk_tot_len + chrk_tot_len_prev) / 2 + text(tpos, 2, ASCATobj$chrs[j], pos = 1, cex = 2) + abline(v = vpos + 0.5, lty = 1, col = "lightgrey") + } + plot(c(1, dim(ASCATobj$Germline_BAF)[1]), c(0, 1), + type = "n", xaxt = "n", + main = paste(ASCATobj$samples[i], ", germline data, BAF", sep = ""), xlab = "", ylab = "" + ) + points(ASCATobj$Germline_BAF[, i], col = colls, pch = pch, cex = 0.2) + abline(v = 0.5, lty = 1, col = "lightgrey") + chrk_tot_len <- 0 + for (j in 1:length(ASCATobj$ch)) { + chrk <- ASCATobj$ch[[j]] + chrk_tot_len_prev <- chrk_tot_len + chrk_tot_len <- chrk_tot_len + length(chrk) + vpos <- chrk_tot_len + tpos <- (chrk_tot_len + chrk_tot_len_prev) / 2 + text(tpos, 1, ASCATobj$chrs[j], pos = 1, cex = 2) + abline(v = vpos + 0.5, lty = 1, col = "lightgrey") + } + dev.off() + } + } +} + + +#' @title ascat.m.plotSegmentedData +#' @description Plot segmentated BAF LogR +#' @param ASCATobj an ASCAT object (e.g. 
#' @title ascat.mw.plotSegmentedData
#' @description Plot segmented tumour LogR and BAF. One PNG named
#'   `<sample>.ASPCF.png` is written to `outdir` per column of `Tumor_LogR`.
#' @param ASCATobj an ASCAT object after segmentation; the fields read are
#'   `Tumor_LogR`, `Tumor_BAF`, `Tumor_LogR_segmented`, `Tumor_BAF_segmented`,
#'   `samples`, `ch` and `chrs`
#' @param fname label used as the prefix of both panel titles
#' @param outdir directory the PNG files are written to
#' @param lim_logR y-axis limit (+/-) of the LogR panel
#'
#' @return `NULL`, invisibly. Called for its side effect of writing PNG files.
#' @author Peter Van Loo
#' @noRd
#' @keywords internal
ascat.mw.plotSegmentedData <- function(ASCATobj, fname = "", outdir, lim_logR = 2) {
  # Row position of every SNP, addressable by SNP name
  all_ids <- seq_len(nrow(ASCATobj$Tumor_LogR))
  names(all_ids) <- rownames(ASCATobj$Tumor_LogR)

  # Draw chromosome boundaries and labels on the current panel. Only SNPs
  # present in the segmented BAF (`het_ids`) take up space on the x-axis.
  draw_chrom_guides <- function(het_ids, label_y) {
    abline(v = 0.5, lty = 1, col = "lightgrey")
    right_edge <- 0
    for (j in seq_along(ASCATobj$ch)) {
      left_edge <- right_edge
      right_edge <- left_edge + length(intersect(ASCATobj$ch[[j]], het_ids))
      text((left_edge + right_edge) / 2, label_y, ASCATobj$chrs[j], pos = 1, cex = 2)
      abline(v = right_edge + 0.5, lty = 1, col = "lightgrey")
    }
  }

  # Write the two-panel (LogR, BAF) PNG for one sample
  plot_sample <- function(arraynr) {
    seg_snps <- rownames(ASCATobj$Tumor_BAF_segmented[[arraynr]])
    het_ids <- all_ids[seg_snps]
    logr_seg <- ASCATobj$Tumor_LogR_segmented[seg_snps, arraynr]
    baf_seg <- ASCATobj$Tumor_BAF_segmented[[arraynr]][, , drop = FALSE]
    raw_colour <- rgb(1, 0, 0, 0.5)

    outfile_t <- fs::path(outdir, paste0(ASCATobj$samples[arraynr], ".ASPCF.png"))
    png(filename = outfile_t, width = 2000, height = 1000, res = 200)
    # Close the device even if a plotting call below fails
    on.exit(dev.off(), add = TRUE)
    par(
      mar = c(0.5, 5, 5, 0.5), mfrow = c(2, 1), cex = 0.4,
      cex.main = 3, cex.axis = 2
    )

    # Panel 1: raw LogR (red) with segmented LogR (blue) on top
    plot(c(1, length(logr_seg)), c(-lim_logR, lim_logR),
      type = "n", xaxt = "n",
      main = paste0(fname, ", LogR"), xlab = "", ylab = ""
    )
    points(ASCATobj$Tumor_LogR[seg_snps, arraynr],
      col = raw_colour, pch = ".", cex = 0.20
    )
    points(logr_seg, col = "blue", cex = 0.2)
    draw_chrom_guides(het_ids, lim_logR - 0.5)

    # Panel 2: raw BAF (red) with segmented BAF and its mirror (blue)
    plot(c(1, length(baf_seg)), c(0, 1),
      type = "n", xaxt = "n",
      main = paste0(fname, ", BAF"), xlab = "", ylab = ""
    )
    points(ASCATobj$Tumor_BAF[seg_snps, arraynr],
      col = raw_colour, pch = ".", cex = 0.20
    )
    points(baf_seg, col = "blue", cex = 0.2)
    points(1 - baf_seg, col = "blue", cex = 0.2)
    draw_chrom_guides(het_ids, 1)
  }

  # seq_len() so an object with zero samples produces no plots
  for (arraynr in seq_len(ncol(ASCATobj$Tumor_LogR))) {
    plot_sample(arraynr)
  }
  invisible(NULL)
}
"#3557e0" # blue + } + SEGMENTS$nMajor <- ifelse(SEGMENTS$nMajor > y_limit, y_limit + 0.1, SEGMENTS$nMajor) + SEGMENTS$nMinor <- ifelse(SEGMENTS$nMinor > y_limit, y_limit + 0.1, SEGMENTS$nMinor) + + if (battenberg) { + colourA <- "#e4a329" + colourB <- "#000000" + } + if (ascat_colours) { + colourA <- "#00fd31" + colourB <- "#fd2b1a" + } + + if (REF == "hg19") { + REF <- data.frame( + chrom = c(1:22, "X"), + start = rep(1, 23), + end = c( + 249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, 146364022, 141213431, + 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753, 81195210, 78077248, + 59128983, 63025520, 48129895, 51304566, 155270560 + ) + ) + } else if (REF == "hg38") { + REF <- data.frame( + chrom = c(1:22, "X"), + start = rep(1, 23), + end = c( + 248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, 138394717, + 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, 80373285, + 58617616, 64444167, 46709983, 50818468, 156040895 + ) + ) + } else { + stopifnot(is.data.frame(REF)) + stopifnot(identical(colnames(REF), c("chrom", "start", "end"))) + } + + SEGMENTS$chr <- gsub("^chr", "", SEGMENTS$chr) + stopifnot(all(ASCAT_output_object$segments$chr %in% REF$chrom)) + REF$size <- REF$end - REF$start + 1 + REF$middle <- 0 + for (i in 1:nrow(REF)) { + if (i == 1) { + REF$middle[i] <- REF$size[i] / 2 + } else { + REF$middle[i] <- sum(as.numeric(REF$size[1:(i - 1)])) + REF$size[i] / 2 + } + } + rm(i) + REF$cumul <- cumsum(as.numeric(REF$size)) + REF$add <- cumsum(as.numeric(c(0, REF$size[1:(nrow(REF) - 1)]))) + + SEGMENTS$startpos_adjusted <- SEGMENTS$startpos + SEGMENTS$endpos_adjusted <- SEGMENTS$endpos + for (CHR in unique(REF$chrom)) { + INDEX <- which(SEGMENTS$chr == CHR) + if (length(INDEX) > 0) { + SEGMENTS$startpos_adjusted[INDEX] <- SEGMENTS$startpos_adjusted[INDEX] + REF$add[which(REF$chrom == CHR)] + SEGMENTS$endpos_adjusted[INDEX] <- 
SEGMENTS$endpos_adjusted[INDEX] + REF$add[which(REF$chrom == CHR)] + } + rm(INDEX) + } + rm(CHR) + + + for (SAMPLE in sort(unique(SEGMENTS$sample))) { + SEGS <- SEGMENTS[which(SEGMENTS$sample == SAMPLE), ] + if (nrow(SEGS) == 0) warning(paste0("No segments for sample: ", SAMPLE)) + maintitle <- paste(SAMPLE, " Ploidy: ", sprintf("%1.2f", ASCAT_output_object$ploidy[SAMPLE]), ", purity: ", sprintf("%2.0f", ASCAT_output_object$purity[SAMPLE] * 100), "%, goodness of fit: ", sprintf("%2.1f", ASCAT_output_object$goodnessOfFit[SAMPLE]), "%", ifelse(isTRUE(ASCAT_output_object$nonaberrantarrays[SAMPLE]), ", non-aberrant", ""), sep = "") + png(filename = outfile_name, width = 2000, height = (y_limit * 100), res = 200) + par(mar = c(0.5, 5, 5, 0.5), cex = 0.4, cex.main = 3, cex.axis = 2.5) + ticks <- seq(0, y_limit, 1) + plot(c(1, REF$cumul[nrow(REF)]), c(0, y_limit), type = "n", xaxt = "n", yaxt = "n", main = maintitle, xlab = "", ylab = "") + axis(side = 2, at = ticks) + abline(h = ticks, col = "lightgrey", lty = 1) + rect(SEGS$startpos_adjusted, (SEGS$nMajor - 0.07), SEGS$endpos_adjusted, (SEGS$nMajor + 0.07), col = ifelse(SEGS$nMajor >= y_limit, adjustcolor(colourA, red.f = 0.75, green.f = 0.75, blue.f = 0.75), colourA), border = ifelse(SEGS$nMajor >= y_limit, adjustcolor(colourA, red.f = 0.75, green.f = 0.75, blue.f = 0.75), colourA)) + rect(SEGS$startpos_adjusted, (SEGS$nMinor - 0.07), SEGS$endpos_adjusted, (SEGS$nMinor + 0.07), col = ifelse(SEGS$nMinor >= y_limit, adjustcolor(colourB, red.f = 0.75, green.f = 0.75, blue.f = 0.75), colourB), border = ifelse(SEGS$nMinor >= y_limit, adjustcolor(colourB, red.f = 0.75, green.f = 0.75, blue.f = 0.75), colourB)) + abline(v = c(1, REF$cumul), lty = 1, col = "lightgrey") + text(REF$middle, y_limit, REF$chrom, pos = 1, cex = 2) + dev.off() + rm(SEGS, ticks, maintitle) + } + rm(SAMPLE) +} diff --git a/R/asm_allele_counts.R b/R/asm_allele_counts.R new file mode 100644 index 0000000..b8bd336 --- /dev/null +++ b/R/asm_allele_counts.R 
#' Count alleles for reads phased to SNPs in a BAM file
#'
#' Reads overlapping the SNP ranges are loaded, phased to a SNP, classified as
#' supporting the ref or alt allele, and counted per allele at the loci in
#' `loci_dt`.
#'
#' @param bam_file Path to BAM file
#' @param snps_gr GRanges object with heterozygous SNP loci for phasing
#' @param loci_dt Data table with CAMDAC CpG loci from reference files
#' @param paired_end Logical indicating if BAM is paired end
#' @param drop_ccgg Logical; forwarded to the loci annotation and counting steps
#' @param min_mapq Minimum mapping quality, forwarded to `get_reads_in_segments()`
#' @param min_cov Minimum coverage, forwarded to `asm_bam_to_counts()`
#' @return A list with three slots: `asm_cg` (per-allele counts table),
#'   `hap_stats` (per-SNP phasing statistics) and `map` (unique
#'   qname / hap_id / locus combinations, with "chr" stripped from `chrom`).
#' @keywords internal
cwrap_asm_get_allele_counts <- function(
    bam_file, snps_gr, loci_dt,
    paired_end, drop_ccgg, min_mapq = 0, min_cov = 0) {
  # N.B. the previous defaults (`min_mapq = min_mapq`, `min_cov = min_cov`)
  # referenced themselves and raised "promise already under evaluation"
  # whenever a caller omitted them. 0 matches the asm_bam_to_counts() defaults.

  # Read BAM
  bam_dt <- get_reads_in_segments(bam_file, snps_gr, min_mapq, paired_end = paired_end)
  bam_dt <- format_bam_for_loci_overlap(bam_dt, paired_end = paired_end)

  # Early strand adjustment for paired end: strands now reflect Watson/Crick strand (directional lib)
  bam_dt <- fix_pe_strand_with_flags(bam_dt, paired_end)

  # Overlap with SNP loci
  bam_dt <- phase_reads_to_snps(bam_dt, snps_gr)
  # If both R1 and R2 overlap the same SNP, this step selects one.
  # This throws away information at CpGs where one read informs of CpGs the
  # other does not, so double-counting is still guarded against downstream.
  bam_dt <- select_read_snp_pair(bam_dt)

  # Assign alleles using CAMDAC rules
  bam_dt[, hap_is_ref := assign_het_allele(hap_bsseq, hap_ref, hap_alt, "ref")]
  bam_dt[, hap_is_alt := assign_het_allele(hap_bsseq, hap_ref, hap_alt, "alt")]

  # Get haplotype stats
  hap_stats <- asm_hap_stats(bam_dt)

  # Annotate BAM with CpG and SNP loci
  bam_dt <- annotate_bam_with_loci_asm(bam_dt, loci_dt, drop_ccgg, paired_end)

  # For each CpG site, only one read can be counted
  bam_dt <- fix_pe_overlap_at_loci(bam_dt)

  # Get qname to cpg mapping
  qname_hap_cg <- unique(bam_dt[, .(qname, hap_id, chrom = chrom, start = start, end = end)])
  qname_hap_cg$chrom <- gsub("chr", "", qname_hap_cg$chrom)

  # Split by ref and alt after filtering
  ref_bam <- bam_dt[hap_is_ref == TRUE]
  alt_bam <- bam_dt[hap_is_alt == TRUE]
  rm(bam_dt)

  # Get counts for reads phased to each allele
  alt_cg <- asm_bam_to_counts(alt_bam, "alt", loci_dt,
    drop_ccgg = drop_ccgg, paired_end = paired_end, min_cov = min_cov
  )
  ref_cg <- asm_bam_to_counts(ref_bam, "ref", loci_dt,
    drop_ccgg = drop_ccgg, paired_end = paired_end, min_cov = min_cov
  )

  # Combine counts; an empty table on one side means only the other is kept
  if (nrow(alt_cg) == 0) {
    asm_cg <- ref_cg
  } else if (nrow(ref_cg) == 0) {
    asm_cg <- alt_cg
  } else {
    asm_cg <- merge(alt_cg, ref_cg,
      by = c(
        "CHR", "chrom", "start", "end",
        "width", "POS", "ref", "alt"
      ), all = TRUE
    )
  }

  # Filter duplicates (potential due to multi-snp loci)
  asm_cg <- asm_cg[!duplicated(asm_cg, by = c("chrom", "start", "end"))]

  list(
    "asm_cg" = asm_cg,
    "hap_stats" = hap_stats,
    "map" = qname_hap_cg
  )
}
# Decide, per read, whether the observed base/strand at a heterozygous SNP
# supports the requested allele.
#
# bseq_strand: character vector of observed base + strand, e.g. "T+" or "C-".
# ref, alt:    reference and alternate alleles of the SNP (recycled against
#              bseq_strand).
# call:        "ref" or "alt" - which allele to test support for.
#
# Returns a logical vector: TRUE where the observation is accepted as evidence
# for the requested allele, FALSE otherwise (including NA or unexpected
# observations and SNP types not in the table).
assign_het_allele <- function(bseq_strand, ref, alt, call) {
  stopifnot(call %in% c("ref", "alt"))

  # Accepted observations for the FIRST allele of each two-letter key, given
  # that the second letter is the other allele at the site.
  # The "GT" entry previously listed "A+" where every other G-allele entry
  # ("GC", and the former alt tables for "TG" and "CG") lists "G-"; it is
  # aligned with those here. With that fix the alt table is exactly this table
  # with the two alleles swapped, so a single table serves both calls.
  allele_evidence <- list(
    AC = c("A+", "A-"),
    CA = c("T+", "C-", "C+"),
    AG = "A+",
    GA = "G+",
    AT = c("A+", "A-"),
    TA = c("T+", "T-"),
    GT = c("G+", "A-", "G-"),
    TG = c("T+", "T-"),
    CG = c("T+", "C-", "C+"),
    GC = c("G+", "A-", "G-"),
    CT = "C-",
    TC = "T-"
  )

  # Flatten to "<key>:<observation>" strings for a vectorised membership test
  accepted <- unlist(
    Map(
      function(key, obs) paste0(key, ":", obs),
      names(allele_evidence), allele_evidence
    ),
    use.names = FALSE
  )

  # Put the allele under test first so it selects the right table entry
  key <- if (call == "ref") paste0(ref, alt) else paste0(alt, ref)

  # %in% never returns NA, so NA observations come out as FALSE
  paste0(key, ":", bseq_strand) %in% accepted
}
# Turn allele-phased reads into a per-locus counts table for one allele.
#
# asm_dt:     reads already phased to a single allele and annotated with loci.
# asm_type:   "ref" or "alt"; used as the prefix for all non-key output columns.
# loci_dt, drop_ccgg: accepted for interface compatibility; not used here.
# paired_end: when TRUE, paired-end overlap fixing and field dropping are applied.
# min_mapq:   forwarded to filter_bam_by_quality().
# min_cov:    forwarded to get_methylation_counts() and
#             filter_bad_allele_count_rows().
#
# Returns a data.table keyed on the locus columns below plus
# "<asm_type>_"-prefixed count columns, or a zero-row table holding only the
# locus columns when no pileup rows remain.
asm_bam_to_counts <- function(
    asm_dt, asm_type, loci_dt, drop_ccgg = FALSE,
    paired_end = FALSE, min_mapq = 0, min_cov = 0) {
  stopifnot(asm_type %in% c("ref", "alt"))

  # Locus columns shared by ref and alt tables; these are the merge keys
  # downstream and are never prefixed.
  key_cols <- c(
    "CHR", "chrom", "start", "end", "width", "POS", "ref",
    "alt"
  )

  # Per-read preparation. Paired-end data gets extra steps around the common
  # ones (strand was already fixed earlier in the pipeline).
  if (paired_end) {
    asm_dt <- fix_pe_overlap_at_loci(asm_dt)
  }
  asm_dt <- add_loci_read_position(asm_dt)
  asm_dt <- get_alleles_and_qual(asm_dt)
  if (paired_end) {
    asm_dt <- drop_pe_fields(asm_dt)
  }

  # Read-level filters
  asm_dt <- filter_clipped_dinucleotides(asm_dt)
  asm_dt <- filter_bam_by_quality(asm_dt, min_mapq = min_mapq)

  # Pileup
  asm_dt <- annotate_nucleotide_counts(asm_dt)
  pileup <- flatten_pileup_to_counts(asm_dt)
  rm(asm_dt)

  # Nothing left: return an empty table that still merges cleanly
  if (nrow(pileup) == 0) {
    no_rows <- data.table(matrix(nrow = 0, ncol = length(key_cols)))
    names(no_rows) <- key_cols
    return(no_rows)
  }

  # Apply CAMDAC rules to get allele counts, methylation rates and BAFs
  pileup <- get_snp_allele_counts(pileup)
  pileup <- get_methylation_counts(pileup, min_cov)
  pileup <- filter_bad_allele_count_rows(pileup, min_cov)
  pileup <- compute_methylation_rates(pileup)
  pileup <- compute_BAFs(pileup)

  counts <- format_get_reads_result(pileup)

  # Drop sites that have SNP counts but no methylation counts
  counts <- counts[!is.na(total_counts_m), ]

  counts <- counts[, .(
    CHR, chrom, start, end, width, POS, ref, alt,
    alt_counts, ref_counts, total_counts, BAF, total_depth,
    other_counts, all_counts, M, UM, total_counts_m, m, CCGG
  )]

  # Prefix every non-key column with the allele type in one call
  value_cols <- setdiff(names(counts), key_cols)
  setnames(counts, value_cols, paste0(asm_type, "_", value_cols))

  counts
}
# Run ASM allele counting for one sample and write the three ASM count outputs.
#
# sample: sample object; `bam`, `patient_id` and `id` are read here.
# config: config object; `overwrite`, `regions`, `n_cores`, `min_mapq` and
#         `min_cov` are read here.
#
# Returns the path of the ASM counts file, or NULL when the sample has no BAM.
# Existing output is reused unless `config$overwrite` is TRUE.
cmain_asm_allele_counts <- function(sample, config) {
  sample_label <- paste0(sample$patient_id, ":", sample$id)
  loginfo("ASM allele counting for %s", sample_label)

  # Skip if no BAM file provided
  if (is.null(sample$bam)) {
    loginfo("No BAM. Skipping allele counting for %s", sample_label)
    return(NULL)
  }

  # Check if outputs exist and skip if required
  outfile <- get_fpath(sample, config, "asm_counts")
  if (file.exists(outfile) && !config$overwrite) {
    logwarn("Skipping ASM allele counting for %s", sample_label)
    return(outfile)
  }

  # Create a temporary directory for per-segment results. tempfile() gives a
  # unique suffix so leftovers from other runs are never overwritten. The
  # directory is removed on exit, including when a worker fails.
  # (Named tmp_dir to avoid shadowing base::tempdir.)
  tmp_dir <- tempfile(
    pattern = "asm_counts",
    tmpdir = get_fpath(sample, config, "asm_counts", dir = TRUE)
  )
  fs::dir_create(tmp_dir)
  on.exit(
    if (fs::dir_exists(tmp_dir)) fs::dir_delete(tmp_dir),
    add = TRUE
  )

  # Get SNP loci as segments to analyse
  snps_gr <- load_asm_snps_gr(sample, config)
  # If regions to analyse are given, limit SNPs to regions
  if (!is.null(config$regions)) {
    segments <- read_segments_bed(config$regions)
    seg_gr <- Reduce(c, segments)
    snps_gr <- subsetByOverlaps(snps_gr, seg_gr)
  }

  # Split by chromosome for parallelisation
  snps_grl <- split(snps_gr, seqnames(snps_gr))

  # List files containing SNP and CpG loci for reference genome
  loci_files <- get_reference_files(config, type = "loci_files")

  # Load sample data
  bam_file <- sample$bam
  paired_end <- is_pe(config)
  drop_ccgg <- is_ccgg(config)
  min_mapq <- config$min_mapq
  min_cov <- config$min_cov

  # Initialise parallel workers. When the pipeline is run several times in one
  # R session the workers are re-used without clearing memory, so they are
  # always stopped on exit.
  doParallel::registerDoParallel(cores = config$n_cores)
  on.exit(doParallel::stopImplicitCluster(), add = TRUE)

  # warn = 2 makes foreach fail if a worker is terminated (e.g. out of memory)
  # instead of only warning and carrying on. The caller's previous setting is
  # restored afterwards, even on error, rather than being reset to 0.
  old_opts <- options(warn = 2)
  tmpfiles <- tryCatch(
    foreach(seg = snps_grl, .combine = "c") %dopar% {
      # Loop over SNPs to phase
      loci_dt <- load_asm_loci_for_segment(seg, loci_files)
      ac_file <- cwrap_asm_get_allele_counts(
        bam_file, seg, loci_dt, paired_end, drop_ccgg,
        min_mapq = min_mapq, min_cov = min_cov
      )
      tmp <- tempfile(tmpdir = tmp_dir, fileext = ".qs")
      qs::qsave(ac_file, tmp)
      rm(loci_dt, ac_file, seg)
      tmp
    },
    finally = options(old_opts)
  )

  # Combine two allele-count result lists slot by slot
  bind_asm_obs <- function(x, y) {
    nobj <- list()
    nobj$asm_cg <- rbind(x$asm_cg, y$asm_cg, fill = TRUE)
    nobj$hap_stats <- rbind(x$hap_stats, y$hap_stats, fill = TRUE)
    nobj$map <- rbind(x$map, y$map, fill = TRUE)
    nobj
  }
  # Combine temporary files with allele counts results into a single object
  result <- foreach(i = tmpfiles, .combine = bind_asm_obs) %dopar% {
    qs::qread(i)
  }

  # Write to output file(s); temp files and workers are cleaned up by on.exit
  write_asm_counts_output(result, sample, config)
}
# Make sure an ASM SNPs file exists for the tumor and the accompanying samples.
#
# If the tumor already has an ASM SNPs file, it is attached to every non-NULL
# companion sample (germline, infiltrates, origin) that lacks one, and NULL is
# returned. Otherwise bulk allele counting and SNP calling are run on the
# germline, SNPs with BAF between 0.1 and 0.9 are kept as heterozygous, and
# that table is written as the ASM SNPs file of every non-NULL sample.
#
# Stops with an error when neither a tumor ASM SNPs file nor a germline sample
# is available.
cmain_asm_make_snps <- function(tumor, germline, infiltrates, origin, config) {
  # Check whether an ASM snps file is available for the tumor
  asm_snps_file <- get_fpath(tumor, config, "asm_snps")
  if (file.exists(asm_snps_file)) {
    loginfo("ASM snps file found for tumor.")
    # Share the tumor's file with the remaining samples that lack one
    for (i in list(germline, infiltrates, origin)) {
      if (is.null(i)) {
        next
      }
      i_asm_snps <- get_fpath(i, config, "asm_snps")
      if (!file.exists(i_asm_snps)) {
        loginfo("Attaching existing ASM SNPs to %s", i$id)
        attach_output(i, config, "asm_snps", asm_snps_file)
      }
    }
    return(NULL)
  }

  # Without a tumor file the germline is the only source of SNPs
  if (is.null(germline)) {
    stop(paste0(
      "No ASM snps file available for tumor. Germline sample is required to extract SNPs from, ",
      "or ASM snps file must be provided for tumor object."
    ))
  }

  # Run bulk allele-counts on germline and extract SNPs
  loginfo("ASM snps file not found for tumor. Extracting SNPs from germline for tumor.")
  cmain_count_alleles(germline, config)
  cmain_make_snps(germline, config)

  # Load het SNPs
  nsnps_f <- get_fpath(germline, config, "snps")
  n_snp <- fread_chrom(nsnps_f)
  n_snp <- n_snp[dplyr::between(BAF, 0.1, 0.9), .(chrom, pos = POS, ref, alt, BAF)]

  # Save hets as ASM SNPs for germline, tumor, and infiltrates/origin if present
  for (s in list(germline, tumor, infiltrates, origin)) {
    if (is.null(s)) {
      next
    }
    s_asm_snps_file <- get_fpath(s, config, "asm_snps")
    fs::dir_create(fs::path_dir(s_asm_snps_file))
    data.table::fwrite(n_snp, s_asm_snps_file)
  }

  loginfo("ASM SNPS file created from germline for: %s:%s", tumor$patient_id, tumor$id)
}
# Add deconvolved tumour methylation columns for both alleles to `meth_c`.
#
# For each of "ref" and "alt" this adds, by reference and in this order across
# alleles: <allele>_m_t_raw (calculate_mt output), <allele>_m_t (raw value
# clamped to [0, 1]) and <allele>_cov_t (calculate_mt_cov output).
# Returns the modified `meth_c`.
deconvolve_asm_methylation <- function(meth_c) {
  alleles <- c("ref", "alt")

  # Clamp rates that deconvolution pushed outside [0, 1]; anything else
  # (including NA) passes through unchanged.
  clamp_unit <- function(rate) {
    data.table::fcase(
      rate < 0, 0,
      rate > 1, 1,
      rep(TRUE, length(rate)), rate
    )
  }

  # Pass 1: raw deconvolved methylation per allele
  for (a in alleles) {
    raw_rate <- calculate_mt(
      meth_c[[paste0(a, "_m")]],
      meth_c[[paste0(a, "_m_i")]],
      meth_c[["purity"]],
      meth_c[[paste0(a, "_CN")]]
    )
    data.table::set(meth_c, j = paste0(a, "_m_t_raw"), value = raw_rate)
  }

  # Pass 2: corrected rates
  for (a in alleles) {
    data.table::set(
      meth_c,
      j = paste0(a, "_m_t"),
      value = clamp_unit(meth_c[[paste0(a, "_m_t_raw")]])
    )
  }

  # Pass 3: tumour coverage by deconvolution
  for (a in alleles) {
    tumour_cov <- calculate_mt_cov(
      meth_c[[paste0(a, "_total_counts_m")]],
      meth_c[["purity"]],
      meth_c[[paste0(a, "_CN")]]
    )
    data.table::set(meth_c, j = paste0(a, "_cov_t"), value = tumour_cov)
  }

  meth_c
}
# Calculate bulk methylation HDI bounds for one allele.
#
# meth_c:    table with <allele>_total_counts_m and <allele>_m columns.
# allele:    "ref" selects the "ref_" columns; anything else selects "alt_".
# itersplit: forwarded to unique_calculate_counts_hdi().
# n_cores:   forwarded to unique_calculate_counts_hdi(). Previously this was an
#            undefined free variable inside the function; it is now an
#            explicit argument defaulting to 1.
#
# Returns a two-column data.frame (<allele>_m_lo, <allele>_m_hi) with one row
# per input row, in input order.
calculate_asm_hdi_bulk <- function(meth_c, allele, itersplit = 1e5, n_cores = 1) {
  col_prefix <- ifelse(allele == "ref", "ref_", "alt_")
  counts <- paste0(col_prefix, "total_counts_m")
  meth <- paste0(col_prefix, "m")
  lo_col <- paste0(col_prefix, "m_lo")
  hi_col <- paste0(col_prefix, "m_hi")

  # Generate inputs for HDI calculation: methylated / unmethylated read counts
  M <- round(meth_c[[counts]] * meth_c[[meth]], 0)
  UM <- meth_c[[counts]] - M

  # Calculate ASM hdi for bulk data
  u_hdi <- unique_calculate_counts_hdi(M, UM, n_cores = n_cores, itersplit = itersplit)
  u_hdi <- round(u_hdi, digits = 5)
  names(u_hdi) <- c("M", "UM", lo_col, hi_col)

  # Map back to the original M and UM values. The id column records input
  # order because merge() does not preserve it; seq_len() keeps this valid
  # for zero-row input.
  inpdf <- data.frame(M = M, UM = UM)
  inpdf$id <- seq_len(nrow(inpdf))
  u_inpdf <- merge(inpdf, u_hdi, by = c("M", "UM"), all.x = TRUE, sort = FALSE)

  # Output should match the input row order; pick the bounds by name
  out <- u_inpdf[order(u_inpdf$id), ]
  out[, c(lo_col, hi_col), drop = FALSE]
}
# Call allele-specific differentially methylated positions between a sample
# and its origin sample.
#
# sample: sample whose "asm_meth_pure" file is compared.
# origin: sample whose "asm_meth" file is the reference.
# config: config object used to resolve file paths.
#
# For each allele (ref, then alt) the DMP call columns are appended to the
# merged table, which is written to the sample's "asm_dmp" path. Returns that
# path; an existing output file is reused.
cmain_asm_dmps <- function(sample, origin, config) {
  asm_file <- get_fpath(sample, config, "asm_meth_pure")
  origin_file <- get_fpath(origin, config, "asm_meth")

  # Skip if output file exists
  out_file <- get_fpath(sample, config, "asm_dmp")
  if (file.exists(out_file)) {
    loginfo("Skipping ASM DMP calls for %s against %s", sample$id, origin$id)
    return(out_file)
  }

  loginfo("Running ASM DMP calls for %s against %s", sample$id, origin$id)

  # TODO: move to config
  effect_size <- 0.2
  prob <- 0.99
  itersplit <- 1e5

  # Load both samples; origin columns get an "_o" suffix (except locus columns)
  abb <- fread_chrom(asm_file)
  ori <- fread_chrom(origin_file)
  ori <- dplyr::rename_with(ori, ~ paste0(.x, "_o"), .cols = !matches("chrom|start|end"))

  # Overlap datasets
  setkey(abb, chrom, start, end)
  dt <- merge(abb, ori, by = c("chrom", "start", "end"), all.x = TRUE)

  for (allele in c("ref", "alt")) {
    m_col <- paste0(allele, "_m")
    cov_col <- paste0(allele, "_total_counts_m")
    m_o_col <- paste0(m_col, "_o")
    cov_o_col <- paste0(cov_col, "_o")

    mbdiff <- dt[[m_col]] - dt[[m_o_col]]
    # Origin counts. All inputs must come from the merged table `dt` so they
    # are row-aligned; the alt branch previously read the unmethylated total
    # from `ori` under a column name that no longer exists after the "_o"
    # rename.
    M <- round(dt[[cov_o_col]] * dt[[m_o_col]], 0)
    UM <- dt[[cov_o_col]] - M
    # Sample (bulk) counts
    M_n <- round(dt[[cov_col]] * dt[[m_col]], 0)
    UM_n <- dt[[cov_col]] - M_n
    mtdiff <- dt[[paste0(allele, "_m_t")]] - dt[[m_o_col]]

    # Make DMP call and merge
    loginfo("ASM: Calling tumor-normal %s DMPs", toupper(allele))
    dmp_call <- dmp_call_pipe(mbdiff, M, UM, M_n, UM_n, mtdiff, effect_size, prob, itersplit)
    asm_names <- c(
      paste0("prob_", allele, "_DMP"),
      paste0(allele, "_m_diff"),
      paste0(allele, "_DMP_b"),
      paste0(allele, "_m_t_diff"),
      paste0(allele, "_DMP_t")
    )
    names(dmp_call) <- asm_names[seq_len(ncol(dmp_call))]
    dt <- cbind(dt, dmp_call)
  }

  # Save data
  data.table::fwrite(dt, out_file, quote = FALSE, na = "NA")
  return(out_file)
}
# Label each BAF value as coming from a site where the ref allele is major
# ("ref"), the alt allele is major ("alt"), the alleles are balanced
# ("balanced"), or unclassifiable (the string "NA").
#
# BAF:      numeric vector of B-allele frequencies for one copy-number state.
# balanced: TRUE when major_cn == minor_cn for this state.
#
# Returns a list with
#   maj:    character vector of labels, one per BAF value (a single "NA" when
#           no value is available for fitting);
#   params: fitted mean, sd and the lower/upper 2.5%/97.5% quantile thresholds.
#
# Missing BAF values are tolerated: they are excluded from the fit and
# labelled "NA". Previously any NA in `BAF` raised an error in the mirroring
# step (NA is not allowed in a logical subscript on assignment).
maj_by_gauss <- function(BAF, balanced = TRUE) {
  # Reserve original BAF values to determine ref or alt as major or minor
  BAF_orig <- BAF
  is_missing <- is.na(BAF)

  # Mirror BAF into the 0.5-1 window using an NA-free mask
  below_half <- !is_missing & BAF < 0.5
  BAF[below_half] <- 1 - BAF[below_half]

  # Fit only to observed values that are not exactly 1
  truncated_data <- BAF[!is_missing & BAF != 1]
  if (length(truncated_data) == 0) {
    return(list("maj" = "NA", "params" = c("NA", "NA", "NA", "NA")))
  }

  # Thresholds from the Gaussian fit; extreme values at 1 are excluded above
  # so the fit follows the main peak rather than outliers
  fit <- fit_gaussian(truncated_data)
  thresh <- qnorm(c(0.025, 0.975), fit$mean, fit$sd)
  lower <- thresh[[1]]
  upper <- thresh[[2]]

  # Default threshold for filtering out hets in imbalanced states
  hets_upper <- qnorm(0.99, mean = 0.5, sd = 0.01)

  if (balanced) {
    maj <- ifelse(BAF < upper, "balanced", "NA")
  } else {
    # One model fits at the peak, another fits at 0.5
    in_dist <- dplyr::between(BAF, lower, upper)
    is_ref <- BAF_orig < 0.5
    is_het_outdist <- BAF < hets_upper
    maj <- dplyr::case_when(
      is_het_outdist ~ "NA",
      is_ref & in_dist ~ "ref",
      !is_ref & in_dist ~ "alt",
      TRUE ~ "NA"
    )
  }

  # Sites without a BAF cannot be classified
  maj[is_missing] <- "NA"

  params <- c(
    mean = fit$mean,
    sd = fit$sd,
    qlower = lower,
    qupper = upper
  )

  list("maj" = maj, "params" = params)
}
#' Run allele-specific methylation analysis pipeline
#'
#' Runs the ASM steps in order: SNP creation, CNA calling, preprocessing,
#' CNA assignment, deconvolution, within-sample DMP calling for every
#' supplied sample, and tumor-vs-origin DMP calling.
#'
#' @param tumor CamSample object for tumor sample.
#' @param germline CamSample object for germline sample. Used for CNA calling.
#' @param infiltrates CamSample object for infiltrating normal sample. Used for deconvolution.
#' @param origin CamSample object for cell of origin sample. Used for differential methylation.
#' @param config CamConfig object.
#' @export
#' @keywords internal
asm_pipeline <- function(tumor, germline = NULL, infiltrates = NULL, origin = NULL, config) {
  loginfo("CAMDAC:::asm_pipeline start for %s", tumor$patient_id)
  sample_list <- list(tumor, germline, infiltrates, origin)

  # Checks that ASM SNPs file is available, otherwise creates it from bulk
  # allele counts on germline, and attaches ASM SNPs to infiltrates and
  # origin objects if present
  cmain_asm_make_snps(tumor, germline, infiltrates, origin, config)

  # Check that ASM CNA file is available, otherwise create from CAMDAC CNA calls
  cmain_asm_call_cna(tumor, germline, config)

  # Preprocess CpG, SNP and methylation data for all samples
  loginfo("Preprocessing ASM data")
  preprocess_asm(sample_list, config)

  # Assign ASM CNA to per-allele CG sites
  cmain_fit_meth_cna(tumor, config)

  # Run ASM deconvolution
  cmain_asm_deconvolve(tumor, infiltrates, config)

  # Run ASM differential methylation within-sample; optional samples that
  # were not supplied are NULL and are skipped
  for (s in sample_list) {
    if (!is.null(s)) {
      cmain_asm_ss_dmps(s, config)
    }
  }

  # Run ASM differential methylation between samples
  cmain_asm_dmps(tumor, origin, config)

  loginfo("CAMDAC:::asm_pipeline complete for %s", tumor$patient_id)
}
# Convert CAMDAC allele counts object to per-chromosome allele frequency files for battenberg
#
# tsnps:            data.table with chrom, POS, ref, alt and either
#                   BAF/total_counts (tumour) or BAF_n/total_counts_n (normal).
# outdir:           directory the per-chromosome files are written to.
# sample_af_prefix: filename prefix; files are named
#                   "<prefix>_alleleFrequencies_chr<N>.txt" (X -> 23, Y -> 24).
# min_depth:        minimum ref + alt count for a SNP to be written.
# is_tumor:         TRUE to use the tumour columns, FALSE for the normal columns.
#
# Returns the list of file paths produced by the foreach loop. Uses whichever
# foreach backend is currently registered.
create_allele_frequencies_chr <- function(tsnps, outdir, sample_af_prefix, min_depth, is_tumor = TRUE) {
  # Load data table of SNP positions
  dt <- tsnps[, .(chrom, POS, ref, alt)]

  if (is_tumor) {
    # For tumor, alt counts are derived from BAF
    dt$alt_counts <- round(tsnps$BAF * tsnps$total_counts, 0)
    dt$ref_counts <- tsnps$total_counts - dt$alt_counts
  } else {
    # For normal, alt counts are derived from BAF_n
    dt$alt_counts <- round(tsnps$BAF_n * tsnps$total_counts_n, 0)
    dt$ref_counts <- tsnps$total_counts_n - dt$alt_counts
  }

  # Filter for min depth
  dt <- dt[ref_counts + alt_counts >= min_depth]
  # Split per chromosome for parallel processing
  dt_per_chrom <- split(dt, dt$chrom)
  rm(dt)

  af_files <- foreach(dt_chrom = dt_per_chrom) %dopar% {
    # Set output filename with correct suffix for X and Y chromosomes
    chromosome <- unique(dt_chrom$chrom)[[1]]
    outfile_chrom <- data.table::fcase(
      chromosome == "X", "23",
      chromosome == "Y", "24",
      !(chromosome %in% c("X", "Y")), chromosome
    )
    outfile <- fs::path(outdir, sprintf("%s_alleleFrequencies_chr%s.txt", sample_af_prefix, outfile_chrom))

    # Set CHR column to allelefreq format expected
    data.table::setnames(dt_chrom, "chrom", "#CHR")

    # Create every count column up front so that a nucleotide which never
    # occurs as ref or alt on this chromosome still yields a zero column
    # instead of a missing one when the headings are selected below
    nucs <- c("A", "C", "G", "T")
    count_cols <- paste0("Count_", nucs)
    dt_chrom[, (count_cols) := 0]

    # Set counts based on ref and alt
    for (nn in nucs) {
      count_col <- paste0("Count_", nn)
      dt_chrom[ref == nn, (count_col) := ref_counts]
      dt_chrom[alt == nn, (count_col) := alt_counts]
    }

    # Cleanup data table: drop rows without a position, zero-fill remaining NAs
    dt_chrom <- dt_chrom[!is.na(POS)]
    dt_chrom[, names(dt_chrom) := lapply(.SD, function(x) {
      x[is.na(x)] <- 0
      x
    })]

    # Keep only the columns of the allele frequency format
    headings <- c("#CHR", "POS", count_cols)
    dt_chrom <- dt_chrom[, ..headings]
    dt_chrom$Good_depth <- rowSums(dt_chrom[, .(Count_A, Count_C, Count_G, Count_T)])

    # Order by position. Possible as only a single chromosome is present
    dt_chrom <- dt_chrom[order(POS)]

    # Write to output file
    write.table(dt_chrom, file = outfile, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
    outfile
  }
  af_files
}
# Duplicate the chr23/chr24 allele frequency files under chrX/chrY names.
# For some reason, BB can read imputeinfo and fail to capture chrom X and Y
# allele frequency files, so copies with the letter names are made here.
#
# outdir: directory searched for files whose path matches *chr23* or *chr24*.
# tumour_prefix, normal_prefix: accepted for interface symmetry; not used.
make_allele_frequencies_chrX_chrY <- function(outdir, tumour_prefix, normal_prefix) {
  numbered_to_letter <- c(chr23 = "chrX", chr24 = "chrY")
  numbered_files <- c(
    fs::dir_ls(outdir, glob = "*chr23*"),
    fs::dir_ls(outdir, glob = "*chr24*")
  )

  for (src in numbered_files) {
    # Apply every substitution in turn to build the destination path
    dest <- src
    for (numbered in names(numbered_to_letter)) {
      dest <- gsub(numbered, numbered_to_letter[[numbered]], dest)
    }
    fs::file_copy(src, dest, overwrite = TRUE)
  }
}
# Battenberg LogR writer taken from : https://github.com/Wedge-lab/battenberg/blob/c257a710d88b23986f936e0b7b38131279d07f7e/R/prepare_wgs.R#L348
# Battenberg BAF writer taken from : https://github.com/Wedge-lab/battenberg/blob/c257a710d88b23986f936e0b7b38131279d07f7e/R/prepare_wgs.R#L130
#
# Write the five Battenberg input tables derived from a CAMDAC tsnps table:
#   <tumour_prefix>_mutantLogR.tab, <tumour_prefix>_mutantBAF.tab,
#   <tumour_prefix>_mutantLogR_gcCorrected.tab,
#   <normal_prefix>_normalLogR.tab (all zeros), <normal_prefix>_normalBAF.tab.
# Each table has the columns Chromosome, Position and one value column named
# after the sample prefix.
#
# tsnps: table with columns chrom, POS, LogR, LogR_corr, BAFr and BAFr_n.
#
# `tsnps` is left untouched. The output tables are built as copies instead of
# renaming columns on the caller's data.table by reference, so an error while
# writing can no longer leave `tsnps` with temporarily renamed columns.
#
# Returns NULL.
create_logr_and_baf_files <- function(tsnps, tumour_prefix, normal_prefix, outdir) {
  required <- c("chrom", "POS", "LogR", "LogR_corr", "BAFr", "BAFr_n")
  stopifnot(all(required %in% names(tsnps)))

  # Three-column table: Chromosome, Position, <sample_name>
  as_bb_table <- function(values, sample_name) {
    out <- data.table::data.table(
      Chromosome = tsnps[["chrom"]],
      Position = tsnps[["POS"]],
      value = values
    )
    data.table::setnames(out, "value", sample_name)
    out
  }

  # LogR tables are written with readr, BAF tables with write.table
  write_logr <- function(tbl, path) {
    readr::write_tsv(x = tbl, path)
  }
  write_baf <- function(tbl, path) {
    write.table(tbl,
      file = path, row.names = FALSE, quote = FALSE, sep = "\t",
      col.names = names(tbl)
    )
  }

  # Create mutant LogR and BAF files
  write_logr(
    as_bb_table(tsnps[["LogR"]], tumour_prefix),
    fs::path(outdir, sprintf("%s_mutantLogR.tab", tumour_prefix))
  )
  write_baf(
    as_bb_table(tsnps[["BAFr"]], tumour_prefix),
    fs::path(outdir, sprintf("%s_mutantBAF.tab", tumour_prefix))
  )

  # Create mutant LogR gc corrected file
  write_logr(
    as_bb_table(tsnps[["LogR_corr"]], tumour_prefix),
    fs::path(outdir, sprintf("%s_mutantLogR_gcCorrected.tab", tumour_prefix))
  )

  # Create normal LogR (fixed at 0) and BAF files
  write_logr(
    as_bb_table(rep(0, nrow(tsnps)), normal_prefix),
    fs::path(outdir, sprintf("%s_normalLogR.tab", normal_prefix))
  )
  write_baf(
    as_bb_table(tsnps[["BAFr_n"]], normal_prefix),
    fs::path(outdir, sprintf("%s_normalBAF.tab", normal_prefix))
  )

  NULL
}
#' Prepare Battenberg LogR and BAF inputs from CAMDAC SNP data
#'
#' `camdac_to_battenberg_prepare_wgbs` writes the mutant and normal LogR/BAF
#' tables for a tumour-normal pair. When all five expected files already
#' exist in `outdir`, nothing is rewritten.
#'
#' @param tumour_prefix Filename prefix used for the tumour (mutant) files.
#' @param normal_prefix Filename prefix used for the normal files.
#' @param camdac_tsnps Path to the CAMDAC tumour-normal SNP file, read with `fread_chrom`.
#' @param outdir Directory in which the files are looked up and written.
#'
#' @returns The five expected output file paths.
#' @keywords internal
camdac_to_battenberg_prepare_wgbs <- function(tumour_prefix, normal_prefix, camdac_tsnps, outdir) {
  filenames <- c(
    fs::path(outdir, paste0(tumour_prefix, "_mutantLogR.tab")),
    fs::path(outdir, paste0(tumour_prefix, "_mutantLogR_gcCorrected.tab")),
    fs::path(outdir, paste0(tumour_prefix, "_mutantBAF.tab")),
    fs::path(outdir, paste0(normal_prefix, "_normalLogR.tab")),
    fs::path(outdir, paste0(normal_prefix, "_normalBAF.tab"))
  )

  # Return record of files if all files exist
  if (all(fs::file_exists(filenames))) {
    return(filenames)
  }

  # Create the mutantLogR, mutantBAF, mutantLogR_gcCorrected, normalLogR and
  # normalBAF files
  tsnps <- fread_chrom(camdac_tsnps)
  create_logr_and_baf_files(tsnps, tumour_prefix, normal_prefix, outdir)

  # TODO: Create ASCAT plots and simplify the tables by passing sample name to the write output function:
  # see: https://github.com/Wedge-lab/battenberg/blob/43686673566cf5adbd8d00e2450d70eced27696d/R/prepare_wgs.R#L154

  filenames
}
+battenberg_wgbs_wrapper <- function(tumourname, + normalname, + imputeinfofile, + problemloci, + ismale, + beaglejar, + beagleref.template, + beagleplink.template, + nthreads = 5, + externalhaplotypefile = NA, + allelecounts_file = NULL, + sampleidx = 1, + usebeagle = TRUE, + beaglewindow = 40, + beagleoverlap = 4, + beaglemaxmem = 10, + beaglenthreads = 1, + gccorrectprefix = NULL, + data_type = "wgs", + impute_exe = NULL, + platform_gamma = 1, + phasing_gamma = 1, + segmentation_gamma = 10, + segmentation_kmin = 3, + phasing_kmin = 1, + clonality_dist_metric = 0, + ascat_dist_metric = 1, + min_ploidy = 1.6, + max_ploidy = 4.8, + min_rho = 0.1, + min_goodness = 0.63, + uninformative_BAF_threshold = 0.51, + min_normal_depth = 10, + min_base_qual = 20, + min_map_qual = 35, + calc_seg_baf_option = 3, + skip_allele_counting = T, # T + skip_preprocessing = T, # T + skip_phasing = F, + javajre = "java", + write_battenberg_phasing = T, + multisample_relative_weight_balanced = 0.25, + multisample_maxlag = 100, + segmentation_gamma_multisample = 5, + snp6_reference_info_file = NA, + heterozygousFilter = "none", + prior_breakpoints_file = NULL, + GENOMEBUILD = "hg38", + use_preset_rho_psi = F, # Added to expose manual setting + preset_rho = NA, + preset_psi = NA) { + # Battenberg WGBS (currently) begins from haplotyping step. First, ensure expected files are present. + stopifnot(all(sapply( + c( + imputeinfofile, + beaglejar, + problemloci, + paste0(tumourname, c("_mutantBAF.tab", "_mutantLogR.tab")), + paste0(normalname, c("_normalBAF.tab", "_normalLogR.tab")) + ), + fs::file_exists + ))) + + # Set analysis variables + chrom_names <- Battenberg::get.chrom.names(imputeinfofile, TRUE) + # Retain autosomes only. 
Update to latest battenberg DEV to support ChrX + chrom_names <- chrom_names[!(chrom_names %in% c("X","Y"))] + logr_file <- paste(tumourname, "_mutantLogR.tab", sep = "") + externalhaplotypeprefix <- ifelse(!is.na(externalhaplotypefile), paste0(normalname, "_external_haplotypes_chr"), NA) + + # 1. Run haplotying + #doParallel::registerDoParallel(cores = nthreads) + #foreach::foreach(i = 1:length(chrom_names), .errorhandling = "remove") %do% { + for(i in 1:length(chrom_names)) { + chrom <- chrom_names[i] + logging::loginfo(paste0("Battenberg haplotyping chromosome ", chrom, "."), logger="CAMDAC") + + invisible(capture.output({ + Battenberg::run_haplotyping( + chrom = chrom, + tumourname = tumourname[sampleidx], + normalname = normalname, + ismale = ismale, + imputeinfofile = imputeinfofile, + problemloci = problemloci, + impute_exe = impute_exe, + min_normal_depth = min_normal_depth, + chrom_names = chrom_names, + snp6_reference_info_file = snp6_reference_info_file, + heterozygousFilter = heterozygousFilter, + usebeagle = usebeagle, + beaglejar = beaglejar, + beagleref = gsub("CHROMNAME", chrom, beagleref.template), + beagleplink = gsub("CHROMNAME", chrom, beagleplink.template), + beaglemaxmem = beaglemaxmem, + beaglenthreads = beaglenthreads, + beaglewindow = beaglewindow, + beagleoverlap = beagleoverlap, + externalhaplotypeprefix = externalhaplotypeprefix, + use_previous_imputation = (sampleidx > 1) + )} + ) + ) + + logging::logdebug(paste0("Battenberg RUN HAPLO COMPLETE for ", chrom), logger="CAMDAC") + } + # doParallel::stopImplicitCluster() + + # 2. 
Combine all the BAF output into a single file + Battenberg::combine.baf.files( + inputfile.prefix = paste(tumourname[sampleidx], "_chr", sep = ""), + inputfile.postfix = "_heterozygousMutBAFs_haplotyped.txt", + outputfile = paste(tumourname[sampleidx], "_heterozygousMutBAFs_haplotyped.txt", sep = ""), + chr_names = chrom_names + ) + + # Raise error if haplotyping fails to yield tumor BAFs + tryCatch( + nrow(fread(paste(tumourname[sampleidx], "_heterozygousMutBAFs_haplotyped.txt", sep = ""))), + error = function(e) { + logging::logerror("Error: Battenberg haplotyping did not yield Tumor BAF results. Quitting.", logger="CAMDAC") + stop(e) + } + ) + + # 3. Segment the phased and haplotyped BAF data + Battenberg::segment.baf.phased( + samplename = tumourname[sampleidx], + inputfile = paste(tumourname[sampleidx], "_heterozygousMutBAFs_haplotyped.txt", sep = ""), + outputfile = paste(tumourname[sampleidx], ".BAFsegmented.txt", sep = ""), + prior_breakpoints_file = prior_breakpoints_file, + gamma = segmentation_gamma, + phasegamma = phasing_gamma, + kmin = segmentation_kmin, + phasekmin = phasing_kmin, + calc_seg_baf_option = calc_seg_baf_option + ) + + # 4. Fit a clonal copy number profile + Battenberg::fit.copy.number( + samplename = tumourname[sampleidx], + outputfile.prefix = paste(tumourname[sampleidx], "_", sep = ""), + inputfile.baf.segmented = paste(tumourname[sampleidx], ".BAFsegmented.txt", sep = ""), + inputfile.baf = paste(tumourname[sampleidx], "_mutantBAF.tab", sep = ""), + inputfile.logr = logr_file, + dist_choice = clonality_dist_metric, + ascat_dist_choice = ascat_dist_metric, + min.ploidy = min_ploidy, + max.ploidy = max_ploidy, + min.rho = min_rho, + min.goodness = min_goodness, + uninformative_BAF_threshold = uninformative_BAF_threshold, + gamma_param = platform_gamma, + use_preset_rho_psi = use_preset_rho_psi, + preset_rho = preset_rho, + preset_psi = preset_psi, + read_depth = 30, + analysis = "paired" + ) + + # 5. 
#' Count alleles
#'
#' Counts SNP and CpG alleles from the sample BAM, one genomic segment per
#' parallel task, and writes the combined result to the sample's "counts"
#' file.
#'
#' @param sample A camdac sample object
#' @param config A camdac config object
#' @return Path to the allele counts file, or `NULL` when the sample has no BAM.
#' @export
cmain_count_alleles <- function(sample, config) {
  # Skip if no BAM file provided
  if (is.null(sample$bam)) {
    logging::logdebug("No BAM. Skipping allele counting for %s", paste0(sample$patient_id, ":", sample$id), logger = "CAMDAC")
    return(NULL)
  }

  # Error if BAM file does not exist
  if (!file.exists(sample$bam)) {
    logging::logerror("BAM file does not exist for: %s:%s", sample$patient_id, sample$id, logger = "CAMDAC")
    stop("BAM file does not exist: ", sample$bam, call. = FALSE)
  }

  # Check if outputs exist and skip if required
  output_filename <- get_fpath(sample, config, "counts")
  if (file.exists(output_filename) && !config$overwrite) {
    logging::logdebug("Skipping allele counting for %s", paste0(sample$patient_id, ":", sample$id), logger = "CAMDAC")
    return(output_filename)
  }

  # Create temporary directory for allele counts files; it is removed on
  # exit, including when a worker fails
  counts_tmpdir <- tempfile(
    pattern = "counts",
    tmpdir = fs::path_dir(output_filename)
  )
  fs::dir_create(counts_tmpdir)
  on.exit(
    if (fs::dir_exists(counts_tmpdir)) fs::dir_delete(counts_tmpdir),
    add = TRUE
  )

  # Load BAM regions to analyse (segments) as a list of GRanges
  if (is.null(config$regions)) {
    # Create segments across entire reference genome
    # The number of sections per chromosome is given by the config n_seg_split option.
    segments_rds <- get_reference_files(config, type = "segments_files")
    segments <- split_segments_gr(segments_rds, config$n_seg_split)
  } else {
    # Read segments BED file as a list of granges
    segments <- read_segments_bed(config$regions)
  }
  # Load SNP and CpG loci for reference genome
  loci_files <- get_reference_files(config, type = "loci_files")

  # Load sample data
  bam_file <- sample$bam
  paired_end <- is_pe(config)
  drop_ccgg <- is_ccgg(config)
  min_mapq <- config$min_mapq
  min_cov <- config$min_cov

  # Initialise parallel workers. They are stopped on exit: when running the
  # pipeline multiple times in an R session, R re-uses workers but does not
  # clear memory, so large objects in foreach loops would otherwise remain.
  doParallel::registerDoParallel(cores = config$n_cores)
  on.exit(doParallel::stopImplicitCluster(), add = TRUE)

  logging::loginfo("Counting alleles for %s", paste0(sample$patient_id, ":", sample$id), logger = "CAMDAC")
  # For each segment, load the appropriate SNP/CpG loci file segment and call allele counter in parallel
  # Set warn=2 to ensure foreach fails if any of the parallel workers are terminated or raise a warning.
  # Without this option, foreach simply returns a warning and the pipeline continues. Essential for memory warning terminations.
  # The caller's warn setting is restored afterwards, also on error.
  old_opts <- options(warn = 2)
  on.exit(options(old_opts), add = TRUE)
  tmpfiles <- foreach(seg = segments, .combine = "c") %dopar% {
    loci_dt <- load_loci_for_segment(seg, loci_files)
    ac_file <- cwrap_get_allele_counts(bam_file, seg, loci_dt, paired_end, drop_ccgg, min_mapq = min_mapq, min_cov = min_cov)
    tmp <- tempfile(tmpdir = counts_tmpdir, fileext = ".fst")
    fst::write_fst(ac_file, tmp)
    tmp
  }
  options(old_opts)

  # Combine temporary files with allele counts results into a single data table
  result <- foreach(i = tmpfiles, .combine = "rbind") %dopar% {
    fst::read_fst(i, as.data.table = TRUE)
  }

  # Write to output file
  format_and_write_output(result, output_filename)

  output_filename
}
Skipping SNP profile creation for %s", paste0(sample$id), logger="CAMDAC") + return(NULL) + } + + + logging::loginfo("Making SNP profile for %s", paste0(sample$id), logger="CAMDAC") + + # Load required reference files + gc_refs <- get_reference_files(config, "gc_per_window") + repli_ref <- get_reference_files(config, "repli_timing") + loci_ref <- get_reference_files(config, "loci_files") + + # Load SNP profiles + ac_file <- get_fpath(sample, config, "counts") + snps <- load_snp_profile(ac_file, loci_ref) + # Ensure SNPs sorted for ASCAT analysis + snps <- sort_genomic_dt(snps) + + # Save tumour SNPs to output file + fs::dir_create(fs::path_dir(output_file)) + data.table::fwrite(snps, file = output_file, compress = "gzip") + + # Return + return(output_file) +} + + + +#' Bind SNPs +#' +#' Combing tumour-normal SNP file for CNA analysis (ASCAT or BATTENBERG) +#' +#' @param tumour A camdac sample object +#' @param normal A camdac sample object +#' @param config A camdac config object +#' @export +cmain_bind_snps <- function(tumour, normal, config) { + tsnps_output_file <- CAMDAC::get_fpath(tumour, config, "tsnps") + if (fs::file_exists(tsnps_output_file) & !config$overwrite) { + logging::logdebug("Skipping SNP profile creation for %s", paste0(tumour$id, "&", normal$id), logger="CAMDAC") + return(tsnps_output_file) + } + + # Load required reference files + gc_refs <- get_reference_files(config, "gc_per_window") + repli_ref <- get_reference_files(config, "repli_timing") + + # Check previous pipeline step was run + tsnps_f <- get_fpath(tumour, config, "snps") + if (!fs::file_exists(tsnps_f)) { + logging::logwarn("Tumour SNP profiles must be created before binding for CNA calling. 
#' Call CNA
#'
#' Config determines whether ASCAT or Battenberg is used
#' (`config$cna_caller` is `"ascat"` or `"battenberg"`). The result is
#' written to the sample's "cna" file.
#'
#' @param tumour A camdac sample object
#' @param config A camdac config object
#' @return Path to the CNA file, or `NULL` when the tsnps file is missing.
#' @export
cmain_call_cna <- function(tumour, config) {
  # Skip if file exists and overwrite is false
  cna_output_name <- get_fpath(tumour, config, "cna")
  if (fs::file_exists(cna_output_name) && !config$overwrite) {
    logging::logdebug("CNA Found. Skipping %s analysis for %s", config$cna_caller, tumour$id, logger = "CAMDAC")
    return(cna_output_name)
  }

  # Skip if tsnps file does not exist
  tsnps_f <- get_fpath(tumour, config, "tsnps")
  if (!fs::file_exists(tsnps_f)) {
    logging::logwarn("No tsnps file. Skipping %s analysis for %s:%s", config$cna_caller, tumour$patient_id, tumour$id, logger = "CAMDAC")
    return(NULL)
  }

  if (config$cna_caller == "ascat") {
    cna <- cmain_run_ascat(tumour, config)
  } else if (config$cna_caller == "battenberg") {
    cna <- cmain_run_battenberg(tumour, config)
  } else {
    logging::logerror("Unknown CNA caller option in config", logger = "CAMDAC")
    stop("Unknown CNA caller option in config: ", config$cna_caller, call. = FALSE)
  }

  data.table::fwrite(cna, file = cna_output_name, sep = "\t", col.names = TRUE, quote = FALSE)
  cna_output_name
}
#' Run battenberg
#'
#' Expects SNP profiles to have been created using `cmain_make_snp_profiles`.
#' Battenberg reads and writes relative to the working directory, so the
#' function temporarily switches into the sample's Battenberg output directory.
#' The previous working directory is always restored, including on error.
#'
#' @param tumour A camdac sample object
#' @param config A camdac config object. `config$cna_settings` may provide
#'   `rho`, `psi`, `java`, `cores` and `beaglemaxmem`.
#' @return The value of `load_cna_data(tumour, config, "battenberg")`.
#' @export
cmain_run_battenberg <- function(tumour, config) {
  # Small helper: fall back to a default when a setting was not supplied.
  setting_or <- function(value, default) {
    if (is.null(value)) default else value
  }

  # BB operates from within the output directory. Register the restore
  # immediately so a failure anywhere below cannot leave the session stranded.
  outdir <- fs::dir_create(get_fpath(tumour, config, "battenberg", dir = TRUE))
  currentwd <- setwd(outdir)
  on.exit(setwd(currentwd), add = TRUE)

  # Convert CAMDAC objects to bb inputs
  tumour_prefix <- paste0(tumour$patient_id, "-", tumour$id)
  normal_prefix <- paste0(tumour$patient_id, "-", "N")
  camdac_tsnps <- get_fpath(tumour, config, "tsnps")
  stopifnot(fs::file_exists(camdac_tsnps))
  tsnps <- fread_chrom(camdac_tsnps)

  logging::loginfo("Preparing WGBS allele counts for Battenberg", logger = "CAMDAC")
  camdac_to_battenberg_allele_freqs(
    tsnps, tumour_prefix, normal_prefix, outdir,
    min_normal_depth = config$min_cov
  )

  logging::loginfo("Preparing WGBS BAF and logR for Battenberg", logger = "CAMDAC")
  camdac_to_battenberg_prepare_wgbs(tumour_prefix, normal_prefix, camdac_tsnps, outdir)

  logging::loginfo("Running Battenberg for %s", paste0(tumour$id), logger = "CAMDAC")
  # Define battenberg inputs.
  tumourname <- tumour_prefix
  normalname <- normal_prefix
  ismale <- tumour$sex == "XY"

  # Setup battenberg references
  # `get_reference_files` returns files in subdirectory, so to get root we take
  # the parent of the first file returned.
  bb_38_dir <- fs::path_dir(get_reference_files(config, "battenberg"))[[1]]
  beagleref.template <- paste0(bb_38_dir, "/beagle5/chrCHROMNAME.1kg.phase3.v5a.vcf.gz")
  beagleplink.template <- paste0(bb_38_dir, "/beagle5/plink.chrCHROMNAME.map")
  problemloci <- paste0(bb_38_dir, "/probloci/probloci.txt.gz")
  imputeinfofile <- create_impute_info_file(bb_38_dir, outdir) # Created from template.

  # Set beagle software path. CAMDAC config creation fits by default.
  beaglejar <- config$beaglejar

  # Preset RHO (purity) and PSI (ploidy) are only used when BOTH are given.
  cna_settings <- config$cna_settings
  use_preset_rho_psi <- !is.null(cna_settings$rho) && !is.null(cna_settings$psi)
  if (use_preset_rho_psi) {
    preset_rho <- cna_settings$rho
    preset_psi <- cna_settings$psi
  } else {
    preset_rho <- NA
    preset_psi <- NA
  }

  # Default to 6 cores: Battenberg with more than 6 cores gives out of memory errors.
  javajre <- setting_or(cna_settings$java, "java")
  bb_cores <- setting_or(cna_settings$cores, 6)
  beaglemaxmem <- setting_or(cna_settings$beaglemaxmem, 10)
  if (bb_cores > 6) {
    logging::logwarn("Battenberg may raise out of memory errors if given too many cores.", logger = "CAMDAC")
  }
  min_normal_depth <- config$min_cov

  # Run battenberg
  battenberg_wgbs_wrapper(tumourname, normalname, imputeinfofile, problemloci, ismale, beaglejar,
    beagleref.template, beagleplink.template,
    phasing_gamma = 2, nthreads = bb_cores,
    use_preset_rho_psi = use_preset_rho_psi, preset_rho = preset_rho,
    min_normal_depth = min_normal_depth, preset_psi = preset_psi, javajre = javajre,
    beaglemaxmem = beaglemaxmem
  )

  logging::loginfo("Saving Battenberg results.", logger = "CAMDAC")
  cna <- load_cna_data(tumour, config, "battenberg")

  # The working directory is restored by the on.exit handler registered above.
  return(cna)
}
Skipping methylation profile for %s %s", sample$patient_id, sample$id, logger="CAMDAC") + return() + } + + logging::loginfo("Preprocessing methylation data: %s", sample$id, logger="CAMDAC") + allele_counts <- data.table::fread(ac_file) + methylation <- process_methylation(allele_counts, min_meth_loci_reads = config$min_cov) + rm(allele_counts) + + logging::loginfo("Calculating HDI: %s", sample$id, logger="CAMDAC") + hdi <- calculate_counts_hdi(methylation$M, methylation$UM, n_cores = config$n_cores) + methylation <- cbind(methylation, hdi) + rm(hdi) + + logging::loginfo("Saving methylation profile: %s %s", sample$patient_id, sample$id, logger="CAMDAC") + fs::dir_create(fs::path_dir(output_file)) + data.table::fwrite(methylation, file = output_file) + return(output_file) +} + +#' Deconvolve methylation +#' +#' @param tumour A camdac sample object +#' @param normal A camdac sample object +#' @param config A camdac config object +#' @export +cmain_deconvolve_methylation <- function(tumour, normal, config) { + if (!file.exists(get_fpath(tumour, config, "meth"))) { + logging::loginfo("No methylation file for tumor. Skipping deconvolution for %s", paste0(tumour$patient_id, ":", tumour$id), logger="CAMDAC") + return() + } + if (!file.exists(get_fpath(normal, config, "meth"))) { + logging::loginfo("No methylation file for normal infiltrates. Skipping deconvolution for %s", paste0(tumour$patient_id, ":", tumour$id), logger="CAMDAC") + return() + } + outfile <- get_fpath(tumour, config, "pure") + if (file.exists(outfile) && !config$overwrite) { + logging::loginfo("Pure tumour methylation exists. 
#' Call tumour-normal DMPs
#'
#' Single-sample DMP calling on CAMDAC-deconvolved data. The step is skipped
#' (with a log message) when the purified tumour methylation file is missing,
#' when no normal sample is given, or when the DMP output already exists and
#' `config$overwrite` is not set.
#'
#' @param tumour A camdac sample object
#' @param normal A camdac sample object (cell of origin), or `NULL` to skip
#' @param config A camdac config object
#' @export
cmain_call_dmps <- function(tumour, normal, config) {
  sample_label <- paste0(tumour$patient_id, ":", tumour$id)

  if (!file.exists(get_fpath(tumour, config, "pure"))) {
    logging::loginfo("No purified methylation file for tumor. Skipping DMP calling for %s", sample_label, logger = "CAMDAC")
    return()
  }

  if (is.null(normal)) {
    logging::loginfo("No cell of origin provided. Skipping DMP calling for %s", sample_label, logger = "CAMDAC")
    return()
  }

  output_filename <- get_fpath(tumour, config, "dmps")
  if (file.exists(output_filename) && !config$overwrite) {
    logging::loginfo("Skipping DMP calling for %s", sample_label, logger = "CAMDAC")
    return(output_filename)
  }

  logging::loginfo("Calling differentially methylated positions.", logger = "CAMDAC")
  # Call DMPs between tumour and normal
  pmeth <- fread_chrom(get_fpath(tumour, config, "pure"))
  nmeth <- fread_chrom(get_fpath(normal, config, "meth"))

  # Ensure tumour and normal subset to the same CpGs only.
  # `call_dmps` requires both tables to have identical row counts and order.
  overlaps <- findOverlaps(
    GRanges(seqnames = pmeth$chrom, ranges = IRanges(pmeth$start, pmeth$end)),
    GRanges(seqnames = nmeth$chrom, ranges = IRanges(nmeth$start, nmeth$end)),
    type = "equal"
  )
  pmeth <- pmeth[queryHits(overlaps), ]
  nmeth <- nmeth[subjectHits(overlaps), ]

  tmeth <- call_dmps(
    pmeth, nmeth,
    effect_size = 0.2, prob = 0.99, itersplit = 5e5, ncores = config$n_cores
  )
  fst::write_fst(tmeth, output_filename)
}
#' Set CAMDAC configuration
#'
#' @param outdir A path to save CAMDAC results. The results folder structure
#' follows the format PATIENT/DATASET/SAMPLE/.
#' @param bsseq Bisulfite sequencing platform. Choose between "wgbs" or "rrbs".
#' @param lib Bisulfite sequencing library. Choose "pe" for paired end, "se" for single end.
#' @param build Reference genome build. Choose "hg38" or "hg19".
#' @param n_cores Number of cores to process CAMDAC data in parallel wherever possible.
#' @param regions A BED file with regions to restrict the analysis to
#' @param refs Path to CAMDAC reference files. If this is not given, CAMDAC searches the
#' environment variable CAMDAC_PIPELINE_FILES. If this is not set, CAMDAC searches recursively in the current
#' working directory.
#' @param n_seg_split Stored in the config as `n_seg_split`; presumably the number of
#' pieces each chromosome segment is split into for allele counting (see `split_segments_gr`).
#' @param min_mapq Minimum mapping quality filter used in `cmain_allele_counts()`.
#' @param min_cov Minimum coverage filter for: DNA methylation, Normal SNP selection.
#' @param min_normal_cov Stored in the config as `min_normal_cov`; presumably a minimum
#' coverage threshold applied to the normal sample.
#' @param overwrite Config to overwrite files if they already exist.
#' @param cna_caller The CNA caller to use. "ascat" or "battenberg". Default is "battenberg".
#' For RRBS data the caller is always set to "ascat".
#' @param cna_settings A list of settings to pass to the CNA caller. rho, psi, java, beaglemaxmem
#' @return A named list holding the validated configuration.
#' @export
CamConfig <- function(outdir, bsseq, lib, build, n_cores = 1, regions = NULL,
                      refs = NULL, n_seg_split = 50, min_mapq = 1, min_cov = 1,
                      min_normal_cov = 10, overwrite = FALSE,
                      cna_caller = "battenberg", cna_settings = NULL) {
  # Create output directory if it doesn't exist and set to absolute path
  fs::dir_create(outdir)
  outdir <- fs::path_real(outdir)

  # Validate inputs
  stopifnot(cna_caller %in% c("ascat", "battenberg"))
  stopifnot(lib %in% c("pe", "se"))
  stopifnot(bsseq %in% c("wgbs", "rrbs"))

  if (bsseq == "wgbs" && lib != "pe") {
    stop("WGBS data analysis is only available for paired-end samples.", call. = FALSE)
  }

  # RRBS data can only be analysed with ASCAT. Resolve the effective caller
  # BEFORE checking Battenberg's requirements, so that RRBS runs do not fail on
  # a missing Java installation they never use.
  if (bsseq == "rrbs" && cna_caller != "ascat") {
    logging::logwarn(
      "CNA caller must be `ascat` for RRBS data. This has now been set for the analysis.",
      logger = "CAMDAC"
    )
    cna_caller <- "ascat"
  }

  # Set camdac references if they were not given
  refs <- if (is.null(refs)) pipeline_files() else as.character(fs::path_real(refs))

  # If using battenberg, validate that java is available and set beagle jar
  bjar <- NULL
  if (cna_caller == "battenberg") {
    check_java()
    bjar <- get_reference_files(
      list(refs = refs, build = build, bsseq = bsseq),
      "beagle_jar"
    )
  }

  list(
    refs = refs,
    outdir = outdir,
    build = build,
    bsseq = bsseq,
    bsseq_lib = lib,
    n_cores = n_cores,
    n_seg_split = n_seg_split,
    min_mapq = min_mapq,
    min_cov = min_cov,
    min_normal_cov = min_normal_cov,
    overwrite = overwrite,
    beaglejar = bjar,
    regions = regions,
    cna_caller = cna_caller,
    cna_settings = cna_settings
  )
}
+ ) + ), + code == "pure" ~ fs::path( + config$outdir, sample$patient_id, "Methylation", sample$id, paste( + sample$patient_id, sample$id, "pure", "csv", "gz", + sep = "." + ) + ), + code == "dmps" ~ fs::path( + config$outdir, sample$patient_id, "Methylation", sample$id, paste( + sample$patient_id, sample$id, "CAMDAC_results_per_CpG", "fst", + sep = "." + ) + ), + code == "dmrs" ~ fs::path( + config$outdir, sample$patient_id, "Methylation", sample$id, paste( + sample$patient_id, sample$id, "CAMDAC_annotated_DMRs", "fst", + sep = "." + ) + ), + code == "snps" ~ fs::path( + config$outdir, sample$patient_id, "Copynumber", sample$id, paste( + sample$patient_id, sample$id, "SNPs", "csv", "gz", + sep = "." + ) + ), + code == "tsnps" ~ fs::path( + config$outdir, sample$patient_id, "Copynumber", sample$id, paste( + sample$patient_id, sample$id, "tnSNP", "csv", "gz", + sep = "." + ) + ), + code == "ascat" ~ fs::path( + config$outdir, sample$patient_id, "Copynumber", sample$id, "ascat", paste( + sample$patient_id, sample$id, "ascat", "output", "qs", + sep = "." + ) + ), + code == "battenberg" ~ fs::path( + config$outdir, sample$patient_id, "Copynumber", sample$id, "battenberg", paste( + sample$patient_id, sample$id, "battenberg", "output", "qs", + sep = "." + ) + ), + code == "cna" ~ fs::path( + config$outdir, sample$patient_id, "Copynumber", sample$id, paste( + sample$patient_id, sample$id, "cna", "txt", + sep = "." + ) + ), + code == "asm_counts" ~ fs::path( + config$outdir, sample$patient_id, "AlleleSpecific", sample$id, paste( + sample$patient_id, sample$id, "asm_counts", "csv", "gz", + sep = "." + ) + ), + code == "asm_hap_stats" ~ fs::path( + config$outdir, sample$patient_id, "AlleleSpecific", sample$id, paste( + sample$patient_id, sample$id, "asm_hap_stats", "csv", "gz", + sep = "." 
+ ) + ), + code == "asm_phase_map" ~ fs::path( + config$outdir, sample$patient_id, "AlleleSpecific", sample$id, paste( + sample$patient_id, sample$id, "asm_phase_map", "csv", "gz", + sep = "." + ) + ), + code == "asm_counts" ~ fs::path( + config$outdir, sample$patient_id, "AlleleSpecific", sample$id, paste( + sample$patient_id, sample$id, "asm_counts", "csv", "gz", + sep = "." + ) + ), + code == "asm_snps" ~ fs::path( + config$outdir, sample$patient_id, "AlleleSpecific", sample$id, paste( + sample$patient_id, sample$id, "asm_snps", "txt", + sep = "." + ) + ), + code == "asm_cna" ~ fs::path( + config$outdir, sample$patient_id, "AlleleSpecific", sample$id, paste( + sample$patient_id, sample$id, "asm_cna", "txt", + sep = "." + ) + ), + code == "asm_meth" ~ fs::path( + config$outdir, sample$patient_id, "Methylation", sample$id, paste( + sample$patient_id, sample$id, "asm_meth", "csv", "gz", + sep = "." + ) + ), + code == "asm_meth_cna" ~ fs::path( + config$outdir, sample$patient_id, "Methylation", sample$id, paste( + sample$patient_id, sample$id, "asm_meth_cna", "csv", "gz", + sep = "." + ) + ), + code == "asm_meth_pure" ~ fs::path( + config$outdir, sample$patient_id, "Methylation", sample$id, paste( + sample$patient_id, sample$id, "asm_meth_pure", "csv", "gz", + sep = "." + ) + ), + code == "asm_ss_dmp" ~ fs::path( + config$outdir, sample$patient_id, "Methylation", sample$id, paste( + sample$patient_id, sample$id, "asm_ss_dmp", "csv", "gz", + sep = "." + ) + ), + code == "asm_dmp" ~ fs::path( + config$outdir, sample$patient_id, "Methylation", sample$id, paste( + sample$patient_id, sample$id, "asm_dmp", "csv", "gz", + sep = "." + ) + ) + ) + + if (dir) { + return(fs::path_dir(output_name)) + } + + # When outpath does not exist, an uninitialised character object is given. Replace with empty string. 
#' Split genome into segments for allele counting
#'
#' @param segments_file An RDS file containing a GRanges object with each chromosome region to split
#' @param n_seg_split An integer to split each chromosome segment
#' @return A plain list with one single-range GRanges per tile. An empty list is
#'   returned when the segments object contains no ranges.
#' @keywords internal
split_segments_gr <- function(segments_file, n_seg_split) {
  segs_gr <- readRDS(segments_file)

  # Split segments by n per chromosome and combine into a single GRanges.
  # Must be list and not GRangesList in order to combine downstream.
  segs_tile <- as.list(
    GenomicRanges::tile(segs_gr, n = n_seg_split)
  )
  segs_all <- do.call(c, segs_tile)

  # Convert into list of GRanges tiles for parallelising.
  # seq_along() (rather than seq(length())) keeps the result empty for empty input.
  lapply(seq_along(segs_all), function(i) segs_all[i])
}
#' Load CpG/SNP loci overlapping a segment
#'
#' Selects the per-chromosome loci files matching the chromosomes in `seg`
#' (X and Y are looked up as file numbers 23 and 24), loads each one and keeps
#' the loci that overlap `seg`.
#'
#' @param seg A GRanges object with the regions of interest.
#' @param loci_files Character vector of loci `.RData` file paths. Each file is
#'   expected to contain an object named `loci_subset`.
#' @return A data.table of overlapping loci whose first column is named `chrom`,
#'   or `NA` when no loci file matches or no locus overlaps the segment.
#' @export
load_loci_for_segment <- function(seg, loci_files) {
  # Select loci files that have chromosomes in seg
  chrom <- gsub("chr", "", unique(seqnames(seg)))
  file_number <- chrom
  file_number[chrom == "X"] <- "23"
  file_number[chrom == "Y"] <- "24"
  loci_file_regex <- paste0(".*\\.", file_number, "\\.RData", collapse = "|")
  loci_filenames <- loci_files[grepl(loci_file_regex, loci_files)]

  # Early exit if no loci files matched
  if (length(loci_filenames) == 0) {
    return(NA)
  }

  # Load each file into its own environment so the loaded object cannot clobber
  # local variables, then keep only loci overlapping the segment.
  loci_per_file <- lapply(loci_filenames, function(infile) {
    file_env <- new.env(parent = emptyenv())
    load(infile, envir = file_env)
    loci_subset <- file_env[["loci_subset"]]
    if (is.null(loci_subset)) {
      stop("Loci file does not contain `loci_subset`: ", infile, call. = FALSE)
    }
    hits <- subjectHits(findOverlaps(seg, loci_subset))
    data.frame(loci_subset[hits])
  })
  loci_dt <- data.table::rbindlist(loci_per_file)

  # Early exit if no loci are present in segment
  if (nrow(loci_dt) == 0) {
    return(NA)
  }

  # Format first column. Only works if data exists.
  names(loci_dt)[1] <- "chrom"

  return(loci_dt)
}
# Evan Miller's closed form solution for the probability that
# a draw from a beta dist is greater than another.
# Takes methylated/unmethylated counts for normal (M_n, UM_n) and bulk (M, UM).
# Each count is incremented by one before being used as a beta shape parameter.
# Returns one minus the closed-form sum, as a double. NA_real_ is returned when
# any count is missing, so vectorised results stay numeric even when every
# input is missing.
prob_diff_meth <- function(M_n, UM_n, M, UM) {
  # Return NA when counts are not given
  if (anyNA(c(M_n, UM_n, M, UM))) {
    return(NA_real_)
  }

  M_n <- M_n + 1
  UM_n <- UM_n + 1
  M <- M + 1
  UM <- UM + 1

  # Sum the series terms in log space for numerical stability
  j <- seq.int(0, round(M) - 1)
  log_vals <- (lbeta(M_n + j, UM_n + UM) - log(UM + j) -
    lbeta(1 + j, UM) - lbeta(M_n, UM_n))
  1 - sum(exp(log_vals))
}
I.e. if bulk is greater than normal then it's a hyper DMP + rm(phypo) + + DMP_b <- prob_to_call(prob_DMP, m_b_diff, effect_size = effect_size, prob = prob) + DMP_t <- prob_to_call(prob_DMP, m_t_diff, effect_size = effect_size, prob = prob) + + + res <- cbind( + pmeth, + data.table(prob_DMP, DMP_b, DMP_t, + ndmp_m = nmeth$m, + ndmp_cov = nmeth$cov, + ndmp_ml = nmeth$m_x_low, + ndmp_mh = nmeth$m_x_high + ) + ) + + return(res) +} + +prob_to_call <- function(p, mdiff, effect_size = 0.2, prob = 0.99) { + data.table::fcase( + p >= prob & mdiff >= effect_size, "hyper", + p >= prob & mdiff <= (-effect_size), "hypo" + ) +} + +#' Add CAMDAC region annotations to dt. +#' DT must have chrom, start, end +#' @noRd +annotate_dmp_regions <- function(dt, all_regions_anno) { + # Ensure chromosomes are correct format + dt[, chrom := factor(chrom, levels = c(1:22, "X", "Y"), ordered = TRUE)] + all_regions_anno[, chrom := factor(chrom, levels = c(1:22, "X", "Y"), ordered = TRUE)] + + # Overlap annotated regions and CpG methylation objects + setkey(dt, chrom, start, end) + setkey(all_regions_anno, chrom, start, end) + dt <- foverlaps(all_regions_anno, dt, type = "any", nomatch = NULL) + + # Order regions + dt <- dt[order( + as.numeric(cluster_id), + factor(chrom, levels = c(1:22, "X", "Y"), ordered = TRUE), + start, end + ), ] + return(dt) +} + +#' Count CpGs within DMP annotations +#' @keywords internal +get_cluster_counts <- function(dt) { + cluster_stats <- dt[, .SD, .SDcols = c("m_t", "m_n", "DMP_t", "cluster_id")] + cluster_counts <- cluster_stats[, .( + CpG_counts = length(DMP_t), + DMP_counts = length(DMP_t[!is.na(DMP_t)]), + consec_DMPs = max_consec_dmp(DMP_t) + ), by = "cluster_id"] + return(cluster_counts) +} + +#' Summarise CG stats per DMR +#' @keywords internal +collapse_cpg_to_dmr <- function(dt) { + dt <- dt[!is.na(DMR) & !is.na(DMP_t)] + dt <- dt[, + .( + m_n = mean(m_n), + m_n_low = mean(m_x_low_n), + m_n_high = mean(m_x_high_n), + m_t = mean(m_t), + m_t_low = 
# Reapply DMR annotations, which are currently lost when collapsing CpGs.
# The coordinate and CpG count columns of the annotation table are left out of
# the merge because `dt` already carries its own. The caller's annotation table
# is NOT modified: columns are selected into a new table instead of being
# deleted by reference, and columns that are absent are simply ignored.
re_annotate_dmrs <- function(dt, all_regions_anno) {
  redundant_cols <- c("chrom", "start", "end", "CpG_counts")
  keep_cols <- setdiff(names(all_regions_anno), redundant_cols)
  anno <- all_regions_anno[, keep_cols, with = FALSE]

  merge(dt, anno, all.x = TRUE, by = "cluster_id")
}
# Call DMRs on a CAMDAC DMP dataset, in parallel over subsets of the region
# annotations.
#   tmeth_dmps          DMP table passed to `call_dmr_routine`
#   regions_annotations region annotation table; split into chunks of
#                       `itersplit` rows via `make_split_factor`
#   min_DMP_counts, min_consec_DMP  thresholds forwarded to `call_dmr_routine`
#   n_cores             number of cores registered with doParallel
# Returns the row-bound results of `call_dmr_routine` over all chunks.
call_dmrs <- function(tmeth_dmps, regions_annotations, itersplit = 3e5, min_DMP_counts = 5, min_consec_DMP = 4, n_cores = 5) {
  # Split region annotations in order to parallelise over subsets
  split_factor <- make_split_factor(nrow(regions_annotations), itersplit)
  regions_annotations <- split(regions_annotations, split_factor)

  # Set warn=2 to ensure foreach fails if any of the parallel workers are terminated or raise a warning.
  # Without this option, foreach simply returns a warning and the pipeline continues.
  # Essential for memory warning terminations.
  # The caller's previous setting is restored on exit, including after an error.
  previous_options <- options(warn = 2)
  on.exit(options(previous_options), add = TRUE)

  # Calculate DMR data for CpGs in parallel. The cluster is stopped before the
  # warning level is restored (after = FALSE), matching the original order.
  doParallel::registerDoParallel(cores = n_cores)
  on.exit(doParallel::stopImplicitCluster(), add = TRUE, after = FALSE)

  dmrs <- foreach(regions_subset = regions_annotations, .combine = "rbind") %dopar% {
    call_dmr_routine(tmeth_dmps, regions_subset, min_DMP_counts, min_consec_DMP)
  }

  return(dmrs)
}
# Main

# Build per-read epiallele records for one genomic segment.
#   bam_file    BAM file to read
#   seg         segment passed to `get_reads_in_segments`
#   loci_dt     loci table for the segment, or a bare NA when none exist
#   paired_end  TRUE for paired-end libraries
#   drop_ccgg   forwarded to `annotate_bam_with_loci`
#   min_mapq    minimum mapping quality kept
#   min_cov     NOTE(review): accepted but not used in this function
# Returns the result of `format_get_epialleles_result`, or
# `empty_count_alleles_result()` whenever nothing remains to count.
cwrap_get_epialleles <- function(bam_file, seg, loci_dt = NA, paired_end, drop_ccgg,
                                 min_mapq = 1, min_cov = 3) {
  # Loci may be NA if loci for segment chromosome (i.e. chromY)
  # is missing. Return early in these cases as no alleles to count.
  # The type is checked first because is.na() on a table is not a scalar;
  # all() keeps the condition scalar for any logical input length.
  if (is.logical(loci_dt) && all(is.na(loci_dt))) {
    return(empty_count_alleles_result())
  }

  # Pre-applying multi-SNP loci filter
  loci_dt <- loci_dt[!duplicated(loci_dt, by = c("chrom", "start", "end"), fromLast = TRUE)]

  # Read BAM and annotate SNP and CPG loci
  bam_dt <- get_reads_in_segments(bam_file, seg, min_mapq, paired_end = paired_end)
  if (nrow(bam_dt) == 0) {
    return(empty_count_alleles_result())
  }

  # Overlap with loci
  bam_dt <- format_bam_for_loci_overlap(bam_dt, paired_end = paired_end)
  bam_dt <- annotate_bam_with_loci(bam_dt, loci_dt, drop_ccgg = drop_ccgg, paired_end = paired_end)
  bam_dt <- drop_positions_outside_segments(bam_dt, seg)
  if (nrow(bam_dt) == 0) {
    return(empty_count_alleles_result())
  }

  if (paired_end) {
    # For paired end samples, we must select a single read at overlapping and then fix the
    # strand column to reflect read orientation (as per single-end CAMDAC)
    bam_dt <- fix_pe_overlap_at_loci(bam_dt) # Filters
    bam_dt <- add_loci_read_position(bam_dt)
    bam_dt <- fix_pe_strand_with_flags(bam_dt)
    bam_dt <- get_alleles_and_qual(bam_dt)
    bam_dt <- drop_pe_fields(bam_dt)
  } else {
    bam_dt <- add_loci_read_position(bam_dt)
    bam_dt <- get_alleles_and_qual(bam_dt)
  }

  # Additional filtering
  bam_dt <- filter_clipped_dinucleotides(bam_dt)

  # Annotate (rather than filter on) base quality, so that the knowledge of
  # reference CG sites overlapping a read is not lost.
  hi_qual_dinucs <- data.table::like(bam_dt$qual.dinucs, "([5-9A-K:-@])([5-9A-K:-@])")

  # Set dinuc data to NA at positions where only SNP passed the quality filter
  # This keeps the SNP data for downstream BAF/LogR but excludes the site from
  # methylation rate calculations
  bam_dt[
    width >= 2 & !hi_qual_dinucs,
    c("alleles.dinucs", "qual.dinucs") := NA
  ]

  # Filter records for minimum mapping quality
  # Note: mq filtering can also be applied by ScanBam
  bam_dt <- bam_dt[mq >= min_mapq]

  if (nrow(bam_dt) == 0) {
    return(empty_count_alleles_result())
  }

  # Get nucleotide counts and flatten pileup
  bam_dt <- annotate_nucleotide_counts(bam_dt)

  # RE filters: Up to this point, reads only filtered, not CpG sites.
  # So all reads overlapping should be canonical.

  # At this stage, we have a mapping between reads and CpGs, with information on
  # whether the site is a CG SNP and the methylated/unmethylated counts.
  # We don't care about SNPs on the read for now in general, so remove these.
  bam_dt <- bam_dt[width > 1]

  # Now count the M or UM on the single CpGs, not on the pileup (as would be done in AC)
  bam_dt <- get_read_methylation_counts(bam_dt)

  bam_dt <- bam_dt[, chrom := gsub("chr", "", chrom)]

  # Apply CAMDAC rules to collapse the per-CpG rows into per-read epialleles
  res <- format_get_epialleles_result(bam_dt)

  return(res)
}
+ bam_dt[, all_counts := data.table::fcase( + is.na(ref), TGf + CAr + CGf + CGr, + # For CT/AG SNPs, expected nucleotides are not in total_counts because they + # confound bisulfite conversion, however we add them here for all_counts. + SNP %like% "([GA][AG])", Af + Gf + Ar + Gr, + SNP %like% "([CT][TC])", Cr + Tr + Tf + Cf, + !is.na(ref), total_counts # All other positions get ref/alt counts + )] + + # Set M from reads reporting methylation at CG dinucleotides, + # ignoring reads at CG-destroying SNPs. + # Note: In RRBS version, CCGGs must be matched to SNP positions with a +1 offset + bam_dt[ + width > 1, # Calculate for CpG loci only, setting non-CpGs to NA + M := data.table::fcase( + is.na(SNP), CGf + CGr, + # When CpG starts at C/T SNP loci, we can't differentiate SNP from bisulfite conversion. + # Therefore, count the reverse strand (bottom in directional library) only + start == POS & SNP %like% "([CT][TC])", CGr, + # When CpG ends at A/G SNP loci, we can't differentiate SNP from bisulfite conversion. + # Therefore, count the forward strand (top in directional library) only + end == POS & SNP %like% "([AG][GA])", CGf, + # Count CpG dinucleotides at all other SNPs loci. This works as fcase + # moves through conditions in order. + !is.na(SNP), CGf + CGr + ) + ] + + # Set UM as reads reporting unmethylated CG dinucleotides as above, + # ignoring reads at CG-destroying SNPs. 
+ bam_dt[ + width > 1, + UM := data.table::fcase( + is.na(SNP), TGf + CAr, + start == POS & SNP %like% "([CT][TC])", CAr, + end == POS & SNP %like% "([AG][GA])", TGf, + !is.na(SNP), TGf + CAr + ) + ] + + # Set methylation counts to NA at loci with insufficient data + bam_dt[ + M == 0 & UM == 0, + c("M", "UM") := NA + ] + + return(bam_dt) +} + +filter_loci_dt_multi_snp <- function(x) { + # Filter duplicated CpGs derived from CAMDAC annotations file + x[, msl := duplicated(x, by = c("chrom", "start", "end"), fromLast = F) | + duplicated(x, by = c("chrom", "start", "end"), fromLast = T)] +} + +# Function to read and write epiallele states from a data.table +write_epi_states <- function(x, outfile, inplace = F) { + if (inplace == F) { # Function below edits x in place. Decide whether to copy x or not + x <- copy(x) + } + # x: data.table returned from entropy allele counter. + # First, convert cg starts to pipe-separated string + x[, cgstarts := lapply(cgstarts, function(x) paste0(x, collapse = "|"))] + #  Next, convert states to pipe-separated string + x[, states := lapply(states, function(x) paste0(inverse.rle(x), collapse = "|"))] + # Finally, save to file + data.table::fwrite(x, outfile) +} + +read_epi_states <- function(infile) { + x <- data.table::fread(infile) + # First, convert cg starts to list + x[, cgstarts := lapply(cgstarts, function(x) strsplit(x, "|", fixed = TRUE))] + x[, cgstarts := lapply(cgstarts, function(x) as.numeric(x))] + + # Next, convert cgstates to list + x[, states := lapply(states, function(x) strsplit(x, "|", fixed = TRUE))] + suppressWarnings(x[, states := lapply(states, function(x) rle(as.numeric(x)))]) + + return(x) +} + +generate_epiallele_matrix <- function(x, chrom, region_start, region_end) { + # x is an epiallele counts table + # returns: A list with first entry as methylation matrix for the region, where + # 0:unmethylated, 1:methylated, 2:unknown, and NA:unmapped + # TODO: Do we want to restrict to reference genome hgs? 
+ # For example, CG-SNPs will always be NA and mess up entropy calculations + stopifnot(region_end > region_start) + + x <- x[chrom == chrom] + + # Get reads that overlap region + overlaps <- x[ + !(read.end < region_start | read.start > region_end) + ] + + # Return empty list if no reads overlap + if (nrow(overlaps) == 0) { + return(list()) + } + + # Get the set of start sites in the matrix + startsets <- unique(unlist(overlaps$cgstarts)) + st_order <- order(startsets) + startsets <- startsets[st_order] + ncgs <- length(startsets) + + # For each read, convert the states to a vector of length ncgs + state_vecs <- lapply( + seq(nrow(overlaps)), + function(i) { + # Get sites in the read's CpGs + cur_rle <- overlaps$states[[i]] + ssite <- inverse.rle(cur_rle) + ssite[is.na(ssite)] <- 2 #  Sites uncovered are set to 2 + + # Get vector of all sites in the region, setting to NA if not in read + rsite <- overlaps$cgstarts[[i]] + rsel <- match(rsite, startsets) # Vectorised which() + cbool <- rep(NA, ncgs) + cbool[rsel] <- ssite + + return(cbool) + } + ) + + state_matrix <- do.call(rbind, state_vecs) + colnames(state_matrix) <- startsets + rownames(state_matrix) <- overlaps$qname + + # Keep CpGs in region + col_sel <- startsets >= region_start & startsets <= region_end + state_matrix <- state_matrix[, col_sel] + + res <- list( + "matrix" = state_matrix, + "meta" = data.frame( + chrom = chrom, + region_start = region_start, + region_end = region_end, + startsets = startsets + ) + ) + + return(res) +} diff --git a/R/format_output.R b/R/format_output.R index e7dc30a..ccdf3b3 100755 --- a/R/format_output.R +++ b/R/format_output.R @@ -20,7 +20,7 @@ #' is desired in addition to GRanges object in .RData file #' #' @return Concatenated SNP and CpG information - +#' @keywords internal format_output <- function(patient_id, sample_id, sex, is_normal = FALSE, path, path_to_CAMDAC, @@ -91,7 +91,7 @@ format_output <- function(patient_id, sample_id, sex, path_to_CAMDAC = path_to_CAMDAC, 
outfile = outfile_prefix ) - cat("Msp1 fragments information obtained for patient\n") + logging::loginfo("Msp1 fragments information obtained for patient\n", logger="CAMDAC") rm(outfile_prefix) } @@ -105,7 +105,7 @@ format_output <- function(patient_id, sample_id, sex, #' @param outfile character srting with output filename #' #' @author elizabeth larose cadieux - +#' @keywords internal get_msp1_fragments <- function(dt, build, path_to_CAMDAC, outfile) { # Set build to to assembly version disregarging USCS vs. Ensembl if (build == "GRCH37") { @@ -120,9 +120,10 @@ get_msp1_fragments <- function(dt, build, path_to_CAMDAC, outfile) { path_to_CAMDAC, paste0("pipeline_files/msp1_fragments/msp1_fragments_RRBS_", build, ".fst") ) - fragments <- read_fst(path = msp1_fragments_file, as.data.table = TRUE) + fragments <- fst::read_fst(path = msp1_fragments_file, as.data.table = TRUE) # Assign CpG IDs + dt = data.table::data.table(dt) dt[, CpG_ID := paste(CHR, start, end, sep = "_")] dt <- dt[!duplicated(CpG_ID), ] @@ -216,8 +217,8 @@ get_msp1_fragments <- function(dt, build, path_to_CAMDAC, outfile) { # plot log10 fragment size distribution outfile <- paste0(outfile, "fragment_length_histogram.pdf") - p <- ggplot(df_fragments) + - geom_histogram(aes(x = l, y = ..count..), col = "cornflowerblue", fill = "white", bins = 100) + + p <- ggplot2::ggplot(df_fragments) + + geom_histogram(aes(x = l, y = ggplot2::after_stat(count)), col = "cornflowerblue", fill = "white", bins = 100) + theme_classic() + ylab("Number of fragments") + coord_cartesian(xlim = c(35, 1000)) + #+coord_cartesian(xlim=c(log10(40),3))+ @@ -232,10 +233,12 @@ get_msp1_fragments <- function(dt, build, path_to_CAMDAC, outfile) { } +#' @title Round2 #' @description Round numerical values to 'n' digits #' @param x Numerical vector containing the numbers to round #' @param digits Numerical value representing the number of decimal digits to retain #' @return rounded numerical vector +#' @keywords internal round2 <- 
function(x, digits) { ifelse(as.integer(x * (10^(digits + 1))) %% 10 >= 5, ceiling(x * (10^digits)) / (10^digits), floor(x * (10^digits)) / (10^digits)) } diff --git a/R/get_allele_counts.R b/R/get_allele_counts.R index 38ff5b1..50ea921 100755 --- a/R/get_allele_counts.R +++ b/R/get_allele_counts.R @@ -25,22 +25,30 @@ #' #' @return One .fst file including methylation info at CpGs and BAF and depth of coverage at #' SNPs for the ith subset of RRBS loci - +#' @keywords internal get_allele_counts <- function (i , patient_id, sample_id, sex, bam_file, mq=0, - path, path_to_CAMDAC, build=NULL, n_cores, test=FALSE, paired_end = TRUE){ + path, path_to_CAMDAC, build=NULL, n_cores, test=FALSE, paired_end = TRUE, segments_bed=NULL){ if(getOption("scipen")==0){options(scipen = 999)} # important to turn scientific notation off when saving genomic coordinates to .txt files - - # ensure mq is parsed as numerical value - mq <- as.numeric(mq) - cat("Mapping treshold MQ ≥ ",mq," applied","\nBase quality treshold BQ ≥ 20 applied\n", sep = "") - + # Set working directory path and create results folders # Do not change this - subsequent functions will look for files in this directory path_output <- file.path(path, patient_id, "Allelecounts", sample_id) suppressWarnings(dir.create(path_output, recursive = TRUE)) + # Return output file if it exists + f_nm <- file.path(path_output, paste(patient_id, ".", sample_id, ".", i, ".SNPs.CpGs.fst", sep = "")) + if (file.exists(f_nm)){ + logging::loginfo(paste0("Output counts file exists - Skipping: ", f_nm, "\n"), logger="CAMDAC") + return(NULL) + } + + # ensure mq is parsed as numerical value + mq <- as.numeric(mq) + logging::loginfo(paste0("Mapping treshold MQ ≥ ",mq," applied.","Base quality treshold BQ ≥ 20 applied"), logger="CAMDAC") + + # Load doParrellel if running job in parrallel if(n_cores > 1){ x <- c("doParallel", "parallel") @@ -60,7 +68,7 @@ get_allele_counts <- function (i , patient_id, sample_id, sex, bam_file, mq=0, build <- 
ifelse(unname(tmp)=="155270560", "hg19", "hg38") UCSC <- ifelse(substr(names(tmp), 1, 1)=="c", TRUE, FALSE) } - cat(paste("ScanBam pileup with build ", build, sep = " "), "\n", sep = "") + logging::loginfo(paste0("ScanBam pileup with build ", build, sep = " "), logger="CAMDAC") # Set build variables if(build%in%c("hg19","hg38")){UCSC=TRUE} @@ -76,6 +84,16 @@ get_allele_counts <- function (i , patient_id, sample_id, sex, bam_file, mq=0, f_name = paste(segments_file_path, "segments.",build, ".",i, ".RData", sep = "") load(f_name) rm(f_name) + + # If segments_bed is given, subset segments file to overlapping locations + if (!is.null(segments_bed)){ + regions = read_segments_bed(segments_bed) + segments_subset = segments_subset[queryHits(GenomicRanges::findOverlaps(segments_subset, regions) )] + } + if(length(segments_subset) == 0){ + logging::loginfo(paste0("No regions found for ", i, "."), logger="CAMDAC") + return(NULL) + } # Ensure that spurious alignments to Y in females are removed if(sex=="XX"&i==25){segments_subset<-segments_subset[!as.character(seqnames(segments_subset))%in%c("chrY","Y")]} @@ -196,7 +214,7 @@ get_allele_counts <- function (i , patient_id, sample_id, sex, bam_file, mq=0, if(UCSC == FALSE){seqlevelsStyle(loci_subset) <- "Ensembl"} # Extract info per CpG/SNP loci - overlaps <- mergeByOverlaps(gr_bam, loci_subset) + overlaps <- IRanges::mergeByOverlaps(gr_bam, loci_subset) rm(gr_bam, loci_subset) df_pileup <- data.table(qname=as.character(overlaps$qname), strand=as.character(strand(overlaps[,"gr_bam"])), @@ -597,7 +615,6 @@ get_allele_counts <- function (i , patient_id, sample_id, sex, bam_file, mq=0, # get_reads in parrallel if(n_cores>1){ - print(n_cores) # Set the cluster cl <- makeCluster(n_cores) @@ -633,9 +650,9 @@ get_allele_counts <- function (i , patient_id, sample_id, sex, bam_file, mq=0, invisible(gc()) # Create file - f_nm <- file.path(path_output, paste(patient_id, ".", sample_id, ".", i, ".SNPs.CpGs.fst", sep = "")) - 
write_fst(df_merged, f_nm) - cat(paste0("Written to: ", f_nm, "\n")) + + fst::write_fst(df_merged, f_nm) + logging::loginfo(paste0("Written to: ", f_nm), logger="CAMDAC") } # END diff --git a/R/get_differential_methylation.R b/R/get_differential_methylation.R index f3e456d..2644012 100755 --- a/R/get_differential_methylation.R +++ b/R/get_differential_methylation.R @@ -42,7 +42,7 @@ #' enhancer (vista and FANTOM5 annotation) #' #' @return Biologically significant DMPs, DMRs - +#' @keywords internal get_differential_methylation <- function(patient_id,sample_id,sex,normal_origin_proxy_id, path,path_to_CAMDAC,build, @@ -57,9 +57,9 @@ get_differential_methylation <- "As such, you cannot set sample_id to your normal cell of origin sample ID.", sep="\n")) } - if(detectCores()= prob & m_diff_tn <= (-effect_size), "hypo", NA))] # checkpoint - print(paste("DMPs annotated given prob = ", prob, " and minimum effect-size = ", - effect_size, sep=" ")) + logging::loginfo(paste("DMPs annotated given prob = ", prob, " and minimum effect-size = ", + effect_size, sep=" "), logger="CAMDAC") # save CAMDAC results per CpG output_file1 = file.path(path_output, "CAMDAC_results_per_CpG.RData") @@ -118,9 +118,9 @@ get_differential_methylation <- save(CAMDAC_results_per_CpG, file=output_file1) # checkpoint - cat(paste0("CAMDAC CpG-wise results saved at:\n", output_file1, - "\nThis includes copy number information, pure tumour ", - "methylation rates and DMP calls.\n")) + logging::loginfo(paste0("CAMDAC CpG-wise results saved at: ", output_file1, + ". 
This includes copy number information, pure tumour ", + "methylation rates and DMP calls."), logger="CAMDAC") rm(output_file1) # extract DMPs @@ -135,7 +135,7 @@ get_differential_methylation <- write.table(CAMDAC_DMPs, file=output_file2, sep='\t', col.names = TRUE, quote=FALSE) # checkpoint - cat(paste0("CAMDAC DMPs saved in BED5 format at ", output_file2, "\n")) + logging::loginfo(paste0("CAMDAC DMPs saved in BED5 format at ", output_file2, "\n"), logger="CAMDAC") rm(CAMDAC_DMPs, output_file2) # Extract DMPs and obtain summary stats @@ -161,13 +161,13 @@ get_differential_methylation <- write.table(tmp , file=output_file3, sep='\t', col.names = FALSE, quote=FALSE) # checkpoint - cat(paste0("\nDMP summary stats saved in ",output_file3,"\n")) + logging::loginfo(paste0("\nDMP summary stats saved in ",output_file3,"\n"), logger="CAMDAC") rm(nam, nams, vec, tmp, output_file3) # load annotations annotations_file = paste0(path_to_CAMDAC, "/pipeline_files/", build, "_annotations/", build, "_all_regions_annotations.fst") - all_regions_anno <- read_fst(path = annotations_file, as.data.table = TRUE) + all_regions_anno <- fst::read_fst(path = annotations_file, as.data.table = TRUE) all_regions_anno[, chrom := factor(chrom, levels=c(1:22,"X","Y"), ordered=TRUE)] ## Group CpGs into bins, get bin methylation info and annotate Ensembl features @@ -191,6 +191,9 @@ get_differential_methylation <- anno_list=all_regions_anno, min_DMP_counts=min_DMP_counts_in_DMR, min_consec_DMP=min_consec_DMP_in_DMR, n_cores=n_cores, bulk=bulk) + if (is.null(CAMDAC_DMRs)){ + return(NULL) + } colnames(CAMDAC_DMRs)[grepl("^i\\.", colnames(CAMDAC_DMRs))] <- c("start", "end") # set filenames and filepaths @@ -205,7 +208,7 @@ get_differential_methylation <- rm(dt, f_nm) # checkpoint - cat(paste0(prefix, " DMRs identified and saved at ", output_file4, "\n")) + logging::loginfo(paste0(prefix, " DMRs identified and saved at ", output_file4, "\n"), logger="CAMDAC") rm(output_file4) # set filenames and 
filepaths @@ -251,6 +254,7 @@ get_differential_methylation <- #' #' @return A data.table object with all the CpG loci, their coverage, counts #' methylated and methylation rate +#' @keywords internal get_DMPs <- function (path, patient_id, sample_id, df, prob=0.99, n_cores) { # Evan Miller's closed form solution for the probability that @@ -280,7 +284,7 @@ get_DMPs <- function (path, patient_id, sample_id, df, prob=0.99, n_cores) { result <- cbind(numeric(n)) # Get DMPs - result[,1] <- mcmapply(function(alpha_n,beta_n,alpha_b,beta_b) { + result[,1] <- parallel::mcmapply(function(alpha_n,beta_n,alpha_b,beta_b) { prob_hypo <- NULL prob_hypo <- h(alpha_n = alpha_n, beta_n=beta_n, alpha_b=alpha_b, beta_b=beta_b) if(is.null(prob_hypo)){prob_hypo <- NA} @@ -314,6 +318,7 @@ get_DMPs <- function (path, patient_id, sample_id, df, prob=0.99, n_cores) { #' @param n_cores number of cores for parallel processing #' #' @return A dataframe for each sample_id with the copy number calls added +#' @keywords internal bin_CpGs <- function (path, patient_id, sample_id, dt, anno_list, n_cores) { # Ensure dt is a data.table object @@ -339,9 +344,9 @@ bin_CpGs <- function (path, patient_id, sample_id, dt, anno_list, n_cores) { ids <- unique(ov$cluster_id) l = length(ids) - cat("Concatenate annotated bins\n") + logging::logdebug("Concatenate annotated bins", logger="CAMDAC") # concatenate annotated CpG methylation - dt_anno_bins <- rbindlist(mclapply(1:l, function(i, df, ids){ + dt_anno_bins <- rbindlist(parallel::mclapply(1:l, function(i, df, ids){ x <- df[cluster_id==ids[i], ] x <- x[, segment := paste0(chrom,":",seg_start,"-",seg_end)] y <- x[, .(m_n= mean(m_n, na.rm=TRUE), @@ -391,6 +396,7 @@ bin_CpGs <- function (path, patient_id, sample_id, dt, anno_list, n_cores) { #' @param n_cores number of cores for parallel processing #' #' @return A dataframe for each sample_id with the copy number calls added +#' @keywords internal get_DMRs <- function (path, patient_id, sample_id, dt, 
anno_list, min_DMP_counts, min_consec_DMP, n_cores, bulk=FALSE) { @@ -409,8 +415,8 @@ get_DMRs <- function (path, patient_id, sample_id, dt, anno_list, start,end),] # print analysis parameters - cat(paste0("DMR threholds","\n", "min DMP counts : ", min_DMP_counts ,"\n", - "min number of consecutive DMPs : ", min_consec_DMP, "\n")) + logging::loginfo(paste0("DMR thresholds.", "min DMP counts : ", min_DMP_counts ,". ", + "min number of consecutive DMPs : ", min_consec_DMP, "."), logger="CAMDAC") # annotations to be assigned anno_names <- c("all_CpGs", "intergenic", "intragenic", "CGI", "shore", "shelf", @@ -486,11 +492,19 @@ get_DMRs <- function (path, patient_id, sample_id, dt, anno_list, # extract all bin ids with coverage and number of unique bins ids <- results[!is.na(DMR), unique(cluster_id)] l <- length(ids) + + # Report and return if no DMRs found + if (length(ids) == 0){ + logging::loginfo("No DMRs found with the current parameters.", logger="CAMDAC") + return(NULL) + } else { + logging::loginfo(paste0("Number of DMRs found: ", length(ids), "\n"), logger="CAMDAC") + } - cat("Concatenate DMR calls \n") + logging::logdebug("Concatenate DMR calls.", logger="CAMDAC") # concatenate annotated CpG methylation if(bulk==FALSE){ - dt_DMRs <- rbindlist(mclapply(1:l, function(i, df, ids){ + dt_DMRs <- rbindlist(parallel::mclapply(1:l, function(i, df, ids){ x <- df[cluster_id==ids[i], ] x <- x[, segment := paste0(chrom,":",seg_start,"-",seg_end)] y <- x[, .(m_n= ifelse(sum(!is.na(DMP_t))==0, as.numeric(NA), mean(m_n[!is.na(DMP_t)])), @@ -510,7 +524,7 @@ get_DMRs <- function (path, patient_id, sample_id, dt, anno_list, return(y) }, df=ov, ids=ids, mc.cores=n_cores)) } else { - dt_DMRs <- rbindlist(mclapply(1:l, function(i, df, ids){ + dt_DMRs <- rbindlist(parallel::mclapply(1:l, function(i, df, ids){ x <- df[cluster_id==ids[i], ] x <- x[, segment := paste0(chrom,":",seg_start,"-",seg_end)] y <- x[, .(m_b= ifelse(sum(!is.na(DMP_b))==0, as.numeric(NA), 
mean(m_b[!is.na(DMP_b)])), @@ -603,11 +617,12 @@ get_DMRs <- function (path, patient_id, sample_id, dt, anno_list, # Plot summary methylation information with annotated information # Arguments: +#' @title Plot methylation information #' @param dt Data table with methylation information per CpG #' @param path Character path variable pointing to the desired working directory. #' @param bulk Logical determining whether the bulk or purified tumour is to be plotted #' @return NULL - +#' @keywords internal plot_methylation_info_with_anno <- function(dt, path, bulk){ # Set color code for hyper/hypo diff --git a/R/get_pure_tumour_methylation.R b/R/get_pure_tumour_methylation.R index 603fd11..10e5d03 100755 --- a/R/get_pure_tumour_methylation.R +++ b/R/get_pure_tumour_methylation.R @@ -32,7 +32,7 @@ #' enhancer (vista and FANTOM5 annotation) #' #' @return CAMDAC purified tumour methylation rates - +#' @keywords internal get_pure_tumour_methylation <- function(patient_id,sample_id,sex, normal_infiltrates_proxy_id, path,path_to_CAMDAC,build, @@ -44,9 +44,9 @@ get_pure_tumour_methylation <- function(patient_id,sample_id,sex, "samples as this sample is a proxy for the normal methylation rate", sep="\n")) } - if(detectCores()% + dplyr::filter(V1 == sample_id) %>% + unlist() %>% + as.character() + opt <- list() + opt$patient_id <- data[1] + opt$tumour_bam <- data[2] + opt$normal_bam <- data[3] + opt$sex <- data[4] + opt$reference_dir <- refdir + opt$outdir <- outdir + + tumour <- CamSample( + patient_id = opt$patient_id, + sex = opt$sex, + id = "T", + bam = opt$tumour_bam + ) + + normal <- CamSample( + patient_id = opt$patient_id, + sex = opt$sex, + id = "N", + bam = opt$normal_bam + ) + + config <- CamConfig( + refs = opt$reference_dir, + outdir = opt$outdir, + build = "hg38", + bsseq = "wgbs", + bsseq_lib = "pe", + n_cores = 10, + n_seg_split = 1000 + ) + + return(list(tumour, normal, config)) +} + +setup_cna_inject_subdir <- function(tumour, normal, config, subdir_name) { + # 
Name subdirectory + wgs_outdir <- fs::path(config$outdir, subdir_name) + + # Sym-link allele counts and tsnps file + allele_counts_t <- get_fpath(tumour, config, "counts") + wgs_allele_counts_t <- fs::path(wgs_outdir, gsub(config$outdir, "", allele_counts_t)) + + allele_counts_n <- get_fpath(normal, config, "counts") + wgs_allele_counts_n <- fs::path(wgs_outdir, gsub(config$outdir, "", allele_counts_n)) + + tsnps_f <- get_fpath(tumour, config, "tsnps") + wgs_tsnps_f <- fs::path(wgs_outdir, gsub(config$outdir, "", tsnps_f)) + + # Ensure parent directories exist fore ach new file + create_parent <- function(x) fs::dir_create(fs::path_dir(x)) + create_calls <- sapply( + c(wgs_allele_counts_t, wgs_allele_counts_n, wgs_tsnps_f), + create_parent + ) + + # Create symlinks and update config to new output directory + fs::file_copy(allele_counts_t, wgs_allele_counts_t, overwrite = T) + fs::file_copy(allele_counts_n, wgs_allele_counts_n, overwrite = T) + fs::file_copy(tsnps_f, wgs_tsnps_f, overwrite = T) + + # Update config outdir + config$outdir <- wgs_outdir + return(config) +} + +#' Cache existing CAMDAC results into a sub-directory so that the current ones can be +#' overwritten by the refitting pipeline +#' Decided this is unnecessary as the initial results were so wrong. 
+# setup_cache_refit <- function(tumour, normal, config, cache_path){ +# +# } + +#' Exported only for development +#' @keywords internal +helper_camdac_pileup <- function(bam_file, seg, loci_dt) { + paired_end <- T + drop_ccgg <- T + bam_dt <- CAMDAC:::get_reads_in_segments(bam_file, seg, paired_end = paired_end, min_mapq = 0) + if (nrow(bam_dt) == 0) { + return(empty_count_alleles_result()) + } + bam_dt <- CAMDAC:::format_bam_for_loci_overlap(bam_dt, paired_end = paired_end) + bam_dt <- CAMDAC:::annotate_bam_with_loci(bam_dt, loci_dt, drop_ccgg = drop_ccgg, paired_end = paired_end) + bam_dt <- CAMDAC:::drop_positions_outside_segments(bam_dt, seg) + bam_dt <- CAMDAC:::fix_pe_overlap_at_loci(bam_dt) + bam_dt <- CAMDAC:::add_loci_read_position(bam_dt) + bam_dt <- CAMDAC:::fix_pe_strand_with_flags(bam_dt) + bam_dt <- CAMDAC:::get_alleles_and_qual(bam_dt) + bam_dt <- CAMDAC:::drop_pe_fields(bam_dt) + bam_dt <- CAMDAC:::filter_clipped_dinucleotides(bam_dt) + bam_dt <- CAMDAC:::annotate_nucleotide_counts(bam_dt) + bam_dt$total_depth <- 1 + bam_dt <- CAMDAC:::get_snp_allele_counts(bam_dt) + return(bam_dt) +} + + +#' Parse ASCAT and Battenberg output directories to load CNA data +#' +#' See "annotate_copy_number" func +#' A function required to load copy number for a tumour sample from camdac, either ascat or bb, +#' result should be: chrom, start, end, nA, nB, CN (total), seg_min and seg_max. +#' This should also include the purity and ploidy. As a separate list? +#' note that seg_min and seg_max are actually duplicates of the start and end columns, required to +#' keep track of the ascat segment positions after overalp +#' WARN: This drops sex chromosome but not implimented. 
Also should drops CN=0 (hom del) regions +#' @export +load_cna_data <- function(tumour, config, data_type) { + if (data_type == "ascat") { + return( + load_cna_data_ascat(tumour, config) + ) + } else if (data_type == "battenberg") { + return( + load_cna_data_battenberg(tumour, config) + ) + } else { + stop("Invalid data type argument given") + } +} + + +load_cna_data_ascat <- function(tumour, config) { + ascat.output <- qs::qread(get_fpath(tumour, config, "ascat")) + purity <- ascat.output$aberrantcellfraction + ploidy <- ascat.output$ploidy + fit <- ascat.output$goodnessOfFit + + seg <- ascat.output$segments_raw + cna_clean <- data.table( + chrom = factor(seg$chr, levels = c(1:22, "X", "Y")), + start = as.numeric(as.character(seg$startpos)), + end = as.numeric(as.character(seg$endpos)), + major_cn = seg$nMajor, minor_cn = seg$nMinor + ) + + setkeyv(cna_clean, cols = c("chrom", "start", "end")) + + cna_clean$purity <- purity + cna_clean$ploidy <- ploidy + cna_clean$fit <- fit + cna_clean$pipeline <- "ascat" + return(cna_clean) +} + +load_cna_data_ascat_wgs <- function(ascat_output_file) { + load(ascat_output_file) + purity <- ascat.output$aberrantcellfraction + ploidy <- ascat.output$ploidy + fit <- ascat.output$goodnessOfFit + + seg <- ascat.output$segments_raw + cna_clean <- data.table( + chrom = factor(seg$chr, levels = c(1:22, "X", "Y")), + start = as.numeric(as.character(seg$startpos)), + end = as.numeric(as.character(seg$endpos)), + nA = seg$nMajor, nB = seg$nMinor + ) + + setkeyv(cna_clean, cols = c("chrom", "start", "end")) + + result <- list( + purity = purity, ploidy = ploidy, fit = fit, ascna = cna_clean + ) + + return(result) +} + +load_cna_data_battenberg <- function(tumour, config, bb_raw = FALSE, bb_dir = NA, cna_glob="*_copynumber.txt") { + # Allows us to use this helper for non-CAMDAC directories + if (is.na(bb_dir)) { + bb_dir <- fs::path_dir(get_fpath(tumour, config, "battenberg")) + } + + # Skip if expected files don't exist + pp_file <- 
fs::dir_ls(bb_dir, glob = "*purity_ploidy.txt*") + cna_file <- fs::dir_ls(bb_dir, glob = cna_glob) + if (length(pp_file) == 0 | length(cna_file) == 0) { + return(NULL) + } + + # Load purity and ploidy + pp <- data.table::fread(pp_file) + fit_file <- fs::dir_ls(bb_dir, glob = "*rho_and_psi.txt") + ff <- suppressWarnings(data.table::fread(fit_file)) + pp$fit <- round(ff[is.best == T]$distance * 100, 3) + + # Load cna file + cna_file <- cna_file[!grepl("chrX", cna_file)] + bb_cna <- data.table::fread(cna_file) + bb_cna_fields <- c( + "chr", "startpos", "endpos", "nMaj1_A", "nMin1_A", + "frac1_A", "nMaj2_A", "nMin2_A", "frac2_A" + ) + bb_cna <- bb_cna[, ..bb_cna_fields] + bb_cna <- bb_cna[!is.na(nMaj1_A)] + + # Ensure Maj/Min2 are integer. Required when NA in field for compleetely clonal sample is read + bb_cna$nMaj2_A <- as.integer(bb_cna$nMaj2_A) + bb_cna$nMin2_A <- as.integer(bb_cna$nMin2_A) + + # Select major copy number from joint solutions + bb_cna[, selector := (bb_cna$frac1_A > bb_cna$frac2_A) | is.na(bb_cna$frac2_A)] + bb_cna[, nA := data.table::fifelse( + selector, nMaj1_A, nMaj2_A + )] + bb_cna[, nB := data.table::fifelse( + selector, nMin1_A, nMin2_A + )] + + # Ensure nA is always nMajor + bb_cna[nA bb_cna$frac2_A) | is.na(bb_cna$frac2_A)] + bb_cna[, nA := data.table::fifelse( + selector, nMaj1_A, nMaj2_A + )] + bb_cna[, nB := data.table::fifelse( + selector, nMin1_A, nMin2_A + )] + bb_cna[, CN := nA + nB] + + # Finalise data for export + cna_clean <- bb_cna[, .( + chrom = factor(chr, levels = c(1:22, "X", "Y")), start = startpos, end = endpos, + nA, nB, CN + )] + setkeyv(cna_clean, cols = c("chrom", "start", "end")) + + cna_clean$sample_id <- sample_id + cna_clean$purity <- pp$purity + cna_clean$ploidy <- pp$ploidy + cna_clean$fit <- pp$fit + + return(cna_clean) +} + +camdac_winsorize_tsnps <- function(tumour, config) { + # Read data, winsorize BAF, delete outliers at 0/1 & write + tsnps_output_file <- CAMDAC::get_fpath(tumour, config, "tsnps") + tsnps <- 
data.table::fread(tsnps_output_file) + tsnps_hets <- tsnps[between(BAFr_n, 0.15, 0.85), .(chrom, POS, BAFr)] + baf_outliers <- winsorize(tsnps_hets$BAFr)$outliers + + # CAMDAC rule : We remove winsorize outliers that are within BAF 0 and 1 + # so that we aren't artificially removing 0s and 1s at imbalanced regions + tsnps_hets <- tsnps_hets[baf_outliers][BAFr == 0 | BAFr == 1] + + setkey(tsnps, chrom, POS) + tsnps <- tsnps[!tsnps_hets] + tsnps <- sort_genomic_dt(tsnps) + data.table::fwrite(tsnps, tsnps_output_file) + return(tsnps_output_file) +} + +chelper_import_pon_meth <- function(tumour, normal_id, config, pon_file) { + # Imports PON file into the same patient folder as the tumour sample. + normal_pon_tpid <- CamSample( + tumour$patient_id, tumour$sex, normal_id, "normal", NA + ) + outfile <- get_fpath(normal_pon_tpid, config, "meth") + outdir <- fs::path_dir(outfile) + fs::dir_create(outdir) + fs::file_copy(pon_file, outfile, overwrite = T) + return(normal_pon_tpid) # Normal sample for deconvolution +} + +read_segments_bed <- function(bed_file) { + # Read segments from bed file + segments <- data.table::fread(bed_file, header = FALSE) + # Read as GRanges + segments <- GRanges(segments$V1, IRanges(segments$V2, segments$V3)) + + # Combine overlapping segments + segments <- reduce(segments) + + # Convert to GRangesList + segments <- split(segments, seq_len(length(segments))) + + return(segments) +} + +fread_chrom <- function(x, ...) { + # Read a file with a chrom column and ensure it is a factor + # Used for counts and snp files where X and Y are missing + x <- data.table::fread(x, ...) + x$chrom <- as.character(x$chrom) + if (stringr::str_starts(x$chrom[[1]], "chr")) { + x$chrom <- gsub("chr", "", x$chrom) + } + return(x) +} + +#' Manually assign output file to CAMDAC sample +#' @param sample CamSample object +#' @param config CamConfig object +#' @param code Code for output file. See `vignettes("output")` for descriptions. 
+#' @param file Path to file to copy to expected location
+#' @return The value of `fs::file_copy()` for the copied file.
+#' @export
+attach_output <- function(sample, config, code, file) {
+  # Validate external CNA files before attaching to CAMDAC.
+  # validate_cna() is called only for its checks (it stops on a table with
+  # fewer than 7 columns); the file itself is copied unchanged below.
+  if (code %in% c("cna", "asm_cna")) {
+    validate_cna(data.table::fread(file))
+  }
+
+  # Get the expected output file path
+  dest <- get_fpath(sample, config, code)
+
+  # Ensure directory exists
+  fs::dir_create(fs::path_dir(dest))
+
+  # Copy file to expected location
+  fs::file_copy(file, dest, overwrite = TRUE)
+}
+
+# Validate CNA object
+# Returns NULL for NULL input. Otherwise renames the first seven columns to the
+# documented names, strips "chr" from chrom, converts chrom to a factor ordered
+# 1..22, X, Y and returns a data.table sorted by chrom and start.
+validate_cna <- function(cna) {
+  if (is.null(cna)) {
+    return(NULL)
+  }
+
+  # Check expected number of columns. Rename and clean as per docs.
+  stopifnot(ncol(cna) >= 7)
+  names(cna)[1:7] <- c("chrom", "start", "end", "major_cn", "minor_cn", "purity", "ploidy")
+  cna$chrom <- stringr::str_remove(cna$chrom, "chr")
+  cna$chrom <- factor(cna$chrom, levels = c(1:22, "X", "Y"))
+  cna <- as.data.table(cna)[order(chrom, start)]
+  return(cna)
+}
+
+# Get ASM snps as granges
+# Reads the sample's "asm_snps" file and returns one width-1 range per SNP with
+# UCSC-style sequence names and metadata columns ref, alt and hap_id.
+load_asm_snps_gr <- function(camsample, config) {
+  # Load ASM SNPs from file
+  hets_file <- get_fpath(camsample, config, "asm_snps")
+  snps <- data.table::fread(hets_file)
+  snps_gr <- GRanges(
+    seqnames = snps$chrom,
+    ranges = IRanges(start = snps$pos, end = snps$pos),
+    strand = "*"
+  )
+  seqlevelsStyle(snps_gr) <- "UCSC"
+  snps_gr$ref <- snps$ref
+  snps_gr$alt <- snps$alt
+  # seq_along() yields an empty index for an empty SNP set, where
+  # seq(length(x)) would yield c(1, 0).
+  snps_gr$hap_id <- seq_along(snps_gr)
+
+  return(snps_gr)
+}
+
+camdac_to_bedgraph <- function(x, outfile) {
+  # Convert a CAMDAC allele counts file to bedgraph format.
+  # Output is a headerless tab-delimited file with the fields
+  # 1=chr, 2=start, 3=M, 4=UM.
+  # Returns `outfile`, or NULL (after logging a warning) when the file name
+  # does not match a known CAMDAC output.
+
+  # Set coverage and methylation-rate fields based on input file name.
+  # The first matching suffix wins.
+  fields <- if (stringr::str_ends(x, "SNPs.CpGs.all.sorted.csv.gz")) {
+    c("total_counts_m", "m")
+  } else if (stringr::str_ends(x, "m.csv.gz")) {
+    c("cov", "m")
+  } else if (stringr::str_ends(x, "pure.csv.gz")) {
+    c("cov_t", "m_t")
+  } else {
+    NULL
+  }
+  if (is.null(fields)) {
+    # logging::logwarn() formats sprintf-style, so the path goes in as an argument.
+    logwarn("Unrecognised file format. No bedgraph created for %s", x)
+    return(NULL)
+  }
+  # Column names stay as strings: `[[` does not accept symbols.
+  counts_field <- fields[1]
+  m_field <- fields[2]
+
+  # Read data
+  x <- data.table::fread(x)
+  # Ensure UCSC format
+  is_ucsc <- grepl("chr", x$chrom[[1]])
+  if (!is_ucsc) {
+    x$chrom <- paste0("chr", x$chrom)
+  }
+
+  # Drop any SNP-only positions
+  x <- x[(end - start) > 0, ]
+
+  # Set start for CCGG sites (from RRBS)
+  x[, nstart := ifelse((end - start) == 3, start + 1, start)]
+
+  # Calculate methylated and unmethylated counts from coverage and rate
+  xbg <- x[, .(chrom, nstart)]
+  xbg$M <- round(x[[counts_field]] * x[[m_field]], 0)
+  xbg$UM <- x[[counts_field]] - xbg$M
+  xbg <- xbg[!is.na(M) & !is.na(UM), ]
+
+  # Write out to bedgraph
+  data.table::fwrite(xbg, outfile, sep = "\t", quote = FALSE, col.names = FALSE)
+
+  return(outfile)
+}
+
diff --git a/R/imports.R b/R/imports.R
new file mode 100644
index 0000000..0bbeef0
--- /dev/null
+++ b/R/imports.R
@@ -0,0 +1,101 @@
+# Import calls make package functions available to CAMDAC internal functions without :: syntax.
+# NULL required for roxygen to generate these import calls in NAMESPACE after devtools::document()
+
+#' @import data.table
+#' @import ggplot2
+#' @import foreach
+#' @import doParallel
+#' @import GenomicRanges
+#' @import GenomicAlignments
+#' @import logging
+#' @import ggplot2
+#' @import dplyr
+#' @import gridExtra
+#' @import stringr
+#' @import png
+#' @import MASS
+#' @import GenomeInfoDb
+#' @importFrom IRanges IRanges
+#' @import S4Vectors
+#' @import Rsamtools
+#' @importFrom grDevices adjustcolor dev.off png rgb
+#' @importFrom graphics abline axis par plot points rect text
+#' @importFrom stats cor frequency lm median na.omit optimize qbeta rbeta runif setNames
+#' @importFrom utils read.table write.table
+NULL
+
+# Global variables used in packages like data.table with non-standard evaluation.
+# utils::globalVariables stops errors in devtools::check() and submissions to CRAN.
+utils::globalVariables( + c( + "ref.snp", + "ref", + "alt.snp", + "alt.af", + "chrom", + "start", + "end", + "nA", + "nB", + "seg", + "i", + "DMR", + "DMP_t", + "m_n", + "m_x_low_n", + "m_x_high_n", + "m_t", + "m_t_low", + "m_t_high", + "prob_DMP", + "m", + "CG_CN", + "nA", + "nB", + "segment", + "chrom", + "i.start", + "i.end", + "ref", + "alt", + "total_counts", + "alt_counts", + "LogR_corr", + "LogR", + "GC", + "repli", + "POS", + "ref_counts", + "dt_chrom", + "#CHR", + "Count_A", + "Count_C", + "Count_G", + "Count_T", + "Good_depth", + "..outfile_columns", + "normalLogR", + "p", + "cov", + "cov_t", + "m_t", + "end", + "width", + "read.start", + "CN", + "m_t_raw", + "groupid", + "N", + "flag", + "Af", "Ar", "Cf", "Cr", "Gf", "Gr", "Tf", "Tr", "CGf", "CGr", "TGf", "CAr", "CCGG", "mq", + "POS", "qwidth", "..keep_columns", "CHR", "alleles.dinucs", "qual.dinucs", "gr_normal", + "gr_tumor", "M", "UM", "total_counts_m", "M_snuc", "mate_status", "total_depth", + "BAFloci", ".", "flag", "Frequency", "BAF", "cov", "rstart", "rend", "rwidth", + "snp_width", "qname", "ah_strand", "snp_start", "strand", "cigar", "ID", "QUAL", + "FILTER", "INFO", "FORMAT", "strand", "i.strand", "..bam_cols", "cg_snp", + "CG_CN_n", "cluster_id", "seqnames", "gc_file", "BAFr", "total_counts_n", "alleles.SNP", + "..cell_line_cols", "AF", "qual", "qual.SNP", "SNP", "UM_snuc", "all_counts", "other_counts", + "selector", "nMaj1_A", "nMaj2_A", "nMin1_A", "nMin2_A", "chr", "startpos", "endpos", + "is.best", "..bb_cna_fields", "rBAF", "BAF_n", "count", "density" + ) +) \ No newline at end of file diff --git a/R/methylation.R b/R/methylation.R new file mode 100755 index 0000000..dcf5b3c --- /dev/null +++ b/R/methylation.R @@ -0,0 +1,463 @@ +#' Pre-process CAMDAC methylation data + +# Code ---- + +#' Calculate HDI interval width +#' @noRd +#' @keywords internal +intervalWidth <- function(lowTailPr, ICDFname, credMass, ...) { + ICDFname(credMass + lowTailPr, ...) - ICDFname(lowTailPr, ...) 
+}
+
+#' HDI of ICDF
+#' @param ICDFname The inverse cumulative density function of the distribution.
+#' @param credMass The desired mass of the HDI region.
+#' @param tol Tolerance parameter for optimisation. The lower the tolerance, the
+#' longer the optimisation, but the higher the accuracy.
+#' According to CAMDAC RRBS comments, tol=1e-4 gives values
+#' of the same accuracy as our max resolution.
+#' @param ... Distribution parameters passed on to `ICDFname`.
+#' This function is adapted from Greg Snow's TeachingDemos package
+#' E.g. Determine HDI of a M=30 and UM=12 CpG
+#' Adding 1 to shape parameter ensures uniform beta(1,1) is updated with our counts
+#' HDIofICDF(qbeta,shape1 = 30+1 , shape2 = 12+1 )
+#' @return One-row data.frame with the HDI limits in columns `lo` and `high`.
+#' @keywords internal
+HDIofICDF <- function(ICDFname, credMass = 0.99, tol = 1e-4, ...) {
+  incredMass <- 1.0 - credMass
+
+  # Here, shape parameters are passed to ICDFname function via `...`
+  # The optimiser searches for the lower tail probability giving the
+  # narrowest interval that holds `credMass`.
+  optInfo <- optimize(f = intervalWidth, interval = c(0, incredMass), ICDFname = ICDFname, credMass = credMass, tol = tol, ...)
+
+  HDIlowTailPr <- optInfo$minimum
+  vec <- setNames(object = ICDFname(c(HDIlowTailPr, credMass + HDIlowTailPr), ...), nm = c("low", "high"))
+  return(data.frame(lo = vec[[1]], high = vec[[2]]))
+}
+
+# Calculate HDI counts for unique combinations of records to speed up processing time
+# Returns a data.table with columns M, UM and the HDI columns from hdi_qbeta(),
+# one row per unique (M, UM) pair.
+unique_calculate_counts_hdi <- function(M, UM, n_cores = 1, itersplit = 5e5) {
+  # Itersplit default: Benchmarking found 500K cpgs takes ~1min
+  # Validate M-length
+  inp_len <- length(M)
+  stopifnot(inp_len == length(UM))
+
+  # Get unique pairs to save computation
+  udata <- unique(data.table(M = M, UM = UM))
+  M <- udata$M
+  UM <- udata$UM
+  inp_len <- length(M)
+  rm(udata)
+
+  # Split data for parallel chunks
+  split_factor <- make_split_factor(inp_len, itersplit)
+  M <- split(M, split_factor)
+  UM <- split(UM, split_factor)
+
+  # Calculate HDI parallel. Release the implicit cluster when the function
+  # exits, including when a worker raises an error.
+  doParallel::registerDoParallel(cores = n_cores)
+  on.exit(doParallel::stopImplicitCluster(), add = TRUE)
+  # Each iteration receives one chunk of M and the matching chunk of UM
+  hdi <- foreach(M = M, UM = UM) %dopar% {
+    hdi_qbeta(M, UM)
+  }
+
+  # Bind result as data.table
+  hdi <- data.table::rbindlist(hdi)
+  res <- cbind(
+    data.table(M = unlist(M), UM = unlist(UM)),
+    hdi
+  )
+
+  return(res)
+}
+
+calculate_counts_hdi <- function(M, UM, n_cores = 1, itersplit = 5e5) {
+  # Calculate HDI and bind to original data. Adds columns "m_x_low" and "m_x_high"
+  # Returns a two-column data.table in the same row order as the inputs.
+
+  # Nothing to compute for empty input; return the empty result shape directly.
+  if (length(M) == 0) {
+    stopifnot(length(UM) == 0)
+    return(data.table(m_x_low = numeric(0), m_x_high = numeric(0)))
+  }
+
+  u_hdi <- unique_calculate_counts_hdi(M, UM, n_cores = n_cores, itersplit = itersplit)
+  u_hdi <- round(u_hdi, digits = 5)
+  # Combine original data with HDI in order. `i` records the input row so
+  # the order can be restored after the merge sorts by key.
+  hdi_data <- merge(
+    data.table(M = M, UM = UM, i = seq_along(M)),
+    u_hdi,
+    all.x = TRUE,
+    by = c("M", "UM")
+  )
+  names(hdi_data) <- c("M", "UM", "hdi_i", "m_x_low", "m_x_high")
+  hdi_data <- hdi_data[order(hdi_i), .(m_x_low, m_x_high)]
+  # Return HDI data alone (able to cbind original data)
+  return(hdi_data)
+}
+
+# Return allele counts data restricted to methylation sites and formatted for
+# downstream deconvolution and DMP identification
+process_methylation <- function(allele_counts, min_meth_loci_reads = 3) {
+  # Limit data to CpG/CCGG sites with methylation data.
+  # Note the comparison is strict: sites need more than `min_meth_loci_reads` reads.
+  methyl <- allele_counts[width > 1 & !is.na(m) & total_counts_m > min_meth_loci_reads]
+  rm(allele_counts)
+
+  # Annotate heterozygous SNPs for downstream CG-SNP investigation
+  # FEATURE: BAF is already counted in allele_counts,
+  # therefore this label may be more appropriate earlier in the pipeline
+  methyl[, SNP := fifelse(!is.na(BAF) & (BAF >= 0.15) & (BAF <= 0.85), 1, 0)]
+
+  # Add column of methylation coverage as cov.
+  methyl[, cov := total_counts_m]
+
+  # Add CG-SNP status (CG-forming or destroying). Required for accurate CG-copy number assignment
+  methyl[, cg_snp := classify_cg_snp(start, width, POS, ref, alt)]
+
+  # Subset to required output columns
+  methyl <- methyl[, .(chrom, start, end, M, UM, m, cov, SNP, BAF, cg_snp)]
+
+  # Return data
+  return(methyl)
+}
+
+
+
+# Write the methylation table to the sample's expected "meth" output path.
+save_methylation_df <- function(methyl, sample, config) {
+  output_file <- get_fpath(sample, config, "meth")
+  data.table::fwrite(methyl, file = output_file)
+}
+
+# A key result of run_methylation_data_processing is the dt_tumour_and_normal_m.RData object
+# I would like to see how far I can get without combining these two, but rather working with the data separately (memory issues afterall)
+# I.e.
can differential methylation analysis start by loading each separately?
+# Join tumour and normal methylation tables on exact CpG coordinates.
+# Normal columns other than the coordinates get an "_n" suffix. Only sites
+# present in both tables are returned, keyed by chrom, start, end.
+combine_tumour_normal_methylation <- function(t_meth, n_meth) {
+  new_names <- sapply(names(n_meth), function(x) {
+    ifelse(x %in% c("CHR", "chrom", "start", "end"), x, paste0(x, "_n"))
+  })
+  names(n_meth) <- new_names
+
+  # Set keys for merge (sorts table internally)
+  # We first correct orderings for chrom fields by making them factors
+  t_meth$chrom <- factor(t_meth$chrom, levels = c(1:22, "X", "Y"))
+  n_meth$chrom <- factor(n_meth$chrom, levels = c(1:22, "X", "Y"))
+  setkey(t_meth, chrom, start, end)
+  setkey(n_meth, chrom, start, end)
+
+  # Combine into a single CpG table.
+  # The normal table's own coordinates come back as i.start / i.end and are
+  # dropped below; its other columns carry the "_n" suffix set above.
+  # nomatch=0 drops mismatching sites, rather than retaining them as NA
+  # type="equal" searches for exact CpG range matches
+  meth_c <- foverlaps(n_meth, t_meth, nomatch = 0, type = "equal")
+  meth_c$i.start <- NULL
+  meth_c$i.end <- NULL
+  setkey(meth_c, chrom, start, end)
+  return(meth_c)
+}
+
+# Attach copy-number segments to CpG sites and derive the CG copy number
+# (CG_CN), the normal CG copy number (CG_CN_n) and the purity column `p`.
+# The first five columns of `cna` are taken to be chrom, start, end, major
+# and minor copy number; `cna$purity` must exist.
+annotate_cgs_with_cnas <- function(meth_c, cna) {
+  # Format cna names, allowing datasets to be independent
+  names(cna)[1:5] <- c("chrom", "start", "end", "nA", "nB")
+  setkey(cna, chrom, start, end)
+  # Add additional columns so segments can be referenced elsewhere in code
+  cna$seg_start <- cna$start
+  cna$seg_end <- cna$end
+  cna$CN <- cna$nA + cna$nB
+  meth_cna <- foverlaps(cna, meth_c, nomatch = 0)
+  meth_cna[, i.start := NULL]
+  meth_cna[, i.end := NULL]
+
+  # Set CG copy number using a BAF threshold of 0.5.
+  # At CG-SNP sites, reads will only contain CGs depending on whether the site is a CG-forming or CG-destroying SNP.
+  # Therefore the methylation copy number at these loci will reflect the major or minor allele.
+  meth_cna[, CG_CN := data.table::fcase(
+    # If not a CG-SNP, take total copy number
+    is.na(BAF), CN,
+    # Take major if majority allele contains CG (CG-destroying with low BAF or CG-forming with high BAF)
+    (BAF <= 0.5 & cg_snp == "D") | (BAF >= 0.5 & cg_snp == "F"), nA,
+    # Take minor if majority allele does not contain CG (CG-destroying with high BAF or CG-forming with low BAF)
+    (BAF > 0.5 & cg_snp == "D") | (BAF < 0.5 & cg_snp == "F"), nB,
+    # Set to total CN for any other case
+    rep(TRUE, nrow(meth_cna)), CN
+  )]
+
+  # Set normal CG copy number.
+  # CG_CN_n is 1 wherever BAF is present (CG-destroying/forming heterozygous SNPs), otherwise 2
+  meth_cna[, CG_CN_n := data.table::fifelse(is.na(BAF), 2, 1, na = NA)]
+  # Set normal copy number on sex chromosome X in MALES
+  # In males, it has CN=1 outside PAR regions and 2 within.
+  # TODO: meth_cna <- calculate_cg_cn_norm(meth_cna, patient_sex, reference_genome)
+
+  # Add overall tumour purity (first value of the purity column, applied to all rows)
+  meth_cna[, p := cna$purity[[1]]]
+  return(meth_cna)
+}
+
+# Classify each site as CG-forming ("F"), CG-destroying ("D") or NA.
+# Returns a factor with levels "F" and "D".
+# TODO: Include CCGG sites in this function for RRBS
+classify_cg_snp <- function(start, width, POS, ref, alt) {
+  cg_snp_class <- data.table::fcase(
+    # Only sites of width 2 with a non-NA POS are classified.
+    # Return NA for sites that do not meet these criteria
+    width != 2 | is.na(POS), NA_character_,
+    # CG-destroying SNPs have a reference C at CG-start or G at CG-end.
+    (start == POS & ref == "C") | (start + 1 == POS & ref == "G"), "D",
+    # CG-forming SNPs have an alt C at the CG-start or G at the CG-end.
+    (start == POS & alt == "C") | (start + 1 == POS & alt == "G"), "F",
+    default = NA_character_
+  )
+  return(
+    factor(cg_snp_class, levels = c("F", "D", NA_character_))
+  )
+}
+
+# Deconvolve the pure tumour methylation rate from the bulk rate `mb` and the
+# normal rate `mn`, given purity `p` and tumour copy number `CN`.
+calculate_mt <- function(mb, mn, p, CN) {
+  tumour_frac <- p * CN
+  normal_frac <- (1 - p) * 2 # Normal CN assumed to be 2
+  mt <- ((
+    mb * (tumour_frac + normal_frac)
+  ) - (
+    mn * normal_frac
+  )) /
+    tumour_frac
+  return(mt)
+}
+
+calculate_mt_cov <- function(cov_b, p, CN) {
+  # Effective tumour coverage estimated by deconvolving bulk coverage
+  # The fraction of reads reporting the tumour is a function of the tumour purity,
+  # and copy number. If CN was not included two sites with the same purity, cov_b and CN_norm can
+  # differ by CN and be incorrectly inferred
+  tumour_frac <- p * CN
+  normal_frac <- (1 - p) * 2 # Normal CN assumed to be 2
+  cov_t <- round(
+    cov_b * tumour_frac / (tumour_frac + normal_frac),
+    digits = 0
+  )
+  return(cov_t)
+}
+
+# Add m_t_raw (deconvolved rate), m_t (rate clipped to [0, 1]) and cov_t
+# (deconvolved coverage) to `meth_c` by reference, then return it.
+deconvolve_bulk_methylation <- function(meth_c) {
+  # Deconvolve methylation
+  meth_c[, m_t_raw := calculate_mt(
+    m, m_n, p, CG_CN
+  )]
+
+  # Correct pure tumour methylation rates set outside 0 and 1 after deconvolution
+  meth_c[, m_t := data.table::fcase(
+    m_t_raw < 0, 0,
+    m_t_raw > 1, 1,
+    rep(TRUE, nrow(meth_c)), m_t_raw
+  )]
+
+  # Calculate tumour coverage by deconvolution
+  meth_c[, cov_t := calculate_mt_cov(cov, p, CG_CN)]
+
+  return(meth_c)
+}
+
+# Keep sites with usable deconvolved methylation.
+filter_deconvolved_methylation <- function(meth_c) {
+  meth_c[
+    CN > 0 & # Remove homozygous deletions
+      cov_t >= 3 & # Remove low-coverage CpGs (after deconvolution)
+      !is.na(m_t_raw) # Capture any errors in deconvolution. Should be none!
+  ]
+}
+
+#' Calculate HDI by simulation
+#'
+#' Runs `vec_HDIofMCMC_mt()` over chunks of `meth_c` in parallel and appends
+#' the resulting `m_t_low` and `m_t_high` columns.
+#' @param meth_c Table with columns M, UM, M_n, UM_n, p and CG_CN.
+#' @param n_cores Number of cores registered with doParallel.
+#' @param itersplit Chunk size passed to `make_split_factor()`.
+#' @return `meth_c` with the two HDI columns bound on.
+#' @keywords internal
+calculate_m_t_hdi <- function(meth_c, n_cores, itersplit = 1e5) {
+  inp_len <- nrow(meth_c)
+  split_factor <- make_split_factor(inp_len, itersplit)
+
+  msplit <- iterators::isplit(meth_c, split_factor)
+
+  # Calculate HDI
+  doParallel::registerDoParallel(cores = n_cores)
+  hdi_all <- foreach(v = msplit, .combine = "rbind") %dopar% {
+    x <- v$value
+    hdi <- vec_HDIofMCMC_mt(
+      M_b = x$M,
+      UM_b = x$UM,
+      M_n = x$M_n,
+      UM_n = x$UM_n,
+      p = x$p,
+      CN = x$CG_CN,
+      credMass = 0.99
+    )
+    colnames(hdi) <- c("m_t_low", "m_t_high")
+    return(hdi)
+  }
+  doParallel::stopImplicitCluster()
+
+  meth_c <- cbind(meth_c, hdi_all)
+  return(meth_c)
+}
+
+#' Calculate HDI by simulation
+#'
+#' Computes highest density interval from a sample of representative values,
+#' estimated as shortest credible interval for a unimodal distribution
+#'
+#' @param M_b counts methylated in the bulk tumour sample
+#' @param UM_b counts unmethylated in the bulk tumour sample
+#' @param M_n counts methylated in the normal
+#' @param UM_n counts unmethylated in the normal
+#' @param p tumour purity
+#' @param CN tumour copy number at the site (normal copy number is fixed at 2)
+#' @param credMass default is 0.99
+#' credMass is a scalar between 0 and 1, indicating the mass within the
+#' credible interval that is to be estimated.
+#' @return Value: HDIlim is a vector containing the limits of the HDI, named
+#' `lower` and `upper`. Both are NA if any input is NA.
+#' @keywords internal
+HDIofMCMC_mt <- function(M_b, UM_b, M_n, UM_n, p, CN, credMass = 0.99) {
+  # Cannot calculate if counts are not given. Return NA in this instance
+  if(any(is.na(c(M_b, UM_b, M_n, UM_n, p, CN)))){
+    return(c(lower=NA, upper=NA))
+  }
+
+  # Simulate beta distributions from bulk and normal methylation
+  # (2000 random draws each, so results vary between calls unless a seed is set)
+  bulk_dist <- rbeta(n = 2000, shape1 = M_b + 1, shape2 = UM_b + 1)
+  normal_dist <- rbeta(n = 2000, shape1 = M_n + 1, shape2 = UM_n + 1)
+
+  # Get constants for effect of purity and copy number
+  tumour_frac <- p * CN
+  normal_frac <- (1 - p) * 2 # Normal CN assumed to be 2
+  bulk_constant <- tumour_frac + normal_frac
+
+  # Deconvolve methylation rates from monte carlo simulation to approximate mt distribution
+  mt_dist <- ((bulk_dist * bulk_constant) - (normal_dist * normal_frac)) / tumour_frac
+
+  # Calculate the HDI of mt from the simulation
+  # See (Kruschke, K., 2015, Doing Bayesian Data Analysis, 721–736)
+  mt_dist <- sort(mt_dist)
+  # number of sorted draws spanned by an interval holding credMass of the sample
+  ci_idx_length <- ceiling(credMass * length(mt_dist))
+  # widths of every candidate interval spanning that many draws
+  ciWidth <- diff(x = mt_dist, lag = ci_idx_length)
+  # Return HDI: the narrowest candidate interval
+  HDIlim <- (c(
+    lower = mt_dist[which.min(ciWidth)],
+    upper = mt_dist[which.min(ciWidth) + ci_idx_length]
+  ))
+
+  return(HDIlim)
+}
+
+# Vectorise HDI simulation
+# Applies HDIofMCMC_mt element-wise over its vector arguments and returns one
+# row per input element with the lower and upper limits.
+vec_HDIofMCMC_mt <- function(...) {
+  vf <- Vectorize(HDIofMCMC_mt) # Accepts vectors with dim(y,0)
+  result <- vf(...) # Returns a matrix with dim(x ,y)
+  # Transpose result to return rows corresponding to original data: dim(y,x)
+
+  # Vectorize usually returns a matrix, but it returns a list of vectors when
+  # the per-element results cannot be simplified (e.g. differing lengths).
+  # This is a workaround to ensure that the result is always a matrix.
+  if ("list" %in% class(result)) {
+    # Unfortunately, sites with no result are returned simply as numeric(0).
+    # Set these to appropriate NA values before reducing, otherwise they are dropped by R.
+    z <- sapply(result, length) != 2
+    result[z] <- list(c(NA, NA))
+    out <- Reduce(rbind, result)
+    return(out)
+  } else {
+    # If the result is not a list, it is a matrix
+    # The matrix must be transposed
+    out <- t(result)
+  }
+
+  return(out)
+}
+
+# Helper function to split data
+# Returns a sorted vector of chunk ids of length `nrows`, suitable as the
+# factor argument of split(), so that data is split into contiguous chunks.
+make_split_factor <- function(nrows, itersplit) {
+  # Number of chunks: one when the data fits in a single chunk, otherwise
+  # the rounded ratio of rows to chunk size. Both inputs are scalars, so a
+  # plain if/else is used.
+  n_chunks <- if (nrows < itersplit) 1 else round(nrows / itersplit, 0)
+
+  # Repeat the chunk ids to the data length, then sort so that rows are
+  # assigned to chunks in their original order.
+  sort(rep_len(seq_len(n_chunks), nrows))
+}
+
+
+# Calculate HDI of mt by normal approximation
+# Inputs are bulk (m, um) and normal (mn, umn) counts, purity p and copy
+# number CN. Returns a data.table with columns m_t_low and m_t_high, the
+# 0.5% and 99.5% quantiles of the approximating normal.
+hdi_norm_approx <- function(m, um, mn, umn, p, CN) {
+  # Is robust to NA
+  # Add the relevant pseudocounts as they are in the m_t calculation
+  m <- m + 1
+  um <- um + 1
+  mn <- mn + 1
+  umn <- umn + 1
+
+  # Calculate mt as mean
+  m_b <- m / (m + um)
+  m_n <- mn / (mn + umn)
+  bulk_constant <- ((p * CN) + ((1 - p) * 2)) / (p * CN)
+  norm_constant <- ((1 - p) * 2) / (p * CN)
+  m_t <- (m_b * bulk_constant) - (m_n * norm_constant)
+
+  # Calculate variance
+  var_bulk <- var_func(m, um) * (bulk_constant^2)
+  var_norm <- var_func(mn, umn) * (norm_constant^2)
+  var_t <- var_bulk + var_norm
+  sd_t <- sqrt(var_t)
+
+  # Calculate normal approx for HDI. qnorm is namespaced because it is not
+  # in the package's @importFrom stats list.
+  m_t_low <- stats::qnorm(c(0.005), mean = m_t, sd = sd_t)
+  m_t_high <- stats::qnorm(c(0.995), mean = m_t, sd = sd_t)
+
+  return(data.table(cbind(m_t_low, m_t_high)))
+}
+
+# Calculate HDI of mt by normal approximation
+# Appends m_t_low and m_t_high columns to `meth_c`.
+# NOTE(review): this passes the total copy number `CN`, whereas the
+# deconvolution and the simulated HDI use `CG_CN` - confirm which is intended.
+calculate_m_t_hdi_norm <- function(meth_c) {
+  meth_c_hdi <- hdi_norm_approx(
+    meth_c$M,
+    meth_c$UM,
+    meth_c$M_n,
+    meth_c$UM_n,
+    meth_c$p,
+    meth_c$CN
+  )
+  meth_c <- cbind(meth_c, meth_c_hdi)
+  return(meth_c)
+}
+
+# Refactored from unique_calculate_counts_hdi
+# 99% HDI of Beta(M + 1, UM + 1) for each pair of counts.
+# Returns a data.frame with columns `lo` and `hi`, one row per input pair.
+hdi_qbeta <- function(M, UM) {
+  # Settings
+  shape1 <- M + 1
+  shape2 <- UM + 1
+  credMass <- .99
+  incredMass <- 1.0 - credMass
+  tol <- 1e-4
+
+  # Lower tail probability of the narrowest interval holding `credMass`, for
+  # one pair of shape parameters. All settings are passed in explicitly.
+  get_minima <- function(shape1, shape2, credMass, incredMass, tol) {
+    optInfo <- optimize(
+      f = intervalWidth, interval = c(0, incredMass), ICDFname = qbeta,
+      credMass = credMass, tol = tol, shape1 = shape1, shape2 = shape2
+    )
+    return(optInfo$minimum)
+  }
+  # Vectorize optimisation over the shape parameters only
+  v_get_minima <- Vectorize(get_minima, vectorize.args = c("shape1", "shape2"))
+
+  # Warnings if NA
+  minima <- suppressWarnings(v_get_minima(shape1, shape2, credMass, incredMass, tol))
+  lo <- qbeta(minima, shape1, shape2)
+  hi <- qbeta(minima + credMass, shape1, shape2)
+
+  hdi <- data.frame(lo, hi)
+  return(hdi)
+}
+
+# Variance of a Beta(a, b) distribution.
+# Now, variance is the sum of the individual beta variances
+# And, to scale variance, you multiply by the scaling factor squared.
+var_func <- function(a, b) {
+  (a * b) / (
+    (a + b)^2 * (a + b + 1)
+  )
+}
diff --git a/R/onLoad.R b/R/onLoad.R
new file mode 100644
index 0000000..2c30b18
--- /dev/null
+++ b/R/onLoad.R
@@ -0,0 +1,26 @@
+.onLoad <- function(libname, pkgname){
+  # Silence load warnings for conflicting global methods from imports.R packages
+  options(conflicts.policy = list(
+    error=FALSE,
+    warn=FALSE,
+    generics.ok=TRUE,
+    depends.ok=TRUE,
+    can.mask=c("select")
+  ))
+
+  # Setup CAMDAC package logger
+  library(logging)
+  logging::logReset() # Fixes root logger, but does this mess up other packages?
+  camdac_logger <- logging::getLogger(pkgname)
+  logging::setLevel("INFO", camdac_logger)
+  # Console handler with a "[timestamp] | logger | level | message" layout
+  logging::addHandler(logging::writeToConsole,
+    logger=pkgname,
+    formatter = function(record) {
+      sprintf("[%s] | %s | %s | %s",
+        record$timestamp,
+        record$logger,
+        record$levelname,
+        record$msg)
+    })
+  logging::loginfo(sprintf("Package loaded."), logger="CAMDAC")
+}
\ No newline at end of file
diff --git a/R/panel.R b/R/panel.R
new file mode 100644
index 0000000..7a43542
--- /dev/null
+++ b/R/panel.R
@@ -0,0 +1,318 @@
+#' Make CAMDAC methylation panel from allele counts
+#' Methylation fractions are obtained by summing M and UM reads across samples
+#' @param ac_files Allele count files from CAMDAC
+#' @param ac_props Proportions of each sample to use in panel. If NULL, samples are weighted by their
+#' total number of reads, which equals the sum of M and UM counts. If samples are NA, then
+#' proportions are redistributed.
+#' @param min_coverage Minimum coverage for a sample's site to be included in panel
+#' @param min_samples Minimum number of samples with coverage for a site to be included in panel
+#' @param max_sd Maximum standard deviation of methylation for a site to be included in panel
+#' @param drop_snps Boolean. If TRUE, drop per-sample sites with a called SNP position and
+#' a BAF between 0.1 and 0.9 (inclusive) from panel
+#' @param cores Number of cores to use for calculating HDI
+#' @return data.table with columns chrom, start, end, M, UM, m, cov and the
+#' methylation HDI columns m_x_low and m_x_high.
+#' @export
+panel_meth_from_counts <- function(ac_files, ac_props = NULL, min_coverage = 3, min_samples = 1,
+                                   max_sd = 1, drop_snps = FALSE, cores = 5) {
+  # Load AC files as list, ordering each sample by the same CpG positions
+  # Adds a PASS field for us to track and set sites to NA based on filters
+  # (loading uses load_panel_ac_files' own default core count; `cores` is
+  # only used for the HDI step below)
+  acl <- load_panel_ac_files(ac_files)
+
+  # Apply per-sample CpG constraints
+  acl <- apply_coverage_filter(acl, min_coverage)
+  acl <- apply_snp_filter(acl, drop_snps)
+
+  # Apply panel CpG constraints
+  mask_1 <- min_sample_cg_threshold(acl, min_samples)
+  mask_2 <- max_sd_threshold(acl, max_sd)
+  panel_mask <- mask_1 & mask_2
+  if (sum(panel_mask) == 0) {
+    stop("No sites meet panel constraints. Try lowering min_samples or increasing max_sd.")
+  }
+
+  # Filter CG sites by panel mask
+  acl <- lapply(acl, function(e) e[panel_mask, ])
+
+  # Combine counts to create methylation panel
+  panel <- panel_meth_counts(acl, ac_props)
+
+  # Add methylation HDI to panel
+  hdi <- calculate_counts_hdi(panel$M, panel$UM, n_cores = cores)
+  panel <- cbind(panel, hdi)
+
+  # Return panel object
+  return(panel)
+}
+
+#' Load allele count files
+#' @param ac_files Allele count files from CAMDAC
+#' @param cores Number of cores used to read the files in parallel
+#' @return List of data tables for each allele counts file
+load_panel_ac_files <- function(ac_files, cores=5) {
+  # Set fields to draw from AC files
+  ac_load_fields <- c(
+    "chrom", "start", "end", "POS", "ref", "alt",
+    "total_depth", "M", "UM", "m", "total_counts_m", "BAF"
+  )
+  # Load ac files as list of data tables, each keyed by position
+  data <- parallel::mclapply(ac_files, function(x) {
+    v <- data.table::fread(x, select = ac_load_fields)
+    setkey(v, chrom, start, end)
+    return(v)
+  }, mc.cores = cores)
+  # Find unique cpg positions in all samples so we can create
+  #  a shared mapping for the dataset
+  uac <- unique_cpg_pos(data)
+
+  # Set all data tables to have the same cpg 
positions
+  cix <- lapply(data, get_overlap_ix, x = uac)
+  dix <- lapply(
+    seq_along(data),
+    function(i) {
+      aligned <- data[[i]][cix[[i]], ]
+      # Sites absent from this sample come back as all-NA rows. Restore
+      # their coordinates from the shared map so that every table carries
+      # the same positions, whichever sample is read for coordinates later.
+      aligned[, `:=`(chrom = uac$chrom, start = uac$start, end = uac$end)]
+      aligned
+    }
+  )
+
+  # Add PASS field. Sites without methylation data start as failing.
+  dix <- lapply(dix, function(e) {
+    e$PASS <- TRUE
+    e[is.na(total_counts_m) | is.na(m), PASS := FALSE]
+    return(e)
+  })
+
+  # Add filename (no path prefixes) to list
+  names(dix) <- fs::path_file(ac_files)
+
+  return(dix)
+}
+
+# Unique (chrom, start, end) positions found across a list of tables.
+unique_cpg_pos <- function(cg_list) {
+  x <- lapply(cg_list, function(e) e[, c("chrom", "start", "end")])
+  # Bind once rather than pairwise, which copies the growing table each time
+  x <- data.table::rbindlist(x)
+  x <- unique(x)
+  return(x)
+}
+
+get_overlap_ix <- function(x, y) {
+  # For each row of x, returns the index of the first row of y with exactly
+  # the same range (NA where y has no such row). y must be keyed.
+  foverlaps(x, y, by.x = c("chrom", "start", "end"), which = TRUE, mult = "first", type = "equal")
+}
+
+# Mark passing sites with coverage below `min_coverage` as failing, in each sample.
+apply_coverage_filter <- function(e, min_coverage) {
+  lapply(
+    e,
+    function(o) o[PASS == TRUE & total_counts_m < min_coverage, PASS := FALSE]
+  )
+}
+
+# Optionally mark sites with a called SNP (non-NA POS and BAF within
+# [0.1, 0.9]) as failing, in each sample.
+apply_snp_filter <- function(acl, drop_snps) {
+  set_snps_na <- function(x) {
+    x[
+      PASS == TRUE &
+        (!is.na(POS) & !is.na(BAF) & dplyr::between(BAF, 0.1, 0.9)),
+      PASS := FALSE
+    ]
+  }
+  # If we are to drop SNPS, set all PASS to False where SNPs are called
+  if (drop_snps) {
+    res <- lapply(acl, set_snps_na)
+  } else {
+    res <- acl
+  }
+  return(res)
+}
+
+# Logical mask of sites passing in at least `min_samples` samples.
+min_sample_cg_threshold <- function(x, min_samples) {
+  rs <- Reduce(
+    cbind,
+    lapply(x, function(o) o$PASS)
+  ) %>% as.matrix() %>% rowSums()
+  return(rs >= min_samples)
+}
+
+# Logical mask of sites whose across-sample SD of methylation is at most `max_sd`.
+max_sd_threshold <- function(x, max_sd) {
+  # Can't calculate SD with one sample. 
Return TRUE if so
+  if (length(x) == 1) {
+    return(rep(TRUE, nrow(x[[1]])))
+  }
+  rs <- Reduce(
+    cbind,
+    lapply(x, function(o) o$m)
+  ) %>% as.matrix() %>% matrixStats::rowSds(na.rm = TRUE)
+  bool <- !is.na(rs) & rs <= max_sd
+  return(bool)
+}
+
+# Combine per-sample counts into a single methylation panel.
+# Returns a data.table with columns chrom, start, end, M, UM, m and cov.
+panel_meth_counts <- function(x, ac_props = NULL) {
+  # x is a list of data tables with the same cpg positions and fields from CAMDAC ac files
+  # ac_props, if given, holds one weight per sample
+
+  # Set non-passing counts to NA
+  x <- lapply(x, function(o) {
+    o[PASS == FALSE, `:=`(
+      M = NA, UM = NA, m = NA, total_counts_m = NA
+    )]
+    return(o)
+  })
+
+  # One column per sample for a given field. as.matrix() keeps the result
+  # two-dimensional when there is only one sample.
+  field_matrix <- function(field) {
+    as.matrix(Reduce(cbind, lapply(x, function(o) o[[field]])))
+  }
+
+  # If we have been given count proportions, use them to weight methylation rates
+  if (!is.null(ac_props)) {
+    # Get methylation rates as matrix
+    m <- field_matrix("m")
+
+    # Recalculate, weighting by proportions of present data
+    pmat <- matrix(rep(ac_props, nrow(m)), byrow = TRUE, ncol = length(ac_props))
+
+    # Adjust proportions where beta is NA
+    pmat[is.na(m)] <- 0
+    pmat <- pmat / rowSums(pmat)
+    m[is.na(m)] <- 0 # Allows us to multiply safely with pmat
+
+    # Get new beta based on linear combination of new proportions
+    m <- as.numeric(rowSums(m * pmat))
+
+    # Get complete counts
+    total_counts_m <- rowSums(field_matrix("total_counts_m"), na.rm = TRUE)
+
+    M <- round(m * total_counts_m, 0)
+    UM <- total_counts_m - M
+  } else {
+    # Otherwise, sum the counts
+    M <- rowSums(field_matrix("M"), na.rm = TRUE)
+    UM <- rowSums(field_matrix("UM"), na.rm = TRUE)
+    m <- M / (M + UM)
+    total_counts_m <- M + UM
+  }
+
+  # Return panel data
+  # We can get chrom start and end from one sample as all CpGs aligned before passing to this function
+  res <- data.table(
+    chrom = x[[1]]$chrom,
+    start = x[[1]]$start,
+    end = x[[1]]$end,
+    M = M,
+    UM = UM,
+    m = m,
+    cov = total_counts_m
+  )
+
+  return(res)
+}
+
+#' Make CAMDAC methylation panel from a matrix of beta values
+#' @param mat Matrix of beta values. Rows are CpGs, columns are samples
+#' @param chrom Vector of chromosome names
+#' @param start Vector of CpG start positions
+#' @param end Vector of CpG end positions
+#' @param cov Coverage to give each CpG site: a single value, a vector with one value per
+#' site, or a matrix with one row per site, in which case the row mean is used.
+#' @param props Vector of sample proportions, one per column of `mat`
+#' @param cores Number of cores to use for calculating HDI
+#' @param min_samples Minimum number of samples that must have a non-NA value for a CpG site to be included in panel
+#' @param max_sd Maximum standard deviation of methylation for a site to be included in panel.
+#' @return data.table with columns chrom, start, end, M, UM, m, cov, m_x_low and m_x_high
+#' @export
+panel_meth_from_beta <- function(mat, chrom, start, end, cov, props, cores, min_samples = 1, max_sd = 1) {
+  # Format chromosome as expected
+  chrom <- gsub("chr", "", chrom)
+
+  # Get expected formats
+  stopifnot(length(props) == ncol(mat))
+  mat <- as.matrix(mat)
+
+  # Build both site filters on the full matrix so they share one index, then
+  # keep sites that satisfy both. Sites whose SD is undefined are kept.
+  mask_min_samples <- rowSums(!is.na(mat)) >= min_samples
+  site_sd <- matrixStats::rowSds(mat, na.rm = TRUE)
+  mask_max_sd <- is.na(site_sd) | site_sd <= max_sd
+  keep <- mask_min_samples & mask_max_sd
+
+  # Apply the same filter to the betas and to every per-site input
+  mat <- mat[keep, , drop = FALSE]
+  chrom <- chrom[keep]
+  start <- start[keep]
+  end <- end[keep]
+
+  # Apply filter to coverage depending on whether it is a single value, vector or matrix
+  if (is.null(dim(cov))) {
+    # A single value applies to every site and needs no filtering
+    if (length(cov) != 1) {
+      cov <- cov[keep]
+    }
+  } else {
+    cov <- rowMeans(cov[keep, , drop = FALSE], na.rm = TRUE)
+  }
+
+  # Set proportions as matrix
+  pmat <- matrix(rep(props, nrow(mat)), byrow = TRUE, ncol = length(props))
+
+  # Adjust proportions where beta is NA
+  pmat[is.na(mat)] <- 0
+  pmat <- pmat / rowSums(pmat)
+  mat[is.na(mat)] <- 0 # Allows us to multiply safely with pmat
+
+  # Get new beta based on linear combination of new proportions
+  nbeta <- as.numeric(rowSums(mat * pmat))
+  # Get new counts based on coverage
+  M <- round(cov * nbeta, 0)
+  UM <- cov - M
+
+  # Set panel
+  panel <- data.table(
+    chrom = chrom,
+    start = start,
+    end = end,
+    M = M,
+    UM = UM,
+    m = nbeta,
+    cov = cov
+  )
+  # Add methylation HDI to panel
+  hdi <- calculate_counts_hdi(panel$M, panel$UM, n_cores = cores)
+  panel <- cbind(panel, hdi)
+
+  return(panel)
+}
+
+#' Panel ASM from counts
+#' Basic function to create an ASM methylation panel from allele count or ASM meth files
+#' WARNING: In active development.
+#' @param c1 First ASM allele counts file to merge
+#' @param c2 Second ASM allele counts file to merge
+#' @return data.table with one row per site found in either input: summed
+#' allele-specific counts and mean allele-specific methylation rates.
+panel_asm_from_counts <- function(c1, c2) {
+  protocol_make_asm_meth <- function(x) {
+    # Select DNA methylation fields
+    x[
+      width == 2,
+      .(chrom, start, end, alt_total_counts_m, ref_total_counts_m, alt_m, ref_m)
+    ]
+  }
+
+  # Tables with a CHR column are reduced to the ASM methylation fields.
+  # Anything else is used as given - presumably already in ASM meth format
+  # (TODO confirm); previously such input failed on an undefined variable.
+  am1 <- if ("CHR" %in% names(c1)) protocol_make_asm_meth(c1) else c1
+  am2 <- if ("CHR" %in% names(c2)) protocol_make_asm_meth(c2) else c2
+
+  amm <- merge(am1, am2, by = c("chrom", "start", "end"), suffixes = c(".1", ".2"), all = TRUE)
+
+
+  outdt <- amm[, .(chrom, start, end)]
+  outdt$alt_total_counts_m <- rowSums(amm[, .(alt_total_counts_m.1, alt_total_counts_m.2)], na.rm = TRUE)
+  outdt$ref_total_counts_m <- rowSums(amm[, .(ref_total_counts_m.1, ref_total_counts_m.2)], na.rm = TRUE)
+  outdt$alt_m <- rowMeans(amm[, .(alt_m.1, alt_m.2)], na.rm = TRUE)
+  outdt$ref_m <- rowMeans(amm[, .(ref_m.1, ref_m.2)], na.rm = TRUE)
+  return(outdt)
+}
diff --git a/R/pipeline.R b/R/pipeline.R
new file mode 100644
index 0000000..5127bb9
--- /dev/null
+++ b/R/pipeline.R
@@ -0,0 +1,275 @@
+#' CAMDAC analysis pipeline
+#'
+#' @param tumor Tumor `CamSample()` object for deconvolution.
+#' @param germline Patient-matched normal `CamSample()` object. May be NULL if `tumor` has CNA calls already.
+#' @param infiltrates Normal `CamSample()` as a proxy for infiltrating normal methylation.
+#' @param origin Normal `CamSample()` representing cell of origin for tumor-normal differential methylation.
+#' @param config Configuration built with `CamConfig()`. `config$bsseq` selects
+#'   the pipeline to run and must be "wgbs" or "rrbs".
+#' @return The value returned by `pipeline_wgbs()` or `pipeline_rrbs()`.
+#' @export
+pipeline <- function(tumor, germline = NULL, infiltrates = NULL, origin = NULL, config) {
+  bsseq <- config$bsseq
+
+  # A missing, NA or non-scalar bsseq cannot be used as an `if` condition.
+  # Treat it like any other unsupported value so the caller gets the
+  # explanatory error below instead of a base R condition error.
+  bsseq_is_scalar <- length(bsseq) == 1 && !is.na(bsseq)
+
+  if (bsseq_is_scalar && bsseq == "wgbs") {
+    logging::loginfo("WGBS analysis pipline selected.", logger = "CAMDAC")
+    pipeline_wgbs(tumor, germline, infiltrates, origin, config)
+  } else if (bsseq_is_scalar && bsseq == "rrbs") {
+    logging::loginfo("RRBS analysis pipline selected.", logger = "CAMDAC")
+    pipeline_rrbs(tumor, germline, infiltrates, origin, config)
+  } else {
+    stop("Unsupported bsseq type. Please use 'wgbs' or 'rrbs'.")
+  }
+}
+
+#' Run CAMDAC WGBS analysis on a bulk tumor and patient-matched tissue-matched tumor-adjacent normal sample.
+#' @param tumor Tumor `CamSample` object for deconvolution.
+#' @param germline Patient-matched normal `CamSample` object. May be NULL if `tumor` has CNA calls already.
+#' @param infiltrates Normal `CamSample` as a proxy for infiltrating normal methylation.
+#' @param origin Normal `CamSample` representing cell of origin for tumor-normal differential methylation.
+#' @param config Configuration built with `CamConfig()`.
+#' @keywords internal
+pipeline_wgbs <- function(tumor, germline = NULL, infiltrates = NULL, origin = NULL, config) {
+  # Log
+  logging::loginfo("Pipeline start for %s", tumor$patient_id, logger = "CAMDAC")
+
+  # Preprocess CpG, SNP and methylation data for all samples.
+  # Samples passed as NULL are skipped by preprocess_wgbs().
+  preprocess_wgbs(
+    list(tumor, germline, infiltrates, origin),
+    config
+  )
+
+  # Combine tumor-germline SNPs and call CNAs
+  cmain_bind_snps(tumor, germline, config)
+  cmain_call_cna(tumor, config)
+
+  # Run deconvolution
+  cmain_deconvolve_methylation(tumor, infiltrates, config)
+
+  # Call differential methylation
+  cmain_call_dmps(tumor, origin, config)
+  cmain_call_dmrs(tumor, config)
+
+  # Log
+  logging::loginfo("CAMDAC WGBS pipeline complete for %s", tumor$patient_id, logger = "CAMDAC")
+}
+
+#' Preprocess a list of CamSample objects for analysis
+#'
+#' For every non-NULL sample, runs allele counting, SNP preparation and
+#' methylation profile creation, in that order.
+#'
+#' @param sample_list List of CamSample objects. NULL elements are skipped.
+#' @param config CamConfig object.
+#' @export
+#' @keywords internal
+preprocess_wgbs <- function(sample_list, config) {
+  for (s in sample_list) {
+    # Go to next part of loop if its null
+    if (is.null(s)) {
+      next
+    }
+
+    # Count SNP and CpG alleles if a BAM file is provided
+    cmain_count_alleles(s, config)
+
+    # Prepare SNP data for CNA calling if allele counts are present
+    cmain_make_snps(s, config)
+
+    # Format methylation rates for deconvolution
+    cmain_make_methylation_profile(s, config)
+  }
+}
+
+
+#' Call CAMDAC for a tumor and patient-matched normal sample
+#' @param tumor Tumor `CamSample` object for deconvolution.
+#' @param germline Patient-matched normal `CamSample` object. May be NULL if `tumor` has CNA calls already.
+#' @param infiltrates Normal `CamSample` as a proxy for infiltrating normal methylation.
+#' @param origin Normal `CamSample` representing cell of origin for tumor-normal differential methylation.
+#' @param config Configuration built with `CamConfig()`.
+#' @keywords internal
+#' @details The normal samples that were supplied are preprocessed first, then
+#'   the tumor. Allele counting and ASCAT are skipped when their expected output
+#'   file already exists under `config$outdir`.
+pipeline_rrbs <- function(tumor, germline = NULL, infiltrates = NULL, origin = NULL, config) {
+
+  # Preprocess RRBS normal samples
+  for (s in list(germline, infiltrates, origin)) {
+
+    # Go to next part of loop if its null
+    if (is.null(s)) {
+      next
+    }
+
+    logging::loginfo("Preprocessing sample %s:%s", s$patient_id, s$id, logger = "CAMDAC")
+    preprocess_rrbs_normal(
+      patient_id = s$patient_id, sample_id = s$id, bam_file = s$bam,
+      min_tumor = 1, min_normal = config$min_normal_cov, mq = config$min_mapq,
+      sex = s$sex, path = config$outdir,
+      pipeline_files = config$refs, build = config$build,
+      n_cores = config$n_cores, paired_end = is_pe(config), segments_bed = config$regions
+    )
+  }
+
+  # Main : Process RRBS tumour using the design from input files
+
+  # Setup
+  patient_id <- tumor$patient_id
+  sample_id <- tumor$id
+  bam_file <- tumor$bam
+  sex <- tumor$sex
+  path <- config$outdir
+  pipeline_files <- config$refs
+  build <- config$build
+  n_cores <- config$n_cores
+  min_tumor <- config$min_cov
+  min_normal <- config$min_normal_cov
+  mq <- config$min_mapq
+  paired_end <- is_pe(config)
+  segments_bed <- config$regions
+
+  # Define expected ac file
+  ac_file <- file.path(
+    path, patient_id, "Allelecounts", sample_id,
+    paste0(patient_id, ".", sample_id, ".SNPs.CpGs.all.sorted.RData")
+  )
+
+  if (!file.exists(ac_file)) {
+    logging::loginfo("Preprocess tumour data: %s:%s", patient_id, sample_id, logger = "CAMDAC")
+    # Run allele counter for tumor sample.
+    # NOTE(review): 25 is the number of get_allele_counts() work units used
+    # throughout this file; presumably one per reference chunk -- confirm.
+    for (a in seq_len(25)) {
+      get_allele_counts(
+        i = a, patient_id = patient_id, sample_id = sample_id,
+        sex = sex, bam_file = bam_file, mq = mq,
+        path = path, path_to_CAMDAC = pipeline_files,
+        build = build, n_cores = n_cores, test = FALSE, paired_end = paired_end, segments_bed = segments_bed
+      )
+    }
+
+    # Merge allele counts
+    format_output(
+      patient_id, sample_id, sex, is_normal = FALSE, path, pipeline_files, build
+    )
+
+  } else {
+    # The file is already there, so counting is skipped (not run)
+    logging::loginfo("Allele counts file already exists, skipping counts: %s", ac_file, logger = "CAMDAC")
+  }
+
+  # Create SNP files and run ASCAT (tumor)
+  cna_file <- file.path(
+    path, patient_id, "Copy_number", sample_id,
+    paste0(patient_id, ".", sample_id, ".ascat.output.RData")
+  )
+  if (!file.exists(cna_file)) {
+    logging::loginfo("ASCAT.m Tumor", logger = "CAMDAC")
+    run_ASCAT.m(
+      patient_id, sample_id, sex,
+      patient_matched_normal_id = germline$id,
+      path, pipeline_files, build,
+      min_normal, min_tumor,
+      n_cores, reference_panel_coverage = NULL
+    )
+  } else {
+    logging::loginfo("CNA file already exists: %s", cna_file, logger = "CAMDAC")
+  }
+
+  # Process methylation info for copy number profiling and plot summary.
+  logging::loginfo("Running DNA methylation processing for Tumour", logger = "CAMDAC")
+  run_methylation_data_processing(
+    patient_id, sample_id,
+    normal_infiltrates_proxy_id = infiltrates$id,
+    normal_origin_proxy_id = origin$id,
+    path, min_normal, min_tumor, n_cores,
+    reference_panel_normal_infiltrates = NULL,
+    reference_panel_normal_origin = NULL
+  )
+
+  # Get purified methylation rate
+  logging::loginfo("Calculating pure tumour DNA methylation", logger = "CAMDAC")
+  get_pure_tumour_methylation(
+    patient_id = patient_id, sample_id = sample_id, sex = sex,
+    normal_infiltrates_proxy_id = infiltrates$id,
+    path, pipeline_files, build,
+    n_cores, reseg = FALSE
+  )
+
+  # Get DMP and DMR calls
+  logging::loginfo("Get tumour differential methylation.", logger = "CAMDAC")
+  get_differential_methylation(
+    patient_id = patient_id, sample_id = sample_id, sex = sex,
+    normal_origin_proxy_id = origin$id,
+    path, pipeline_files, build,
+    effect_size = 0.2, prob = 0.99,
+    min_DMP_counts_in_DMR = 5, min_consec_DMP_in_DMR = 4,
+    n_cores, reseg = FALSE, bulk = FALSE
+  )
+
+  logging::loginfo("Pipeline complete for %s", tumor$patient_id, logger = "CAMDAC")
+}
+
+#' Preprocess one RRBS normal sample
+#'
+#' Runs allele counting, SNP file creation and methylation processing for a
+#' normal sample. Each step is skipped when its expected output file already
+#' exists under `path`.
+#'
+#' @param patient_id Patient identifier
+#' @param sample_id Normal sample identifier. Also used as the normal ID in every step.
+#' @param bam_file BAM file of the sample
+#' @param min_tumor Passed to `run_methylation_data_processing()`
+#' @param min_normal Passed to `run_ASCAT.m()` and `run_methylation_data_processing()`
+#' @param mq Passed to `get_allele_counts()`
+#' @param sex Patient sex
+#' @param path Output directory
+#' @param pipeline_files Passed to the processing steps as the CAMDAC reference files location
+#' @param build Genome build
+#' @param n_cores Number of cores
+#' @param paired_end Passed to `get_allele_counts()`
+#' @param segments_bed Passed to `get_allele_counts()`
+#' @keywords internal
+preprocess_rrbs_normal <- function(patient_id, sample_id, bam_file, min_tumor,
+                                   min_normal, mq, sex, path, pipeline_files, build, n_cores, paired_end, segments_bed) {
+
+  # For normals, CAMDAC-RRBS expects same ID
+  normal_id <- sample_id
+
+  # Define expected allele counts.
+  ac_file <- file.path(
+    path, patient_id, "Allelecounts", sample_id,
+    paste0(patient_id, ".", sample_id, ".SNPs.CpGs.all.sorted.RData")
+  )
+
+  logging::loginfo("CAMDAC:::preprocess_rrbs_normal: %s:%s", patient_id, sample_id, logger = "CAMDAC")
+  logging::loginfo("Creating allele count files...", logger = "CAMDAC")
+  if (!file.exists(ac_file)) {
+
+    # Run allele counter for normal sample. Arguments are named exactly as in
+    # the tumour call in pipeline_rrbs() so both calls read the same way.
+    for (a in seq_len(25)) {
+      get_allele_counts(
+        i = a, patient_id = patient_id, sample_id = sample_id, sex = sex,
+        bam_file = bam_file, mq = mq,
+        path = path, path_to_CAMDAC = pipeline_files,
+        build = build, n_cores = n_cores, test = FALSE, paired_end = paired_end, segments_bed = segments_bed
+      )
+    }
+
+    # Merge allele counts. This function only handles normals (normal_id is
+    # sample_id by construction), so is_normal is always TRUE.
+    format_output(
+      patient_id, sample_id, sex, is_normal = TRUE, path, pipeline_files, build
+    )
+
+    logging::loginfo("Allele counting finished.", logger = "CAMDAC")
+  } else {
+    logging::loginfo("CAMDAC:::preprocess_rrbs_normal: %s already exists, skipping counts.", ac_file, logger = "CAMDAC")
+  }
+
+  # Create SNP files (normal) or run ASCAT (tumor)
+  logging::loginfo("Creating SNP files...", logger = "CAMDAC")
+  snp_file <- file.path(
+    path, patient_id, "Copy_number", sample_id,
+    paste0(patient_id, ".", sample_id, ".SNPs.RData")
+  )
+  if (!file.exists(snp_file)) {
+    run_ASCAT.m(
+      patient_id = patient_id, sample_id = sample_id, sex = sex,
+      patient_matched_normal_id = normal_id,
+      path = path, path_to_CAMDAC = pipeline_files, build = build,
+      min_normal = min_normal, min_tumour = NULL,
+      n_cores = n_cores, reference_panel_coverage = NULL
+    )
+
+    logging::loginfo("SNP files created.", logger = "CAMDAC")
+  } else {
+    logging::loginfo("CAMDAC:::preprocess_rrbs_normal: %s already exists, skipping SNP prep.", snp_file, logger = "CAMDAC")
+  }
+
+  # Process methylation info for copy number profiling and plot summary.
+  meth_file <- file.path(
+    path, patient_id, "Methylation", sample_id, "dt_normal_m.RData"
+  )
+
+  logging::loginfo("Creating methylation files...", logger = "CAMDAC")
+  if (!file.exists(meth_file)) {
+    run_methylation_data_processing(
+      patient_id, sample_id,
+      normal_infiltrates_proxy_id = normal_id,
+      normal_origin_proxy_id = normal_id,
+      path, min_normal, min_tumor, n_cores,
+      reference_panel_normal_infiltrates = NULL,
+      reference_panel_normal_origin = NULL
+    )
+    logging::loginfo("Mehylation files created.", logger = "CAMDAC")
+  } else {
+    logging::loginfo("CAMDAC:::preprocess_rrbs_normal: %s already exists, skipping methylation prep.", meth_file, logger = "CAMDAC")
+  }
+
+}
diff --git a/R/pipeline_tumor_normal.R b/R/pipeline_tumor_normal.R
deleted file mode 100644
index 49115e4..0000000
--- a/R/pipeline_tumor_normal.R
+++ /dev/null
@@ -1,86 +0,0 @@
-
-#' Call CAMDAC for a tumor and patient-matched normal sample
-#'
-#' @param patient_id character. Patient identifier
-#' @param tumor_id character. Tumor sample identifier
-#' @param normal_id character. Normal sample identifier
-#' @param tumor_bam character. Full path to tumor bam file
-#' @param normal_bam character. Full path to normal bam file
-#' @param sex character. Patient sex: "XX" for female or "XY" for male
-#' @param path character. Full path to CAMDAC output directory
-#' @param pipeline_files character. Full path to parent directory containing CAMDAC pipeline_files
-#' @param build character. Genome build: "hg19" or "hg38"
-#' @param min_tumor integer. Minimum read filter for tumor samples
-#' @param min_normal integer. Minimum read filter for normal samples
-#' @param n_cores integer. Number of cores to use for parallel processing
-#' @param mq integer.
Minimum mapping quality filter -#' @export -pipeline_tumor_normal <- function(patient_id, tumor_id, normal_id, tumor_bam, normal_bam, sex, path, - pipeline_files, build, min_tumor = 3, min_normal = 10, - n_cores = 1, mq = 0, paired_end = FALSE) { - # Preprocess tumor and normal sample - preprocess_sample( - patient_id, normal_id, normal_id, normal_bam, min_tumor, - min_normal, mq, sex, path, pipeline_files, build, n_cores, paired_end - ) - preprocess_sample( - patient_id, tumor_id, normal_id, tumor_bam, min_tumor, - min_normal, mq, sex, path, pipeline_files, build, n_cores, paired_end - ) - - # Get purified methylation rate - get_pure_tumour_methylation( - patient_id = patient_id, sample_id = tumor_id, sex = sex, - normal_infiltrates_proxy_id = normal_id, - path, pipeline_files, build, - n_cores, reseg = FALSE - ) - - # Get DMP and DMR calls - get_differential_methylation( - patient_id = patient_id, sample_id = tumor_id, sex = sex, - normal_origin_proxy_id = normal_id, - path, pipeline_files, build, - effect_size = 0.2, prob = 0.99, - min_DMP_counts_in_DMR = 5, min_consec_DMP_in_DMR = 4, - n_cores, reseg = FALSE, bulk = FALSE - ) -} - - -preprocess_sample <- function(patient_id, sample_id, normal_id, bam_file, min_tumor, - min_normal, mq, sex, path, pipeline_files, build, n_cores, paired_end = FALSE) { - # Run allele counter for normal sample - for (a in 1:25) { - get_allele_counts( - i = a, patient_id = patient_id, sample_id = sample_id, sex = sex, bam_file, mq = mq, - path, pipeline_files, build, n_cores, test = FALSE, paired_end - ) - } - - # Merge allele counts - # Set normal status based on whether sample and normal ID match - is_normal <- ifelse(sample_id == normal_id, TRUE, FALSE) - format_output( - patient_id, sample_id, sex, is_normal, path, pipeline_files, build - ) - - # Create SNP files (normal) or run ASCAT (tumor) - run_ASCAT.m( - patient_id, sample_id, sex, - patient_matched_normal_id = normal_id, - path, pipeline_files, build, - min_normal, 
min_tumor,
-    n_cores, reference_panel_coverage = NULL
-  )
-
-  # Process methylation info for copy number profiling and plot summary.
-  run_methylation_data_processing(
-    patient_id, sample_id,
-    normal_infiltrates_proxy_id = normal_id,
-    normal_origin_proxy_id = normal_id,
-    path, min_normal, min_tumor, n_cores,
-    reference_panel_normal_infiltrates = NULL,
-    reference_panel_normal_origin = NULL
-  )
-}
diff --git a/R/plots.R b/R/plots.R
new file mode 100755
index 0000000..a98ada6
--- /dev/null
+++ b/R/plots.R
@@ -0,0 +1,142 @@
+#' @title ascat.plotSegmentedData.RRBS
+#' @description Plot segmented BAF and LogR. One file named
+#'   `<sample>.ASPCF.png` is written per column of `ASCATobj$Tumor_LogR`, with a
+#'   LogR panel above a BAF panel. Only the SNPs present in the segmented BAF
+#'   of that sample are drawn.
+#' @param ASCATobj an ASCAT object (e.g. data structure from ascat.loadData)
+#' @param lim_logR The LogR panel y axis runs from `-lim_logR` to `lim_logR`
+#'
+#' @return Produces png files showing the logR and BAF values for tumour and germline samples
+#' @author Peter Van Loo
+#' @noRd
+ascat.plotSegmentedData.RRBS <- function(ASCATobj, lim_logR = 2) {
+  # Row index of every SNP in Tumor_LogR, named by SNP id
+  all_ids <- seq_len(dim(ASCATobj$Tumor_LogR)[1])
+  names(all_ids) <- rownames(ASCATobj$Tumor_LogR)
+
+  # Draw a separator after each chromosome and its label at height label_y,
+  # counting only the SNPs listed in het_ids.
+  annotate_chromosomes <- function(het_ids, label_y) {
+    chrk_tot_len <- 0
+    for (j in seq_along(ASCATobj$ch)) {
+      chrk <- intersect(ASCATobj$ch[[j]], het_ids)
+      chrk_tot_len_prev <- chrk_tot_len
+      chrk_tot_len <- chrk_tot_len + length(chrk)
+      tpos <- (chrk_tot_len + chrk_tot_len_prev) / 2
+      text(tpos, label_y, ASCATobj$chrs[j], pos = 1, cex = 2)
+      abline(v = chrk_tot_len + 0.5, lty = 1, col = "lightgrey")
+    }
+  }
+
+  for (arraynr in seq_len(dim(ASCATobj$Tumor_LogR)[2])) {
+    seg_ids <- rownames(ASCATobj$Tumor_BAF_segmented[[arraynr]])
+    het_ids <- all_ids[seg_ids]
+
+    png(
+      filename = paste(ASCATobj$samples[arraynr], ".ASPCF.png", sep = ""),
+      width = 2000, height = 1000, res = 200
+    )
+    # Close the device even when a plotting call fails, so an error does not
+    # leave a png device open.
+    tryCatch(
+      {
+        par(
+          mar = c(0.5, 5, 5, 0.5), mfrow = c(2, 1), cex = 0.4,
+          cex.main = 3, cex.axis = 2
+        )
+        r <- ASCATobj$Tumor_LogR_segmented[seg_ids, arraynr]
+        beta <- ASCATobj$Tumor_BAF_segmented[[arraynr]][, , drop = FALSE]
+
+        # LogR panel: raw values in red, segmented values in blue
+        plot(c(1, length(r)), c(-lim_logR, lim_logR),
+          type = "n", xaxt = "n",
+          main = paste(colnames(ASCATobj$Tumor_BAF)[arraynr], ", LogR", sep = ""),
+          xlab = "", ylab = ""
+        )
+        points(ASCATobj$Tumor_LogR[seg_ids, arraynr], col = "red", pch = 10, cex = 0.20)
+        points(r, col = "blue")
+        abline(v = 0.5, lty = 1, col = "lightgrey")
+        annotate_chromosomes(het_ids, lim_logR - 0.5)
+
+        # BAF panel: raw values in red, segmented values and their mirror in blue
+        plot(c(1, length(beta)), c(0, 1),
+          type = "n", xaxt = "n",
+          main = paste(colnames(ASCATobj$Tumor_BAF)[arraynr], ", BAF", sep = ""),
+          xlab = "", ylab = ""
+        )
+        points(ASCATobj$Tumor_BAF[seg_ids, arraynr], col = "red", pch = 10, cex = 0.20)
+        points(beta, col = "blue")
+        points(1 - beta, col = "blue")
+        abline(v = 0.5, lty = 1, col = "lightgrey")
+        annotate_chromosomes(het_ids, 1)
+      },
+      finally = dev.off()
+    )
+  }
+}
+
+#' @title ascat.plotRawData.flags
+#' @description Plot BAF LogR
+#' @param ASCATobj an ASCAT object (e.g.
data structure from ascat.loadData)
+#' @param pch type of data points in plot
+#' @param cex size of data points in plot
+#' @param lim_logR y-axis limits on logR plot
+#'
+#' @details Plotting is currently disabled. The function returns immediately
+#'   without reading its arguments or opening a graphics device.
+#' @return The number 1. No png files are produced.
+#' @author Peter Van Loo
+#' @keywords internal
+ascat.plotRawData.flags <- function(ASCATobj, pch, cex, lim_logR) {
+  # NOTE(review): this function started with an unconditional `return(1)`, so
+  # the tumour/germline plotting code that followed could never run. That
+  # unreachable code has been removed; behaviour is unchanged. Restore the
+  # plotting body from version control if the plots are wanted again.
+  1
+}
diff --git a/R/run_ASCAT.m.R b/R/run_ASCAT.m.R
index 863243a..25ec440 100755
--- a/R/run_ASCAT.m.R
+++ b/R/run_ASCAT.m.R
@@ -31,7 +31,7 @@
 #' @param reference_panel_coverage Path to the reference panel for the coverage.
#' #' @return Three text files with all the CpG loci and their SNP and/or CpG methylation info - +#' @keywords internal run_ASCAT.m <- function (patient_id,sample_id,sex, patient_matched_normal_id=NULL, path,path_to_CAMDAC,build, @@ -78,7 +78,7 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, orig_dir <- getwd() # Set reference human genome build variables - cat(paste("Data with build ", build, sep = " "), "\n", sep = "") + logging::loginfo(paste("Data with build ", build, sep = " "), logger="CAMDAC") if(build=="GRCH37"){build="hg19"} # set build to to assembly version disregarging UCSC/Ensembl if(build=="GRCH38"){build="hg38"} @@ -135,7 +135,7 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, # Overlap normal and tumour dt_sample_SNPs <- merge(dt_sample_SNPs, dt_normal_SNPs, by = c("chrom","POS","ref", "alt")) rm(dt_normal_SNPs) - cat("Germline SNP info loaded succesfully!\n") + logging::logdebug("Germline SNP info loaded succesfully!", logger="CAMDAC") } ; rm(cols) # Obatin SNP genotype from bulk if there is no patient-matched normal @@ -160,14 +160,14 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, if(sample_id == normal_id){ dt_sample_SNPs <- dt_sample_SNPs[total_counts >= min_normal,] min <- min_normal - cat("Minimum counts treshold in the matched normal",min_normal,"count(s)\n") + logging::loginfo("Minimum counts treshold in the matched normal %s count(s)",min_normal, logger="CAMDAC") } if(sample_id != normal_id){ dt_sample_SNPs <- dt_sample_SNPs[total_counts >= min_tumour,] min <- min_tumour - cat("Minimum counts treshold in tumour sample set to",min, - "count(s) \nMinimum counts treshold in the matched normal",min_normal,"count(s)\n") + logging::loginfo(paste0("Minimum counts treshold in tumour sample set to ",min, + "count(s).Minimum counts treshold in the matched normal",min_normal,"count(s)."), logger="CAMDAC") } # Add SNP loci ids @@ -199,11 +199,11 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, if(sample_id!=normal_id){ # Remove 
low coverage singletons - cat("Removing low coverage singletons\n") + logging::logdebug("Removing low coverage singletons.", logger="CAMDAC") n <- nrow(dt_sample_SNPs) dt_sample_SNPs <- remove_low_cov_singletons(dt_sample_SNPs=dt_sample_SNPs,min=min) - cat(paste0("Low coverage singletons removed (", - round2((1-(nrow(dt_sample_SNPs)/n))*100, digits=2),"% of SNPs).\n")) + logging::loginfo(paste0("Low coverage singletons removed (", + round2((1-(nrow(dt_sample_SNPs)/n))*100, digits=2),"% of SNPs).\n"), logger="CAMDAC") rm(n) # Set reference file names for LogR bias correction @@ -234,7 +234,7 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, dt_sample_SNPs <- dt_sample_SNPs[dt_stats, nomatch=0] rm(dt_stats) - cat("LogR correction completed\n") + logging::loginfo("LogR correction completed.", logger="CAMDAC") } else { # format normal seqnames in normal y <- substr(as.character(dt_sample_SNPs$chrom[1]),1,3) @@ -300,11 +300,11 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, ascat.m.plotRawData(ascat.bc, raw_LogR=dt_sample_SNPs$LogR_t, pch = 10, cex = 0.2, lim_logR = 2.5) save(ascat.bc, file = paste(patient_id, sample_id, "ascat.bc.RData", sep = ".")) - cat("ASCAT object created\n") + logging::logdebug("ASCAT object created.", logger="CAMDAC") # Carry out segmentation gg = list(germlinegenotypes=ascat.bc$genotypes) - ascat.frag <- ascat.aspcf(ascat.bc, ascat.gg=gg, penalty=200) + ascat.frag <- ASCAT::ascat.aspcf(ascat.bc, ascat.gg=gg, penalty=200) # penalty = 200 recommended for sequencing data # fix issue with ascat.ascpcf renaming samples @@ -313,17 +313,17 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, ascat.m.plotSegmentedData(ascat.frag, lim_logR = 2.5) save(ascat.frag, file = paste(patient_id, sample_id, "ascat.frag.RData", sep = ".")) - cat("\nASCAT copy number segmentation completed\n") + logging::loginfo("ASCAT copy number segmentation completed.", logger="CAMDAC") # Run copy number caller a first time to get the distance matrix - ascat.output 
<- ascat.runAscat(ascat.frag, gamma = 1) + ascat.output <- ASCAT::ascat.runAscat(ascat.frag, gamma = 1) save(ascat.output, file = paste(patient_id, sample_id,"ascat.output.RData", sep = ".")) num_het_SNPs = nrow(ascat.frag$Tumor_LogR_segmented) num_hom_SNPs = nrow(ascat.frag$Tumor_BAF_segmented[[1]]) rm(ascat.frag) if(file.exists(paste(patient_id, sample_id,"ASCATprofile.png", sep = "."))){ - cat("\nASCAT completed\n") + logging::loginfo("ASCAT completed.", logger="CAMDAC") # Save purity and ploidy f.nm <- paste(patient_id, ".", sample_id,".ACF.and.ploidy.txt", sep = "") @@ -337,12 +337,13 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, median_depth = median(dt_sample_SNPs$total_depth, na.rm=TRUE), median_n_depth = median(dt_sample_SNPs$total_depth_n, na.rm=TRUE)) rm(ascat.output, num_het_SNPs, num_hom_SNPs) - cat(format_delim(dt, delim = "\t", col_names = T), file = f) + cat(readr::format_delim(dt, delim = "\t", col_names = T), file = f) close(f); rm(dt,f,f.nm) - cat(paste("\nPloidy, Purity and summary stats saved in ", - path_output,patient_id,".",sample_id,".ACF.and.ploidy.txt","\n",sep = "")) + logging::loginfo(paste("\nPloidy, Purity and summary stats saved in ", + path_output,patient_id,".",sample_id,".ACF.and.ploidy.txt","\n",sep = ""), logger="CAMDAC") } else { - cat("\nASCAT could not find a solution\n") + logging::logerror("ASCAT could not find a solution for this sample.") + stop() } # convert to data.table @@ -368,7 +369,7 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, # run plot function outfile = paste(patient_id, sample_id,"SNP_data.pdf", sep = "_") plot_SNP_info(dt=dt,outfile=outfile,min=min) - cat("BAF and LogR diagnostics plots generated\n") + logging::logdebug("BAF and LogR diagnostics plots generated.", logger="CAMDAC") } if(is.null(reference_panel_coverage)&sample_id==normal_id){ @@ -393,7 +394,7 @@ run_ASCAT.m <- function (patient_id,sample_id,sex, # run plot function plot_normal_SNP_info(dt=dt,outfile=outfile,min=min) - 
cat("Normal BAF plots generated\n") + logging::logdebug("Normal BAF plots generated.", logger="CAMDAC") } setwd(orig_dir) @@ -468,6 +469,7 @@ split_genome_RRBS = function(SNPpos) { #' @title remove_low_cov_singletons #' @description Remove low coverage singletons outliers #' @author Elizabeth larose cadieux +#' @keywords internal remove_low_cov_singletons = function(dt_sample_SNPs,min){ # subselect relevant columns @@ -512,7 +514,7 @@ remove_low_cov_singletons = function(dt_sample_SNPs,min){ #' @param fragments_file CAMDAC reference MspI fragments file #' @param replic_timing_file_prefix CAMDAC reference replication timing files path and file name prefix #' @param n_cores Numerical value correspdonding to the number of cores for parallel processing - +#' @keywords internal LogR_correction = function(dt_sample,dt_SNPs,build,chr_names,min_normal, fragments_file,replic_timing_file_prefix,n_cores){ @@ -637,7 +639,7 @@ LogR_correction = function(dt_sample,dt_SNPs,build,chr_names,min_normal, chrom_idx = 1:23 if(build=="hg19"){replic_files = paste0(replic_timing_file_prefix, chrom_idx, ".fst")} if(build=="hg38"){replic_files = paste0(replic_timing_file_prefix, chrom_idx,"_",build,".fst")} - replic_data = data.table(do.call(rbind, mclapply(replic_files, fst::read_fst, mc.cores = n_cores))) + replic_data = data.table(do.call(rbind, parallel::mclapply(replic_files, fst::read_fst, mc.cores = n_cores))) # format replication timing data cols <- colnames(replic_data) @@ -680,8 +682,8 @@ LogR_correction = function(dt_sample,dt_SNPs,build,chr_names,min_normal, corr_rep = abs(cor(fragments_replic_data[, .SD, .SDcols=2:ncol(fragments_replic_data)], dt_SNPs$LogR_t, use="complete.obs")[,1]) maxreplic = which.max(corr_rep)+1 ;rm(corr_rep) - cat(paste0("Replication timimg correction based on ", colnames(fragments_replic_data)[maxreplic], - " ENCODE cell line Repli-Seq data.")) + logging::loginfo(paste0("Replication timing correction based on ", 
colnames(fragments_replic_data)[maxreplic], + " ENCODE cell line Repli-Seq data."), logger="CAMDAC") # annotate each SNP with fragment replication timing info dt_SNPs$replic <- fragments_replic_data[, .SD, .SDcols=maxreplic] @@ -725,6 +727,7 @@ LogR_correction = function(dt_sample,dt_SNPs,build,chr_names,min_normal, #' @return Produces png files showing the logR and BAF values for tumour and germline samples #' @author Peter Van Loo #' @export +#' @keywords internal ascat.m.plotSegmentedData <- function (ASCATobj, lim_logR=2) { @@ -787,7 +790,7 @@ ascat.m.plotSegmentedData <- function (ASCATobj, lim_logR=2) #' #' @return Produces png files showing the logR and BAF values for tumour and germline samples #' @author Peter Van Loo -#' @export +#' @keywords internal ascat.m.plotRawData = function(ASCATobj, raw_LogR, pch, cex, lim_logR) { @@ -893,6 +896,7 @@ ascat.m.plotRawData = function(ASCATobj, raw_LogR, pch, cex, lim_logR) { #' Saves a pdf w/ methylation rate distribution, biases at polymorphic and #' non-polymorphic CG/CCGG and coverage distribution #' @author Elizabeth Larose Cadieux +#' @keywords internal plot_BAF_and_LogR <- function (dt, outfile, downsample=1E5) { # Only plot heterozygous SNPs @@ -930,12 +934,12 @@ plot_BAF_and_LogR <- function (dt, outfile, downsample=1E5) { scale_x_continuous("SNP loci", minor_breaks = lines_pos$BAFloci, breaks = lines_pos$BAFloci, labels = NULL) + ggtitle("BAF") + theme(legend.title = element_text(hjust = 0.5)) + guides(color = guide_legend(override.aes = list(size=10))) - d_BAF_n <-ggplot(dt_sample, aes(x=BAF_n,y=..count..,color = flag, fill=flag)) + geom_density(alpha=0.25) + + d_BAF_n <-ggplot(dt_sample, aes(x=BAF_n,y=ggplot2::after_stat(count),color = flag, fill=flag)) + geom_density(alpha=0.25) + scale_color_manual(name = "SNP\nflag", values=c("CCGG"="red","CG"="orange3","neither"="cornflowerblue")) + scale_fill_manual(name = "SNP\nflag", values=c("CCGG"="red","CG"="orange3","neither"="cornflowerblue")) + 
theme_classic() - h_BAF_n <-ggplot(dt_sample, aes(x=BAF_n,y=..count..,color = flag, fill=flag)) + geom_histogram(bins=100) + + h_BAF_n <-ggplot(dt_sample, aes(x=BAF_n,y=ggplot2::after_stat(count),color = flag, fill=flag)) + geom_histogram(bins=100) + scale_color_manual(name = "SNP\nflag", values=c("CCGG"="red","CG"="orange3","neither"="cornflowerblue")) + scale_fill_manual(name = "SNP\nflag", values=c("CCGG"="red","CG"="orange3","neither"="cornflowerblue")) + theme_classic() @@ -957,6 +961,7 @@ plot_BAF_and_LogR <- function (dt, outfile, downsample=1E5) { #' #' @return pdf #' @author Elizabeth Larose Cadieux +#' @keywords internal plot_SNP_info <- function (dt, outfile, min) { # Total INFORMATIVE counts at SNPs @@ -978,7 +983,7 @@ plot_SNP_info <- function (dt, outfile, min) { axis.text.x = element_blank(), axis.ticks.x = element_blank()) # Compare logR and logR corrected - #p3 <- ggplot(dt, aes(y=..count..))+ + #p3 <- ggplot(dt, aes(y=ggplot2::after_stat(count)))+ # ggtitle("C.")+ylab("Number of SNPs")+xlab("LogR")+ # theme_classic()+coord_cartesian(xlim=c(-2.5, 2.5)) + # geom_histogram(aes(x=LogR_t, color="raw", fill="raw"), @@ -1081,11 +1086,12 @@ plot_SNP_info <- function (dt, outfile, min) { #' #' @return pdf #' @author Elizabeth Larose Cadieux +#' @keywords internal plot_normal_SNP_info <- function (dt, outfile, min) { tmp <- dt[BAF>=0.15 & BAF <= 0.85 & !is.na(BAF),] tmp2 <- table(cut(tmp$BAF, breaks = (0.85-0.15)/0.01)) tmp2 <- unname(tmp2[which.max(tmp2)])*9/10 - p4 <- ggplot(data = tmp, aes(x=BAF,y=..count..,color = type,fill = type)) + + p4 <- ggplot(data = tmp, aes(x=BAF,y=ggplot2::after_stat(count),color = type,fill = type)) + geom_histogram(binwidth = 0.01, alpha = 0.25) + theme_minimal() + scale_color_manual(name = "", values = c("Homozygous" = "orange3", "Heterozygous" = "mediumpurple")) + scale_fill_manual(name = "", values = c("Homozygous" = "orange3", "Heterozygous" = "mediumpurple")) + diff --git a/R/run_methylation_data_processing.R 
b/R/run_methylation_data_processing.R index 0be897f..69b6312 100755 --- a/R/run_methylation_data_processing.R +++ b/R/run_methylation_data_processing.R @@ -44,7 +44,7 @@ #' cov_n is the total CpG methylation informative reads counts (M_n+UM_n) #' #' @return GRanges object in .RData file - +#' @keywords internal run_methylation_data_processing <- function (patient_id,sample_id, normal_infiltrates_proxy_id, normal_origin_proxy_id, @@ -194,11 +194,11 @@ if(!sample_id%in%normal_ids){ n<- nrow(dt_normal_m) M=dt_normal_m$M_n;UM=dt_normal_m$UM_n vec <- cbind.data.frame(low=numeric(length=n), high=numeric(length=n)) - vec[,1:2]<- do.call(rbind, mclapply(1:n, function(i,M,UM) + vec[,1:2]<- do.call(rbind, parallel::mclapply(1:n, function(i,M,UM) HDIofICDF(ICDFname=qbeta, credMass=.99, shape1=M[i]+1, shape2=UM[i]+1), mc.cores=n_cores, M=M, UM=UM)) # Checkpoint - cat("Reference profile methylation rates 99% highest density intervals annotated\n") + logging::logdebug("Reference profile methylation rates 99% highest density intervals annotated.", logger="CAMDAC") # Add HDI to data.table dt_normal_m[, as.character(c("m_n_low", "m_n_high")) := as.list(vec)] @@ -276,6 +276,7 @@ if(!sample_id%in%normal_ids){ #' @param trim Logical value establishing whether regions with extremely high coverage be trimmed or not #' #' @return A GRanges object with all the CpG loci, their coverage, counts methylated and methylation rate +#' @keywords internal format_methylation_df <- function (dt,sample_id,normal_ids,path_output,n_cores,suffix,trim=FALSE) { # Get total cov (UM includes hetorozygous SNP non-CpG allele counts) @@ -323,7 +324,7 @@ format_methylation_df <- function (dt,sample_id,normal_ids,path_output,n_cores,s n<- nrow(dt) M=dt$M;UM=dt$UM vec <- cbind.data.frame(low=numeric(length=n), high=numeric(length=n)) - vec[,1:2]<- do.call(rbind, mclapply(1:n, function(i,M,UM) + vec[,1:2]<- do.call(rbind, parallel::mclapply(1:n, function(i,M,UM) HDIofICDF(ICDFname=qbeta, credMass=.99, 
shape1=M[i]+1, shape2=UM[i]+1), mc.cores=n_cores, M=M, UM=UM)) @@ -338,7 +339,7 @@ format_methylation_df <- function (dt,sample_id,normal_ids,path_output,n_cores,s if(sample_id%in%normal_ids){ # Checkpoint - cat("Normal methylation rates 99% highest density intervals annotated\n") + logging::logdebug("Normal methylation rates 99% highest density intervals annotated.", logger="CAMDAC") # if required remove trim high coverage sites (probs = poor alignment) if(trim == TRUE){ @@ -354,7 +355,7 @@ format_methylation_df <- function (dt,sample_id,normal_ids,path_output,n_cores,s if(!sample_id%in%normal_ids){ # Checkpoint - cat("Bulk methylation rates 99% highest density intervals annotated\n") + logging::logdebug("Bulk methylation rates 99% highest density intervals annotated.", logger="CAMDAC") ## save dt #dt_tumour_m <- dt @@ -365,7 +366,8 @@ format_methylation_df <- function (dt,sample_id,normal_ids,path_output,n_cores,s } # Arguments: -#' @param ICDFname is R's name for the inverse cumulative density function +#' Calculate intervalWidth_r +#' @param ICDFname is R's name for the inverse cumulative density function #' of the distribution. #' @param credMass is the desired mass of the HDI region. #' @param tol is passed to R's optimize function, @@ -378,9 +380,9 @@ format_methylation_df <- function (dt,sample_id,normal_ids,path_output,n_cores,s #' Notice that the parameters of the ICDFname must be explicitly named; #' e.g., HDIofICDF( qbeta , 30+1 , 12+1 ) does not work. #' Adapted and corrected from Greg Snow's TeachingDemos package. - -# Source fct outside of loop to speed up code -intervalWidth = function(lowTailPr,ICDFname,credMass, ... ) { +#' Source fct outside of loop to speed up code +#' @keywords internal +intervalWidth_r = function(lowTailPr,ICDFname,credMass, ... ) { ICDFname(credMass+lowTailPr, ... ) - ICDFname(lowTailPr, ... ) } @@ -388,7 +390,7 @@ HDIofICDF = function(ICDFname, credMass=0.99 , tol=1e-4, ... 
) { incredMass = 1.0 - credMass - optInfo = optimize(f = intervalWidth, interval = c(0,incredMass) , ICDFname=ICDFname , credMass=credMass , tol=tol , ... ) + optInfo = optimize(f = intervalWidth_r, interval = c(0,incredMass) , ICDFname=ICDFname , credMass=credMass , tol=tol , ... ) HDIlowTailPr = optInfo$minimum vec <- setNames(object = ICDFname(c(HDIlowTailPr, credMass+HDIlowTailPr), ... ), nm = c("low", "high")) @@ -403,10 +405,11 @@ HDIofICDF = function(ICDFname, credMass=0.99 , tol=1e-4, ... ) { #' @param outfile character srting with output pdf filename #' #' @return pdf w/ methylation rate distribution, biases at polymorphic and non-polymorphic CG/CCGG and coverage distribution +#' @keywords internal plot_methylation_info <- function (df_sample, outfile) { alph <- ifelse(df_sample$class %in% c("SNP CpG", "SNP CCGG"), "SNP", "non-SNP") - p1 <- ggplot(data=df_sample, aes(x=m, y=..density.., color=class, fill = class, alpha=alph)) + + p1 <- ggplot(data=df_sample, aes(x=m, y=after_stat(density), color=class, fill = class, alpha=alph)) + ylab("Normalised density") + theme_classic() + geom_density(bw= 0.025) + scale_x_continuous(name="CpG methylation rate", breaks=seq(0,1,.1)) + @@ -417,7 +420,7 @@ plot_methylation_info <- function (df_sample, outfile) { scale_alpha_manual(name = "", values = c("SNP"=0.1,"non-SNP"=0.5)) + theme(legend.position="none") + ggtitle("A.") #+ - p2 <- ggplot(data=df_sample, aes(x=m, y=..count..,color=class, fill = class)) + + p2 <- ggplot(data=df_sample, aes(x=m, y=ggplot2::after_stat(count),color=class, fill = class)) + theme_classic() + geom_histogram(binwidth=0.025,alpha = 0.25) + scale_x_continuous(name="CpG methylation rate", breaks=seq(0,1,.1)) + @@ -443,16 +446,15 @@ plot_methylation_info <- function (df_sample, outfile) { #' @param dt Data.table that the grob will be made out of #' @param title_v Title for display #' @param fontsize_v Fontsize for title. 
Default is 14 (goes well with my_theme) - #' @value gtable object - #' @export + #' ## Table - table_grob <- tableGrob(dt, rows = rep('', nrow(dt)), theme = ttheme_minimal(base_size=8,vjust=0, hjust=0)) + table_grob <- gridExtra::tableGrob(dt, rows = rep('', nrow(dt)), theme = ttheme_minimal(base_size=8,vjust=0, hjust=0)) ## Title - title_grob <- textGrob(title_v, gp = gpar(fontsize = fontsize_v),x=0,hjust=0) + title_grob <- grid::textGrob(title_v, gp = grid::gpar(fontsize = fontsize_v),x=0,hjust=0) ## Add title - table_grob <- gtable_add_rows(table_grob, heights = grobHeight(title_grob) + unit(5,'mm'), pos = 0) - table_grob <- gtable_add_grob(table_grob, title_grob, 1, 1, 1, ncol(table_grob), clip = "off") + table_grob <- gtable::gtable_add_rows(table_grob, heights = grid::grobHeight(title_grob) + unit(5,'mm'), pos = 0) + table_grob <- gtable::gtable_add_grob(table_grob, title_grob, 1, 1, 1, ncol(table_grob), clip = "off") } df_sample_tmp <- data.table(df_sample) @@ -491,7 +493,7 @@ plot_methylation_info <- function (df_sample, outfile) { # "non-SNP CpG"="lightsalmon", "non-SNP CCGG"="lightblue")) + # theme(axis.ticks.x = element_blank(),axis.text.x=element_text(size=8)) + # ggtitle("E.") #+ theme(legend.position="none") - p5 <- ggplot(data=df_sample, aes(x=total_depth, y=..count..)) + + p5 <- ggplot(data=df_sample, aes(x=total_depth, y=ggplot2::after_stat(count))) + theme_classic() + geom_histogram(bins=50,alpha = 0.25, col="grey15") + scale_x_continuous(name= "CpG coverage", diff --git a/README.Rmd b/README.Rmd new file mode 100755 index 0000000..73ec204 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,98 @@ +--- +output: + github_document: + html_preview: false +--- + + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/README-", + out.width = "100%", + eval = FALSE +) +``` + +# CAMDAC + +Copy-number Aware Methylation Deconvolution Analysis of Cancer (CAMDAC) is an R library for deconvolving bulk tumor DNA 
methylation (bisulfite) sequencing data ([Larose Cadieux et al., 2022, bioRxiv](https://www.biorxiv.org/content/10.1101/2020.11.03.366252v2)). + + + + +## Documentation + +Visit [https://vanloo-lab.github.io/CAMDAC/](https://vanloo-lab.github.io/CAMDAC/). + +## Installation : Dockerhub + +A CAMDAC container is available on [dockerhub](https://hub.docker.com/r/nmensah5/camdac) for use with Docker, Singularity or Apptainer: + +```{bash} +docker pull nmensah5/camdac:latest +echo "library(CAMDAC)" > commands.R +docker run -v $(pwd):/app nmensah5/camdac:latest Rscript commands.R +``` + +## Installation : Github + +You can install CAMDAC and its dependencies from an R console: + +```{r} +install.packages("remotes") +remotes::install_github("VanLoo-lab/CAMDAC") +``` + +## Quickstart + +We provide pre-built reference datasets for hg38 and hg19. These files are required to run CAMDAC for either RRBS or WGBS analysis [from the Zenodo repository: (10565423)](https://zenodo.org/records/10565423). An R getter function is provided for convenience: + +```{r} +CAMDAC::download_pipeline_files(bsseq = "rrbs", directory = "./refs") +CAMDAC::download_pipeline_files(bsseq = "wgbs", directory = "./refs") +``` + +For WGBS analysis, CAMDAC requires the `java` command line utility to be available in the system PATH. + +With reference files downloaded, run the tumor-normal deconvolution pipeline with test data: + +> [!NOTE] +> We provide downsampled BAM files for testing the pipeline. For representative results, please use your own BAM files. 
+ +```{r example} +library(CAMDAC) + +tumor_bam <- system.file("testdata", "tumour_beds_min.sorted.bam", package = "CAMDAC") +normal_bam <- system.file("testdata", "normal_beds_min.sorted.bam", package = "CAMDAC") + +# Select samples for basic tumor-normal analysis +tumor <- CamSample(id = "T", sex = "XY", bam = tumor_bam, patient_id="readme") +normal <- CamSample(id = "N", sex = "XY", bam = normal_bam, patient_id="readme") + +# Configure pipeline +config <- CamConfig( + outdir = "./validation/results/test_readme/", bsseq = "rrbs", lib = "pe", + build = "hg38", refs = "./refs", n_cores = 1, cna_caller='ascat', + min_cov=1, # Minimum tumour coverage at 1 for testing. + min_normal_cov=1, # Minimum normal coverage at 1 for testing. + min_mapq=1 # Minimum MAPQ at 1 for testing. +) + +# Run CAMDAC +CAMDAC::pipeline( + tumor, germline = normal, infiltrates = normal, origin = normal, config +) +``` + +For a more detailed walkthrough with test data, see `vignette("pipeline")`. + +## Contributing + +To contribute to CAMDAC, fork [the repository](https://github.com/VanLoo-lab/CAMDAC) and install the development dependencies with `remotes::install_dev_deps('.')`. + +After making your changes, run the build and test commands listed in `vignette("contributing")`. + +Finally, submit a [pull request](https://github.com/VanLoo-lab/CAMDAC/pulls) with the changes on your fork. diff --git a/README.md b/README.md index c15a203..b7a9e98 100755 --- a/README.md +++ b/README.md @@ -1,50 +1,101 @@ -# Copy-number Aware Methylation Deconvolution and Analysis of Cancer (CAMDAC) -Plesae refer to the [CAMDAC manual](https://htmlpreview.github.io/?https://github.com/VanLoo-lab/CAMDAC/blob/main/CAMDAC_manual/CAMDAC_manual.html) for a detailed description of the CAMDAC principles, installation and steps for running the code. + -To cite CAMDAC, please refer to our pre-print: [Larose Cadieux et al., 2020. Copy number-aware deconvolution of tumor-normal DNA methylation profiles. 
bioRxiv.](https://doi.org/10.1101/2020.11.03.366252). +# CAMDAC -## Installation +Copy-number Aware Methylation Deconvolution Analysis of Cancer (CAMDAC) +is an R library for deconvolving bulk tumor DNA methylation (bisulfite) +sequencing data ([Larose Cadieux et al., 2022, +bioRxiv](https://www.biorxiv.org/content/10.1101/2020.11.03.366252v2)). -The CAMDAC R library can be install from github repository: + -```r -# Install the remotes package -install.packages("remotes") + -# Install CAMDAC from GitHub -remotes::install_github("VanLoo-lab/CAMDAC") +## Documentation + +Visit . + +## Installation : Dockerhub + +A CAMDAC container is available on +[dockerhub](https://hub.docker.com/r/nmensah5/camdac) for use with +Docker, Singularity or Apptainer: + +``` bash +docker pull nmensah5/camdac:latest +echo "library(CAMDAC)" > commands.R +docker run -v $(pwd):/app nmensah5/camdac:latest Rscript commands.R ``` -Files required to run the CAMDAC pipeline [(listed here)](inst/extdata/pipeline_files_urls.txt) can be downloaded with a helper function: +## Installation : Github -```r -library(CAMDAC) -CAMDAC::download_pipeline_files(bsseq="rrbs", directory="pipeline_files/") +You can install CAMDAC and its dependencies from an R console: + +``` r +install.packages("remotes") +remotes::install_github("VanLoo-lab/CAMDAC") ``` ## Quickstart -To call CAMDAC with a matched tumor and adjacent normal sample: +We provide pre-built reference datasets for hg38 and hg19. These files +are required to run CAMDAC for either RRBS or WGBS analysis [from the +Zenodo repository: (10565423)](https://zenodo.org/records/10565423). An +R getter function is provided for convenience: + +``` r +CAMDAC::download_pipeline_files(bsseq = "rrbs", directory = "./refs") +CAMDAC::download_pipeline_files(bsseq = "wgbs", directory = "./refs") +``` + +For WGBS analysis, CAMDAC requires the `java` command line utility to be +available in the system PATH. 
-```r +With reference files downloaded, run the tumor-normal deconvolution +pipeline with test data: + +> \[\!NOTE\] +> We provide downsampled BAM files for testing the pipeline. For +> representative results, please use your own BAM files. + +``` r library(CAMDAC) -tumor_bam <- system.file("extdata", "test_tumor.bam", package = "CAMDAC") -normal_bam <- system.file("extdata", "test_normal.bam", package = "CAMDAC") - -CAMDAC::pipeline_tumor_normal( - patient_id="P1", - tumor_id="T", - normal_id="N", - tumor_bam=tumor_bam, - normal_bam=normal_bam, - sex="XY", - path="results/", - pipeline_files="pipeline_files/", - build="hg38", - min_tumor = 1, - min_normal = 1, - mq = 0, - n_cores = 1 + +tumor_bam <- system.file("testdata", "tumour_beds_min.sorted.bam", package = "CAMDAC") +normal_bam <- system.file("testdata", "normal_beds_min.sorted.bam", package = "CAMDAC") + +# Select samples for basic tumor-normal analysis +tumor <- CamSample(id = "T", sex = "XY", bam = tumor_bam, patient_id="readme") +normal <- CamSample(id = "N", sex = "XY", bam = normal_bam, patient_id="readme") + +# Configure pipeline +config <- CamConfig( + outdir = "./validation/results/test_readme/", bsseq = "rrbs", lib = "pe", + build = "hg38", refs = "./refs", n_cores = 1, cna_caller='ascat', + min_cov=1, # Minimum tumour coverage at 1 for testing. + min_normal_cov=1, # Minimum normal coverage at 1 for testing. + min_mapq=1 # Minimum MAPQ at 1 for testing. +) + +# Run CAMDAC +CAMDAC::pipeline( + tumor, germline = normal, infiltrates = normal, origin = normal, config ) ``` + +For a more detailed walkthrough with test data, see +`vignette("pipeline")`. + +## Contributing + +To contribute to CAMDAC, fork [the +repository](https://github.com/VanLoo-lab/CAMDAC) and install the +development dependencies with `remotes::install_dev_deps('.')`. + +After making your changes, run the build and test commands listed in +`vignette("contributing")`. 
+ +Finally, submit a [pull +request](https://github.com/VanLoo-lab/CAMDAC/pulls) with the changes on +your fork. diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..8bb174c --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,18 @@ +articles: +- title: Pipeline + navbar: ~ + contents: + - introduction + - setup + - pipeline + - output +- title: Experimental + navbar: ~ + contents: + - experimental +- title: Extra + navbar: ~ + contents: + - questions + - technical + - contributing diff --git a/docs/404.html b/docs/404.html new file mode 100644 index 0000000..1caa8eb --- /dev/null +++ b/docs/404.html @@ -0,0 +1,129 @@ + + + + + + + +Page not found (404) • CAMDAC + + + + + + + + + + + +
+
+ + + + +
+
+ + +Content not found. Please use links in the navbar. + +
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html new file mode 100644 index 0000000..bddae3a --- /dev/null +++ b/docs/LICENSE-text.html @@ -0,0 +1,123 @@ + +License • CAMDAC + + +
+
+ + + +
+
+ + +
# MIT License
+
+Copyright (c) 2020 CAMDAC
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+ +
+ + + +
+ + + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/articles/contributing.html b/docs/articles/contributing.html new file mode 100644 index 0000000..ba72f72 --- /dev/null +++ b/docs/articles/contributing.html @@ -0,0 +1,165 @@ + + + + + + + +Contributing • CAMDAC + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

To contribute to CAMDAC, fork the repository and install the development dependencies with remotes::install_dev_deps('.').

+

After making your changes, run the test and build commands listed below, then submit a pull request with the changes on your fork.

+
+

CAMDAC test and build commands +

+
+library(devtools)
+
+# Install dev dependencies
+devtools::install_dev_deps("VanLoo-lab/CAMDAC")
+
+# Update docs
+devtools::document()
+
+# Run tests
+devtools::test()
+
+# Build readme
+rmarkdown::render('README.Rmd', output_format='github_document', output_file='README.md')
+
+# Check package builds
+devtools::check()
+
+# Build documentation
+pkgdown::build_site(examples=FALSE, devel=TRUE, lazy=TRUE, preview=FALSE)
+pkgdown::preview_site() # To view. Or: python3 -m http.server --directory docs 8000
+
+# Commit changes on the docs/ folder before submitting 
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/contributing_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/contributing_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/contributing_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/experimental.html b/docs/articles/experimental.html new file mode 100644 index 0000000..962d3db --- /dev/null +++ b/docs/articles/experimental.html @@ -0,0 +1,641 @@ + + + + + + + +Experimental Features • CAMDAC + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

This document describes experimental features of the CAMDAC package. These features are not yet fully tested and may change in future releases. The following features are currently under development for the WGBS pipeline only:

+
    +
  • Deconvolution only
  • +
  • Using external copy number solutions
  • +
  • Copy number calling in tumor-only mode
  • +
  • Allele-specific methylation analysis
  • +
  • Normal DNA methylation panels
  • +
  • DMR visualisation
  • +
+
+

Deconvolution only +

+

The CAMDAC equation can be used to infer pure tumour DNA methylation rates, provided the following information is available per CpG:

+
    +
  • Bulk tumour methylation rate (CpG-wise)
  • +
  • Tumour allele-specific copy number state (local region overlapping CpG)
  • +
  • Tumour purity (single parameter per-sample)
  • +
+

Here is an example for 5 CpGs from a single sample. Note: the normal copy number state is assumed diploid (2) in humans:

+
+
+# Set parameters
+bulk = c(0.3, 0.5, 0.2, 0.1, 0.9)
+normal = c(0.3, 0.9, 0.1, 0.7, 0.5)
+ploidy = c(2, 2, 1, 3, 4)
+purity = 0.8
+
+# Deconvolve methylation rates
+pure_meth = CAMDAC:::calculate_mt(bulk, normal, purity, ploidy)
+
+# Set clean rates based on threshold
+pure_meth_clean = dplyr::case_when(
+  pure_meth < 0 ~ 0,
+  pure_meth > 1 ~ 1,
+  TRUE ~ pure_meth
+)
+

After deconvolution, it may be useful to estimate the CpG coverage in the deconvolved tumour sample. Additionally, the highest density interval (HDI) of the methylation rate may be informative for quality control. These metrics can be calculated given additional information on bulk methylated and unmethylated read counts:

+
+
+# Optional: calculate effective coverage of the tumour
+# # Requires coverage per CpG in the bulk sample
+bulk_coverage = c(10, 20, 5, 15, 30)
+pure_effective_coverage = CAMDAC:::calculate_mt_cov(bulk_coverage, purity, ploidy)
+
+# Optional: calculate the HDI of the pure tumour methylation rate
+bulk_methylated_count = c(3, 10, 1, 2, 27)
+bulk_unmethylated_count = c(7, 10, 4, 13, 3)
+normal_methylated_count = c(3, 9, 1, 5, 2)
+normal_unmethylated_count = c(7, 11, 3, 8, 3)
+
+# HDI function (fast)
+CAMDAC:::hdi_norm_approx(
+  bulk_methylated_count,
+  bulk_unmethylated_count,
+  normal_methylated_count,
+  normal_unmethylated_count,
+  purity,
+  ploidy
+)
+
+# HDI function (most accurate)
+CAMDAC:::vec_HDIofMCMC_mt( 
+  bulk_methylated_count,
+  bulk_unmethylated_count,
+  normal_methylated_count,
+  normal_unmethylated_count,
+  purity,
+  ploidy,
+  credMass=0.99
+)
+
+
+

Using external copy number solutions +

+

The germline sample is optional as, in the absence of patient-matched methylation data, you may already have an allele-specific CNA solution for your bulk tumor. For example, this could be derived from bulk WGS of the same sample.

+

You can provide this data in a tab-delimited text file as shown below. Importantly:

+
    +
  • column names are optional
  • +
  • purity and ploidy values are taken from the first data row alone
  • +
  • chromosome names may be given with or without ‘chr’ prefix
  • +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
chrom | start | end | major_cn | minor_cn | purity | ploidy
chr1 | 1 | 400 | 2 | 1 | 0.67 | 3.5
chr1 | 401 | 1000 | 1 | 1 | 0.67 | 3.5
+

To run CAMDAC with this CNA solution, attach the file to the tumor CamSample() object:

+
+library(CAMDAC)
+
+# Load test data
+b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
+b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC")
+cna_file <- system.file("testdata", "test.cna.txt", package = "CAMDAC")
+
+# Set config
+config <- CamConfig(outdir="./results", bsseq="wgbs", lib="pe", build="hg38", n_cores=10)
+
+# Create tumor object and attach CNA solution
+tumor <- CamSample(id="T", sex="XY", bam=b_tumor)
+attach_output(tumor, config, "cna", cna_file)
+
+# Define normal object(s) for deconvolution or differential methylation
+normal <- CamSample(id="N", sex="XY", bam=b_normal)
+
+# Run pipeline with CNA solution
+pipeline(
+    tumor=tumor,
+    germline=NULL,
+    infiltrates=normal,
+    origin=normal,
+    config=config
+)
+
+
+

Copy number calling in tumor-only mode +

+

If no SNP file is present for the germline, CAMDAC will infer the copy number calls from the tumor sample alone. Here, the BAF is calculated by a threshold on the tumor BAF, and the LogR is calculated by taking the coverage relative to the median. These results are not as accurate as using a germline normal sample.

+

You may already know where heterozygous SNPs lie for your sample, obviating the need for a tumor BAF threshold. In addition, you may have a proxy of the normal coverage for your platform, which is an improvement over taking the tumor median. You can provide this information by attaching a SNPs file to the germline CamSample object. The file should contain:

+ + + + + + + + + + + + + + + + + + + + + + + +
Field | Description
chrom | Chromosome name
POS | Position of SNP
BAF | (optional) B-allele frequency at this SNP
total_counts | (optional) Total number of reads at this SNP
+

POS and total_counts are used to derive the BAF and the LogR respectively. We strongly recommend that total_counts is derived from a normal sample sequenced with the same bisulfite-sequencing assay as the tumor; samples from unmatched patients are acceptable.

+

CAMDAC may be run to the copy number calling stage using the external heterozygous SNP file:

+
+library(CAMDAC)
+
+# Load test data
+b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
+snps_file <- system.file("testdata", "test.to.norm_pos.csv.gz", package = "CAMDAC")
+
+# Set config
+config <- CamConfig(outdir="./results", bsseq="wgbs", lib="pe", build="hg38", n_cores=10)
+
+# Create tumor object and attach CNA solution
+tumor <- CamSample(id="T", sex="XY", bam=b_tumor)
+attach_output(tumor, config, "cna", cna_file)
+
+# Define normal object(s) for deconvolution or differential methylation
+germline <- CamSample(id="G", sex="XY")
+attach_output(germline, config, "snps", snps_file)
+
+# Run pipeline with CNA solution
+pipeline(
+    tumor=tumor,
+    germline=germline,
+    infiltrates=NULL,
+    origin=NULL,
+    config=config
+)
+

After this, we recommend inspecting the CNA results. If all is well, the pipeline() function can be repeated with the infiltrates and origin CamSamples to complete deconvolution and differential methylation respectively.

+
+
+

Allele-specific methylation (ASM) analysis +

+

CAMDAC can be used to detect allele-specific methylation (ASM) by phasing CpGs to heterozygous SNPs and deconvolving bulk methylation rates per allele.

+

This tutorial steps through the ASM analysis pipeline (WGBS only):

+
    +
  1. Count CpG methylation on tumor and normal at sites phased to SNP loci.
  2. +
  3. Deconvolve methylation on tumor per haplotype using the normal
  4. +
  5. Assign allele-specific copy number state per CpG using the bulk tumor solution
  6. +
  7. Call allele-specific differential methylation within samples
  8. +
  9. Call allele-specific differential methylation between samples
  10. +
+

Results from this pipeline are found in the results directory under ‘PATIENT/AlleleSpecific’ and ‘PATIENT/Methylation’. See output file headings below for files and their content.

+
+

CAMDAC-ASM from BAM files +

+

The asm_pipeline() function runs CAMDAC-ASM analysis by generating the allele-specific copy number solution and heterozygous SNP loci, followed by deconvolution and differential ASM analysis:

+
+b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
+b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC")
+regions <- system.file("testdata", "test_wgbs_segments.bed", package = "CAMDAC") # speed up tests
+
+tumor <- CamSample(id = "T", sex = "XY", bam = b_tumor)
+normal <- CamSample(id = "N", sex = "XY", bam = b_normal)
+config <- CamConfig(
+  outdir = "./results", ref = "./pipeline_files", bsseq = "wgbs", lib = "pe", cores = 10,
+  min_cov = 1, # For test data
+  regions = regions
+)
+
+asm_pipeline(
+  tumor = tumor,
+  germline = normal,
+  infiltrates = normal,
+  origin = normal,
+  config = config
+)
+
+
+

CAMDAC-ASM from external inputs (in_development) +

+

To run the ASM pipeline from external inputs rather than deriving them from the BAM files, CAMDAC requires that: (1) each CamSample object has SNP loci attached; (2) the tumor CamSample object has an allele-specific CNA solution; and (3) all CamSample objects have BAM files available for phasing.

+

CAMDAC-ASM requires a file of heterozygous SNP loci against which CpGs will be phased. This is a tab-delimited file with a header containing four fields:

+ + + + + + + + + + + + + + + + + + + + + + + +
Field | Description
chrom | Chromosome name
pos | SNP loci position
ref | The reference allele (A/C/T/G)
alt | The alternate SNP allele (A/C/T/G)
+

First, attach your SNP loci file to the tumor object with attach_output(), then run asm_pipeline():

+
+# Setup CAMDAC samples
+tumor <- CamSample(id = "tumor", sex = "XY", bam = b_tumor)
+normal <- CamSample(id = "normal", sex = "XY", bam = b_normal)
+config <- CamConfig(
+  outdir = "./results", ref = "./pipeline_files", bsseq = "wgbs", lib = "pe", cores = 10,
+  min_cov = 1, # For test data
+  regions = regions
+) # For rapid testing
+
+# Add SNPs
+asm_snps_file <- system.file("testdata", "test_het_snps.tsv", package = "CAMDAC")
+attach_output(tumor, config, "asm_snps", asm_snps_file)
+attach_output(normal, config, "asm_snps", asm_snps_file)
+

Next, CAMDAC requires the allele-specific copy number solution from the tumor, attached as follows:

+
+cna_file <- system.file("testdata", "test_cna.tsv", package = "CAMDAC")
+attach_output(tumor, config, "cna", cna_file)
+

Finally, run the allele-specific methylation pipeline:

+
+asm_pipeline(
+  tumor = tumor,
+  infiltrates = normal,
+  origin = normal,
+  config = config
+)
+
+
+

CAMDAC-ASM using SNP calls from previous CAMDAC runs +

+

If you have already run the CAMDAC pipeline in tumor-normal mode, then the germline object’s SNP files will be used by default. The simplest run from BAM to ASM is shown below using matched normals for infiltrates and DMPs:

+
+b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
+b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC")
+regions <- system.file("testdata", "test_wgbs_segments.bed", package = "CAMDAC") # speed up tests
+
+tumor <- CamSample(id = "T", sex = "XY", bam = b_tumor)
+normal <- CamSample(id = "N", sex = "XY", bam = b_normal)
+config <- CamConfig(
+  outdir = "./test_results", bsseq = "wgbs", lib = "pe",
+  build = "hg38", n_cores = 10,
+  regions = regions,
+  min_cov = 1, # For test data
+  cna_caller = "ascat" # Battenberg always recommended, however ASCAT used here for rapid testing.
+)
+
+# Run the main CAMDAC pipeline to generate SNP files for ASM
+# Deconvolution skipped here for simplicity.
+pipeline(tumor, germline = normal, infiltrates = NULL, origin = NULL, config)
+
+# Run ASM pipeline
+asm_pipeline(
+  tumor = tumor,
+  germline = normal,
+  infiltrates = normal,
+  origin = normal,
+  config = config
+)
+
+
+

ASM output file headings +

+

** Allele-specific/ **

+
    +
  • *asm_counts.csv.gz - The number of reads supporting each allele at each CpG
  • +
  • *asm_hap_stats.csv.gz - Summary statistics for each phased SNP
  • +
  • *asm_phase_map.csv.gz - A mapping of CpG-SNP phased pairs per read
  • +
  • *snps.txt - The heterozygous SNP loci input for ASM analysis
  • +
  • *cna.csv - For the tumour, the allele-specific copy number profile. See format in vignette("pipeline").
  • +
+

** Methylation/ **

+
    +
  • *asm_meth.csv.gz - Allele-specific methylation rates for bulk samples
  • +
  • *asm_ss_dmp.csv.gz - Single sample differential allele-specific methylation
  • +
  • *asm_meth_cna.csv.gz - For the tumour, ASM rates with annotated copy number states
  • +
  • *asm_meth_pure.csv.gz - For the tumour, pure methylation rates for each allele
  • +
  • *asm_dmp.csv.gz - Differential allele-specific methylation between tumor and origin sample
  • +
+
+
+
+

Normal DNA methylation panels +

+

This feature is currently described for CAMDAC-WGBS only.

+
+

Create a methylation panel from multiple normal BAM files +

+

CAMDAC supports the use of multiple DNA methylation BAM files as a source of the normal infiltrates or normal cell of origin.

+

To create a panel, process your BAM files with the CAMDAC allele counter:

+
library(CAMDAC)
+
+# Get BAM files
+b_normal1 = system.file("inst/testdata/normal.bam")
+b_normal2 = system.file("inst/testdata/normal.bam")
+b_normal3 = system.file("inst/testdata/normal.bam")
+
+# Run allele counter
+for(file in c(b_normal1, b_normal2, b_normal3)){
+    prefix = fs::path_ext_remove(file)
+    outfile = paste0(prefix, ".all.SNPs.CG.csv.gz")
+    data = cmain_count_alleles(bam_file)
+    data.table::fwrite(data, outfile)
+}
+

The allele counts files can then be merged into a single file for the panel containing methylation data for deconvolution:

+
+panel_counts <- fs::dir_ls(".", glob="*.SNPs.CG.csv.gz")
+panel <- panel_meth_from_counts(panel_counts)
+data.table::fwrite(panel, "panel.m.csv.gz")
+

By default, panel counts are merged by summing the methylation read counts for each CpG site. You can customise the proportion of each sample that is used in the panel by specifying the ac_props argument in panel_meth_from_counts. To get the mean across each CpG site, simply pass equal proportions for each sample.

+

To run CAMDAC with your newly created panel, attach your panel to a CamSample object using the meth argument.

+
+# Load test data
+b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
+b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC")
+
+# Setup CAMDAC samples
+tumor <- CamSample(id="tumor", sex="XY", bam=b_tumor)
+normal <- CamSample(id="normal", sex="XY", bam=b_normal)
+config <- CamConfig(outdir="./results", ref="./pipeline_files", bsseq="wgbs", lib="pe", cores=10)
+
+# Setup panel sample
+panel <- CamSample(id="panel", sex="XY")
+panel_file <- system.file("testdata", "test_panel.m.csv.gz", package = "CAMDAC")
+attach_output(panel, config, "meth", panel_file)
+
+# Run CAMDAC with panel
+pipeline(
+    tumor=tumor,
+    germline=normal,
+    infiltrates=panel,
+    origin=panel,
+    config=config
+)
+
+
+

Create a methylation panel from a matrix of beta values +

+

If you have not started from BAM files, you can create a panel using a matrix of beta values:

+ + + + + + + + + + + + + + + + + + +
sample1sample2sample3
0.50.60.7
0.40.50.6
+

Additionally, a data frame specifying the positions of each CpG site in the beta value matrix is required. Here, start and end refer to the C and G of the CpG site respectively:

+ + + + + + + + + + + + + + + + + + +
chromstartend
chr1100101
chr1200201
+

The matrix and CpG locations can be passed directly to the panel_meth_from_beta() function, along with settings.

+
+# Load beta values and chromosome positions
+ex <- system.file("testdata", "test_panel_from_beta.csv", package = "CAMDAC")
+data <- data.table::fread(ex)
+mat = data[, 4:ncol(data)] # Beta value matrix with 3 samples
+
+# Create panel from beta values
+panel_beta <- panel_meth_from_beta(
+  mat = mat,
+  chrom = data$chrom,
+  start = data$start,
+  end = data$end, 
+  cov = 100,
+  props = c(0.1, 0.8, 0.1), # Proportions of each sample in panel
+  min_samples = 1,
+  max_sd = 1
+)
+

As CAMDAC requires coverage at each CpG site to estimate uncertainty, the cov value is given to all CpG sites when building a panel from beta values. Additionally, if any beta values are missing from a sample, proportions are recalculated among the remaining samples as this is the only information available to build the panel for that site.

+

There are two experimental arguments that can be set to filter CpG sites from the panel:

+
    +
  • min_samples: The minimum number of samples that have to have a beta value for a CpG to be included in the panel. The idea here is if you have sparse data, you can skip sites where you aren’t confident in the panel. Set this to 1 to use any sample.

  • +
  • max_sd: Maximum standard deviation of beta values across samples a CpG must have to be included in the panel. The idea here is that when combining many bulk methylomes from the same tissue, sites with high variability reflect sample-specific differences and their averages are less reliable for use in a methylation panel.

  • +
+
+
+
+

DMR visualisation +

+

CAMDAC produces several output files that visualise the copy number state. DNA methylation rates can be passed to external packages for visualisation. For a quick view of DMRs in R:

+
+library(data.table)
+library(ggplot2)
+library(CAMDAC)
+
+# Show DMPs around a region
+dmr <- data.table(dmr) # Object from CAMDAC output *annotated_DMRs.fst
+dmp <- data.table(dmp) # Object from CAMDAC *results_per_CpG.fst
+chrome <- dmr[1, ]$chrom
+starte <- dmr[1, ]$start
+ende <- dmr[1, ]$end
+offset <- 1000 # Offset 1kB either side of region
+dmp <- data.table(dmp)
+dm_regions <- dmp[chrom == as.character(chrome) & start >= (starte - offset) & end <= (ende + offset), ]
+
+# Using ggplot, generate a geom where the m_t values are
+tplt <- ggplot(dm_regions, aes(x = start)) +
+  geom_point(aes(y = m_t), color = "skyblue") +
+  geom_point(aes(y = m_n), color = "grey") +
+  geom_vline(aes(xintercept = start, color = DMP_t)) +
+  theme_classic() +
+  scale_color_manual(values = c("skyblue", "blue")) +
+  scale_y_continuous(limits = c(0, 1)) +
+  geom_vline(xintercept = c(start, end), color = "red", linetype = "dashed") +
+  labs(x = dm_regions$chrom[[1]])
+tplt
+
+

CAMDAC DMR Visualization

+
+

Here, light blue dots are the pure tumour, while light-grey are the normal. The red dash is the DMR region and the vertical lines are hypomethylated DMPs (blue) and hypermethylated DMPs (light blue).

+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/experimental_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/experimental_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/experimental_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/images/CAMDAC_manual_DMR_summary_plots.png b/docs/articles/images/CAMDAC_manual_DMR_summary_plots.png new file mode 100644 index 0000000..b359f72 Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_DMR_summary_plots.png differ diff --git a/docs/articles/images/CAMDAC_manual_SNP_data.png b/docs/articles/images/CAMDAC_manual_SNP_data.png new file mode 100644 index 0000000..22c7423 Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_SNP_data.png differ diff --git a/docs/articles/images/CAMDAC_manual_fig1.png b/docs/articles/images/CAMDAC_manual_fig1.png new file mode 100644 index 0000000..1f3e4e8 Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_fig1.png differ diff --git a/docs/articles/images/CAMDAC_manual_fig2.png b/docs/articles/images/CAMDAC_manual_fig2.png new file mode 100644 index 0000000..872b72f Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_fig2.png differ diff --git 
a/docs/articles/images/CAMDAC_manual_formatted_allele_counts_output.png b/docs/articles/images/CAMDAC_manual_formatted_allele_counts_output.png new file mode 100644 index 0000000..20104bd Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_formatted_allele_counts_output.png differ diff --git a/docs/articles/images/CAMDAC_manual_fragment_length_histogram.png b/docs/articles/images/CAMDAC_manual_fragment_length_histogram.png new file mode 100644 index 0000000..431c52b Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_fragment_length_histogram.png differ diff --git a/docs/articles/images/CAMDAC_manual_normal_SNP_data.png b/docs/articles/images/CAMDAC_manual_normal_SNP_data.png new file mode 100644 index 0000000..48926b4 Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_normal_SNP_data.png differ diff --git a/docs/articles/images/CAMDAC_manual_normal_methylation_output.png b/docs/articles/images/CAMDAC_manual_normal_methylation_output.png new file mode 100644 index 0000000..113023d Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_normal_methylation_output.png differ diff --git a/docs/articles/images/CAMDAC_manual_normal_methylation_rate_summary.png b/docs/articles/images/CAMDAC_manual_normal_methylation_rate_summary.png new file mode 100644 index 0000000..77c21e8 Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_normal_methylation_rate_summary.png differ diff --git a/docs/articles/images/CAMDAC_manual_tumour_versus_normal_methylomes.png b/docs/articles/images/CAMDAC_manual_tumour_versus_normal_methylomes.png new file mode 100644 index 0000000..71ed4d9 Binary files /dev/null and b/docs/articles/images/CAMDAC_manual_tumour_versus_normal_methylomes.png differ diff --git a/docs/articles/images/camdac_dmr_vis.png b/docs/articles/images/camdac_dmr_vis.png new file mode 100644 index 0000000..b64ad76 Binary files /dev/null and b/docs/articles/images/camdac_dmr_vis.png differ diff --git 
a/docs/articles/index.html b/docs/articles/index.html new file mode 100644 index 0000000..bd57d07 --- /dev/null +++ b/docs/articles/index.html @@ -0,0 +1,125 @@ + +Articles • CAMDAC + + +
+
+ + + +
+
+ + +
+

Pipeline

+

+ +
Introduction
+
+
Installation
+
+
CAMDAC pipeline
+
+
Results
+
+
+
+

Experimental

+

+ +
Experimental Features
+
+
+
+

Extra

+

+ +
FAQs
+
+
Technical Note
+
+
Contributing
+
+
+
+
+ + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/articles/introduction.html b/docs/articles/introduction.html new file mode 100644 index 0000000..10aaaef --- /dev/null +++ b/docs/articles/introduction.html @@ -0,0 +1,154 @@ + + + + + + + +Introduction • CAMDAC + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +
+

Introduction +

+

Solid tumours typically contain both cancer and admixed normal contaminating cells, which confounds the analysis of bulk cancer methylomes from bisulfite sequencing. To address these issues we present CAMDAC, a tool for Copy-number Aware Methylation Deconvolution Analysis of Cancer.

+

In brief, we show that the bulk tumour methylation rate (\(m_b\)) can be expressed as a weighted sum of the methylation rates of the tumour cells and normal contaminants, accounting for tumour purity and copy number (Figure 1). We derive purity and copy number estimates directly from bulk tumour RRBS data, leveraging somatic copy number aberration calls from ASCAT or Battenberg. We use bulk tissue- and sex-matched normal samples as proxy for the normal tumour-infiltrating cells (\(m_{n,i}\)), and obtain \(m_b\) from the bulk tumour data itself. This provides all the necessary information to extract the pure tumour methylation rate (\(m_t\)).

+
+
+

Figure 1. CAMDAC principles and key variables. Adapted from Larose Cadieux et al., 2020.

+
+
+


In Larose Cadieux et al., 2020, we obtained bulk tumour RRBS data from surgically resected lung cancers and patient-matched tumour-adjacent normal lung samples. Normal samples may be used for copy number profiling, as a proxy for the normal tumour-infiltrating cells (\(m_{n,i}\)), and as a proxy for the tumour cell of origin (\(m_{n,o}\)). Here, \(m_{n,i}\) is needed for bulk tumour methylation rate deconvolution and \(m_{n,o}\) is required for differential methylation analyses (Figure 2). In non-small cell lung cancer, we demonstrate that patient-matched tumour-adjacent normal is a suitable proxy for all normals, i.e. \(m_{n,i} \approx m_{n,o}\) (Larose Cadieux et al., 2020).

+
+
+

Figure 2. Key input and output data for CAMDAC

+
+
+


If the patient-matched tumour-adjacent normal tissue is not available, a tissue- and sex-matched normal may provide a substitute for the tumour-infiltrating normal cells (Figure 2). If the tissue-matched normal is a poor representative of the cell of origin, a different proxy may be used for differential methylation analysis.

+

The purified tumour methylation rates allow for accurate differential methylation analysis, both between tumour and normal cells and, in the case of multi-region sequencing, between different tumour samples. The deconvoluted methylation profiles accurately inform inter- and intra-tumour sample relationships and could enable the timing of copy number gains and (epi)mutations in tumour evolution. This is explained in more detail in Larose Cadieux et al., 2020.

+

At the time of writing, CAMDAC is compatible with human Msp1 digested single-end directional reduced representation bisulfite sequencing (RRBS) data and whole genome bisulfite sequencing (WGBS) data. The input must be in binary alignment map (BAM) format. Bases should be quality and adapter trimmed and PCR duplicates should be removed. BAM files may be aligned to the hg19, hg38, GRCh37 or GRCh38 human reference genome builds.

+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/output.html b/docs/articles/output.html new file mode 100644 index 0000000..220f0d8 --- /dev/null +++ b/docs/articles/output.html @@ -0,0 +1,298 @@ + + + + + + + +Results • CAMDAC + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

The CAMDAC pipeline returns a structured directory at the outdir from the CamConfig() object. The pipeline returns files unique to the RRBS and WGBS modules with the general structure:

+
└── <CamSample.patient_id>
+    ├── Allelecounts
+    │   ├── <CamSample.id>
+    ├── Copynumber
+    │   ├── <CamSample.id>
+    └── Methylation
+        └── <CamSample.id>
+

The sections below describe each results file in more detail. Next, see vignette("questions") for frequently asked questions or vignette("experimental") for details on experimental CAMDAC features.

+
+

RRBS pipeline output +

+
results/              
+└── P                                           
+    ├── Allelecounts
+    │   ├── N                  
+    │   │   └── P.N.SNPs.CpGs.all.sorted.RData
+    │   └── T                               
+    │       └── P.T.SNPs.CpGs.all.sorted.RData
+    ├── Copy_number                             
+    │   ├── N                        
+    │   │   ├── fragment_length_histogram.pdf      
+    │   │   ├── msp1_fragments_RRBS.RData
+    │   │   ├── P_N_normal_SNP_data.pdf
+    │   │   ├── P.N.SNPs.RData
+    │   │   └── Rplots.pdf
+    │   └── T
+    │       ├── fragment_length_histogram.pdf
+    │       ├── msp1_fragments_RRBS.RData
+    │       ├── P_T_SNP_data.pdf
+    │       ├── P.T.ACF.and.ploidy.txt
+    │       ├── P.T.ascat.bc.RData
+    │       ├── P.T.ascat.frag.RData
+    │       ├── P.T.ascat.output.RData
+    │       ├── P.T.ASCATprofile.png
+    │       ├── P.T.ASPCF.png
+    │       ├── P.T.BAF.PCFed.txt
+    │       ├── P.T.germline.png
+    │       ├── P.T.LogR.PCFed.txt
+    │       ├── P.T.rawprofile.png
+    │       ├── P.T.SNPs.RData
+    │       ├── P.T.sunrise.png
+    │       ├── P.T.tumour.png
+    │       └── Rplots.pdf
+    └── Methylation
+      ├── N
+      │   ├── dt_normal_m.RData
+      │   └── P_N_methylation_rate_summary.pdf
+      └── T
+          ├── CAMDAC_DMPs.bed
+          ├── CAMDAC_purified_tumour.bed
+          ├── CAMDAC_results_per_CpG.RData
+          ├── P_T_DMP_stats.txt
+          ├── P_T_methylation_rate_summary.pdf
+          ├── purified_tumour.RData
+          └── tumour_versus_normal_methylomes.pdf  
+ ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FileDescription
P.T.SNPs.CpGs.all.sorted.RDataAllele counts for a sample. Generated by processing BAM file
P.T.ascat.output.RDataASCAT copy number results
P.T.ASCATprofile.pngASCAT copy number profile
dt_normal_m.RDataBulk normal DNA methylation data
purified_tumour.RDataCAMDAC-purified DNA methylation rates
CAMDAC_results_per_CpG.fstCAMDAC deconvolution and differential methylation results
+
+
+

WGBS pipeline output +

+

CAMDAC outputs are written in the directory given by config$outdir in the format PATIENT/DATASET/SAMPLE/:

+
└── P
+    ├── Allelecounts
+    │   ├── N
+    │   │   └── P.N.SNPs.CpGs.all.sorted.csv.gz
+    │   └── T
+    │       └── P.T.SNPs.CpGs.all.sorted.csv.gz
+    ├── Copynumber
+    │   ├── N
+    │   │   └── P.N.SNPs.csv.gz
+    │   └── T
+    │       ├── ascat
+    │       ├── battenberg
+    │       ├── P.T.cna.txt
+    │       ├── P.T.SNPs.csv.gz
+    │       └── P.T.tnSNP.csv.gz
+    └── Methylation
+        ├── N
+        │   └── P.N.m.csv.gz
+        └── T
+            ├── P.T.CAMDAC_annotated_DMRs.fst
+            ├── P.T.CAMDAC_results_per_CpG.fst
+            ├── P.T.m.csv.gz
+            └── P.T.pure.csv.gz
+ ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
FileDescription
P.T.SNPs.CpGs.all.sorted.csv.gzAllele counts for a sample. Generated by processing BAM file
P.T.SNPs.csv.gzSNP counts for a sample.
P.T.cna.txtCAMDAC CNA result
P.T.m.csv.gzBulk methylation data
P.T.m.pure.csv.gzCAMDAC-deconvolved methylation data
P.T.CAMDAC_results_per_CpG.fstCAMDAC differentially methylated cytosines
P.T.CAMDAC_annotated_DMRs.fstCAMDAC differentially methylated regions
+

It is possible to manually override outputs for runs. See vignette("questions") for more details.

+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/output_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/output_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/output_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/pipeline.html b/docs/articles/pipeline.html new file mode 100644 index 0000000..9db662a --- /dev/null +++ b/docs/articles/pipeline.html @@ -0,0 +1,168 @@ + + + + + + + +CAMDAC pipeline • CAMDAC + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +

The entry-point to CAMDAC is the pipeline() function which expects a CamConfig() object and four CamSample() objects representing:

+
    +
  • +tumor : The bulk tumor sample to deconvolve
  • +
  • +germline : The germline normal data for copy number calling
  • +
  • +infiltrates : A proxy for the normal infiltrating cells
  • +
  • +origin : A proxy for the normal cell from which the tumour originated
  • +
+

The same normal sample may be passed repeatedly for the germline, infiltrates or origin, depending on your experimental design. See ?pipeline for more details.

+
+library(CAMDAC)
+
+# Path to BAM files
+tumor_bam <- system.file("testdata", "tumor.bam", package = "CAMDAC")
+normal_bam <- system.file("testdata", "normal.bam", package = "CAMDAC")
+
+# Select samples for basic tumor-normal analysis
+tumor <- CamSample(id = "T", sex = "XY", bam = tumor_bam)
+normal <- CamSample(id = "N", sex = "XY", bam = normal_bam)
+
+# Configure pipeline
+config <- CamConfig(
+  outdir = "./results", bsseq = "rrbs", lib = "pe",
+  build = "hg38", refs = "./refs", n_cores = 1, cna_caller = 'ascat'
+)
+
+# Run CAMDAC
+CAMDAC::pipeline(
+  tumor, germline = normal, infiltrates = normal, origin = normal, config
+)
+

Next, see vignette("output") for a detailed summary of CAMDAC results files.

+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/questions.html b/docs/articles/questions.html new file mode 100644 index 0000000..b39fbb7 --- /dev/null +++ b/docs/articles/questions.html @@ -0,0 +1,200 @@ + + + + + + + +FAQs • CAMDAC + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +
+

General FAQ +

+
+

What if I don’t have a CNA profile or matched germline sample? (WGBS) +

+

Ideally, CAMDAC is run with a matched normal sample from which to derive heterozygous germline SNPs for copy number estimation. In the absence of matched normals, a panel of sex- and tissue-matched normal samples may be used by averaging DNA methylation rates from multiple patients. See vignette("experimental") for more information.

+
+
+

I want to run CAMDAC on something other than hg19 or hg38 (WGBS) +

+

Please raise an issue on GitHub to request files for a new reference genome.

+
+
+

Can I skip steps of the analysis? (WGBS) +

+

When calling pipeline, if you do not give a normal infiltrate or cell of origin, the pipeline skips deconvolution and differential methylation respectively. This may be useful to run a quick first-pass to find and refit copy number solutions. When CAMDAC has found a solution and is rerun with the same tumor, config, and normal, the infiltrates and cell_of_origin arguments will continue the pipeline where it left off. The entire pipeline can be re-run by deleting the output directory or setting overwrite=TRUE in the CamConfig.

+
+
+

How do I run individual steps of the CAMDAC pipeline? (WGBS) +

+

The simplest way is to call pipeline with overwrite=FALSE in your config, giving the right normal sample for your step. Additionally, your CamConfig must use the same output directory as the previous run.

+

If, for any reason, you have changed the output directory structure from a previous run, you can initiate CAMDAC by manually passing outputs to CamSample objects. See vignette("output") for more information.

+

Finally, you can run the cmain_* functions used by pipeline() directly. For example, to run the deconvolution step, you can call cmain_deconvolve_methylation().

+
+
+

My CNA solution wasn’t right. How can I refit with different purity and ploidy values? (WGBS) +

+

If you want to use an external purity and ploidy solution, simply pass a CNA file that has only the purity and ploidy fields. Additionally, set refit=TRUE in the CamConfig and CAMDAC will use this to refit the sample.

+
+
+

Can I limit my analysis to specific regions of interest? +

+

To analyse specific genomic regions, you may pass a BED file to CAMDAC config:

+
+CamConfig(outdir=".", ref="./pipeline_files", regions="regions.bed")
+

CAMDAC will merge any overlapping regions prior to analysis.

+
+
+

How can I manually replace pipeline outputs? (WGBS) +

+

If you have outputs from a previous run, you can manually assign them to a CAMDAC object. This overwrites the expected path for that output type, allowing the pipeline to run with this data instead of computing it. Use the attach_output function, passing one of the following output types:

+
    +
  • +counts: CAMDAC allele counts *.SNPs.CpGs.all.sorted.csv.gz file
  • +
  • +snps: CAMDAC sample SNP counts *.SNPs.csv.gz file
  • +
  • +meth: CAMDAC bulk methylation *.m.csv.gz file
  • +
  • +cna: CAMDAC CNA *.cna.txt file
  • +
  • +pure: CAMDAC deconvolved methylation *.m.pure.csv.gz file
  • +
+

For example, to attach a previous counts file to a CAMDAC object:

+
+library(CAMDAC)
+tumor <- CamSample(id = "T", sex = "XY", bam = NULL)
+config <- CamConfig(outdir = tempdir(), build="hg38", bsseq="wgbs", lib="pe")
+counts_file <- system.file("testdata", "test.SNPs.CpGs.all.sorted.csv.gz", package = "CAMDAC")
+tumor <- attach_output(tumor, config, "counts", counts_file)
+

The CAMDAC pipeline can now access the file in the expected location at config$outdir.

+
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/questions_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/questions_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/questions_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/setup.html b/docs/articles/setup.html new file mode 100644 index 0000000..a3ea413 --- /dev/null +++ b/docs/articles/setup.html @@ -0,0 +1,181 @@ + + + + + + + +Installation • CAMDAC + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +
+

Install CAMDAC +

+

From the R console, install CAMDAC from github:

+
+install.packages("remotes")
+remotes::install_github("VanLoo-lab/CAMDAC")
+
+
+

Download pipeline reference files +

+

CAMDAC requires custom annotation files for RRBS and WGBS analysis, available at the Zenodo repository: (10565423). An R convenience function is provided to download these files:

+
+CAMDAC::download_pipeline_files(bsseq = "rrbs", directory = "./refs")
+CAMDAC::download_pipeline_files(bsseq = "wgbs", directory = "./refs")
+

Now, you’re ready to run CAMDAC! Next, see vignette("pipeline").

+
+

Reference file search priority +

+

CAMDAC searches for pipeline files in the following order:

+
    +
  1. A directory passed when creating the config object (see CamConfig())
  2. +
  3. The location defined by the environment variable CAMDAC_PIPELINE_FILES.
  4. +
  5. The current working directory
  6. +
+

We recommend that you set the environment variable CAMDAC_PIPELINE_FILES to the directory where you downloaded the files. This will allow CAMDAC to find the files automatically whenever you load R.

+

From a Unix terminal:

+
+

echo "CAMDAC_PIPELINE_FILES=$(realpath R)" >> ~/.Renviron

+
+
+
+
+

External dependencies +

+

CAMDAC-RRBS

+
    +
  • None
  • +
+

CAMDAC WGBS

+
    +
  • +java: To run CAMDAC on WGBS data, we leverage Battenberg which requires the java command-line utility. Download Java from https://openjdk.org/.
  • +
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/setup_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/setup_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/setup_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/technical.html b/docs/articles/technical.html new file mode 100644 index 0000000..faab236 --- /dev/null +++ b/docs/articles/technical.html @@ -0,0 +1,355 @@ + + + + + + + +Technical Note • CAMDAC + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + +


In this section, we provide a high-level summary of the CAMDAC pipeline, which covers five key steps:

+
    +
  1. +Allele Counting: Obtain allele counts at SNP and CpG loci.
    +
  2. +
  3. +Copy-number calling: Obtain allele-specific copy number profiles, tumour purity and SNP plot data.
  4. +
  5. +Methylation Processing: Filter, format and plot methylation data.
  6. +
  7. +Deconvolution: Deconvolve the pure tumour methylation rates from bulk tumour RRBS data.
  8. +
  9. +Differential methylation: Perform differential tumour-normal methylation analysis.
  10. +
+

For a full outline and validation of CAMDAC, please see Larose Cadieux et al. (2020) bioRxiv.

+
+

Allele counting +

+

Take a hypothetical female patient with primary tumour sample ID “T1” and normal-adjacent sample ID “N1”. First, CAMDAC takes the sequencing alignment files from each sample via the CamSample() function. Users should provide the full path and file name of the RRBS or WGBS binary alignment map (.bam) file for each input sample, and use the CamConfig() object to indicate whether the files are aligned to hg19, hg38, GRCh37 or GRCh38. Bases should be quality and adapter trimmed and PCR duplicates should be removed. Please ensure that each BAM file is sorted and indexed.

+

CAMDAC employs an allele counter module to count SNP and CpG (methylation) alleles for downstream analysis. SNP counts are performed at 1000 Genomes SNP positions, and CpG alleles are counted using dinucleotides. To speed up the computation, we leverage reference RRBS and WGBS genome files listing all genomic regions supported by the respective platforms.

+

By default, the read mapping quality filter in CamConfig() is set to mq>=0. Mapping quality scores from bisulfite sequencing aligners may be biased against the alternate allele for reads with polymorphisms. Please review the mapping quality distribution of your data to determine if it is appropriate to increase this setting.

+

If the function is successful, a single output file is written with the suffix “SNPs.CpGs”. This file carries compiled SNP and methylation information with the following columns:

+
+
+

Figure. Formatted SNP and methylation information

+
+
+

Each row is either a CG locus (and CCGG for RRBS) and/or a 1000g SNP position. These can be distinguished by the width column. While polymorphic CG/CCGG have the same width as their non-polymorphic counterpart, they are easily identified by looking at the POS, ref, alt and other SNP-informative columns.

+

For each SNP locus, the 1000 Genomes genomic coordinate and the reference and alternate alleles are listed under the POS, ref and alt columns. The total_counts is the sum of alt_counts and ref_counts, which includes all informative strand-specific allele counts. For example, at \(C>T\) SNPs, only the reverse strand makes it possible to distinguish between the (un)methylated reference and the alternate allele and thus all forward read counts would be excluded from the total_counts column, but included in the total_depth. The SNP type column is only added to the patient-matched normal, which is used to assign SNP genotypes as either Homozygous or Heterozygous based on internal B-allele frequency (BAF) cut-offs.

+

M, UM, total_counts_m, and m are the counts methylated, counts unmethylated, the total counts (un)methylated and the methylation rate, respectively. Methylation rates are calculated per CG allele, meaning that at polymorphic CpGs, only the CG-forming allele counts are considered. CAMDAC methylation rates are therefore polymorphism-independent.

+

For CCGG loci found in RRBS, the CCGG column indicates the number of fragments with a 5’ end at this CCGG locus. This number may be 0 at polymorphic CCGG loci homozygous for the CCGG-destroying allele. Furthermore, for RRBS, MspI fragment boundaries are determined from the aligned reads and the MspI fragment size distribution is visualised for quality assessment in the file fragment_length_histogram.pdf. You should observe 3 distinct peaks in the fragment length distribution. This is characteristic of human RRBS libraries and originates from MspI containing micro-satellite repeats of distinct lengths. The MspI fragment boundaries and their GC content are saved as an .RData object and used downstream in RRBS copy number profiling.

+
+
+

Figure. MspI fragment size distribution

+
+
+


+
+
+

Copy number calling +

+

B-allele frequencies at heterozygous SNPs are leveraged to calculate pure tumour copy number aberrations using either ASCAT.m for RRBS or Battenberg.m for WGBS. These tools are inspired by ASCAT (Van Loo et al., 2010) and Battenberg (Nik-Zainal et al., 2012). If successful, CAMDAC writes copy number output to the “Copy_number” directory.

+

A SNPs file lists the heterozygous SNPs selected for copy number analysis, resulting in a table where each row is a 1000g SNP position that meets the minimum coverage in the germline sample, as set by the min_normal argument. The total_counts column is the total informative read counts. For example, at C\(>\)T SNPs, only the reverse strand allows us to distinguish between the unmethylated reference and the alternate allele and thus forward read counts do not contribute to the total_counts or to the BAF (B-allele frequency) calculation. rBAF is randomly assigned as BAF or 1-BAF to remove biases against the alternate allele in downstream tumour copy number profiling. All read counts, however, contribute to the total_depth, which is used for the LogR calculation, a measure of total coverage. Genotyping is performed and assignments are stored under type.

+

For the RRBS pipeline, we provide an experimental feature to visualise the magnitude of biases against alternate (B) alleles. The number of homozygous to heterozygous SNPs is depicted and any biases in coverage against the latter can be evaluated. Because RRBS is biased towards CpG-rich genomic regions, a typical RRBS sample should show a high ratio of C\(>\)T SNPs. We note that C\(>\)T and A\(>\)G germline heterozygous SNPs will have roughly half the coverage of the other 4 types of SNPs.

+
+
+

Figure Normal SNP data QC

+
+
+


+

In addition to the above-mentioned columns, we also adjust for biases in the tumour LogR. The LogR is a normalised measure of tumour coverage used by ASCAT.m and Battenberg.m for copy number profiling together with the BAF. The covariates used for LogR correction are:

+
    +
  • +GC_content: The GC content of fragments leads to sequencing biases, namely at the PCR amplification step.
  • +
  • +replic: The local genomic replication timing affects the number of copies present at a given locus in cells undergoing S phase.
  • +
  • +msp1_length: RRBS only. The MspI fragment length is highly variable and we observe sequencing biases against fragments at the extremes of the fragment size distribution.
  • +
+

Next, the standard ASCAT or Battenberg outputs are generated. All files have the dot-separated patient and sample IDs as prefix. In addition, we plot the BAF and LogR. In the BAF profiles, heterozygous SNPs are highlighted in red. The BAF and LogR tracks are then segmented by the respective tools. The segmentation is then analysed to determine the optimal tumour purity and ploidy solution via a grid search (see sunrise plot). Raw and rounded allele-specific copy number segments are provided as output png images.

+

Finally, the purity, ploidy, number of heterozygous and homozygous 1000g SNP positions and median tumour and normal SNP depth are saved for each tumour sample. For RRBS, summary SNP data is plotted and saved as a pdf with filename "*_SNP_data.pdf*" and may help you troubleshoot your data.

+
+
+

Figure. Tumour SNP data summary

+
+
+


+
+
+

Methylation processing +

+

As part of the allele counting step, CAMDAC calculates bulk DNA methylation rates for each input sample. For the patient- and tissue-matched normal sample “N1”, the methylation data columns have the suffix \(x = n\), since \(m_{n,i} \sim m_{n,o}\). Where \(m_{n,i} \neq m_{n,o}\), the suffix is set to \(x = n\_i\) for the normal infiltrates and \(x = n\_o\) for the normal cell of origin proxy sample. The uncertainty on \(m_{x}\) is computed as the lower and upper boundaries of the 99% Highest Density Interval (HDI), which are stored under columns \(m_{x,low}\) and \(m_{x,high}\).

+
    +
  • +CHR: Chromosome name with ‘chr’ prefix
  • +
  • +start: First base of CG/CCGG
  • +
  • +end: Last base of CG/CCGG
  • +
  • +M_x: Counts methylated
  • +
  • +UM_x: Counts unmethylated
  • +
  • +m_x: Methylation rate
  • +
  • +m_x_low: Lower boundary of the 99% HDI for \(m_{x}\) +
  • +
  • +m_x_high: Upper boundary of the 99% HDI for \(m_{x}\) +
  • +
+
+
+

Figure. Normal methylation output.

+
+
+

In the normal sample methylation output directory, you will find a pdf with methylation data summary and QC (RRBS only). We expect DNA methylation rates to sit near 0 and 1. CAMDAC calculates DNA methylation rates in a polymorphism-independent manner, meaning that the CG-destroying allele at a heterozygous CpG does not contribute to its methylation rate. The minimum coverage threshold applied to CpG sites is based on the CpG allele read depth, so any heterozygous SNPs present at the CG location may be removed due to insufficient coverage.

+
+
+

Figure. Normal methylation rate QC.

+
+
+


+
+
+

Deconvolution +

+

At this stage, CAMDAC has obtained methylation rates for both the normal infiltrates and bulk tumour, as well as tumour copy number and purity estimates. The DNA methylation profile of the normal-adjacent samples may be used as a proxy for the methylation rate of tumour-infiltrating normal cells (\(m_{n,i}\)). We have all the necessary information to obtain CAMDAC pure tumour methylation rates, \(m_t\).

+

In the Methylation/ output directory, CpG copy number and purified tumour methylation data are written to output CSV files. Header fields include:

+
    +
  • +nA: Major allele copy number
  • +
  • +nB: Minor allele copy number
  • +
  • +CN: Total allele copy number
  • +
  • +seg_start: Copy number segment start point
  • +
  • +seg_end: Copy number segment end point
  • +
  • +CG_CN: CpG allele total copy number (this differs from CN at polymorphic CpGs)
  • +
  • +m_t_raw: Raw CAMDAC purified tumour methylation rate
  • +
  • +m_t_corr: Corrected CAMDAC purified tumour methylation rate
  • +
  • +cov_t: CAMDAC purified tumour effective read coverage
  • +
  • +m_t_low: CAMDAC purified tumour 99% HDI lower boundary
  • +
  • +m_t_high: CAMDAC purified tumour 99% HDI upper boundary
  • +
+

CAMDAC-deconvoluted methylation rates can take any value between 0 and 1, while the range of bulk tumour methylation rates is driven by tumour DNA content. In the bulk tumour profiles, bi-allelic tumour-normal differentially methylated positions appear at intermediate methylation values while after purification, they form a peak near 0 or 1 for hypo- and hypermethylated positions, respectively.

+
+
+

Figure. Tumour versus normal methylation rates from before and after CAMDAC.

+
+
+


+
+
+

Differential methylation +

+

For tumour-normal differential methylation analysis, CAMDAC expects a DNA methylation profile representing the tumour cell of origin (\(m_{n,o}\)). In this hypothetical example, we set the normal sample N1 as the cell of origin. Leveraging CAMDAC purified methylomes, we then obtain differentially methylated positions and regions.

+

Differential DNA methylation is detected with a minimum tumour-normal methylation rate difference (effect size, where \(\delta\beta\) >= 0.2) and a probability threshold, representing the probability that the tumour and normal beta distributions do not overlap. Both variables are used for calling differentially methylated positions (DMPs).

+

Next, CAMDAC builds on DMP calls to call DMRs. To identify differentially methylated regions (DMRs), we group CpGs into bins and look for clusters with at least 5 DMPs (min_DMP_counts_in_DMR=5), 4 of which must be consecutive (min_consec_DMP_in_DMR=4). After completion, this function generates a pure tumor methylation file (CAMDAC_results_per_CpG.RData for RRBS or pure.csv.gz for WGBS) in the CAMDAC methylation output directory. This R object is a combination of all CAMDAC results per CpG with DMP information included:

+
    +
  • +cluster_id: RRBS CpG cluster
  • +
  • +chrom: Chromosome name (i.e. 1, 2, …, X)
  • +
  • +start: First base of CG/CCGG
  • +
  • +end: Last base of CG/CCGG
  • +
  • +m_n: normal methylation rate
  • +
  • +m_n_low: normal methylation rate HDI99 lower boundary
  • +
  • +m_n_high: normal methylation rate HDI99 upper boundary
  • +
  • +m_t: CAMDAC pure tumour methylation rate
  • +
  • +m_t_low: CAMDAC pure tumour methylation rate HDI99 lower boundary
  • +
  • +m_t_high: CAMDAC pure tumour methylation rate HDI99 upper boundary
  • +
  • +prob: Tumour-normal DMP probability
  • +
  • +CG_CN: CpG allele total copy number
  • +
  • +nA: Major allele copy number
  • +
  • +nB: Minor allele copy number
  • +
  • +segment: Copy number segment endpoints
  • +
  • +DMR_type: “hyper”, “hypo” or “mixed”
  • +
  • +CpG_counts: Number of CpGs in a given bin
  • +
  • +DMP_counts: Number of DMPs
  • +
  • +consec_DMPs: Number of consecutive DMPs
  • +
  • +DMR: “DMR” if differentially methylated, NA otherwise.
  • +
  • +m_diff_tn: CAMDAC-purified tumour \(-\) normal methylation rates
  • +
  • +prob_DMP: DMP probability
  • +
  • +DMP_t: DMP calls based on CAMDAC-purified tumour versus normal methylation rates
  • +
+

The ratio of hyper- to hypomethylated DMRs varies across genomic regions and is reflected by the tumour-normal methylation rate difference.

+
+
+

Figure. DMR summary data.

+
+
+


+
+

Leveraging CAMDAC outputs +

+

CAMDAC outputs will be stored at the user-defined project outdir variable given to the configuration (CamConfig()). A patient folder is created at this path with directory name set to patient_id. This will contain 3 subdirectories: Allelecounts, Copy_number and Methylation, with further sub-directories created for each of a given patient’s samples.

+

With CAMDAC differential methylation calls in hand, users may choose to look for recurrently aberrated loci across their cohort. Note that tumour-tumour DMPs can be easily identified by looking for overlap between the 99% HDIs for CAMDAC pure tumour methylation rates between samples (99% HDI \(\subseteq\) [m_t_low,m_t_high]).

+

Clustering analyses can also easily be performed by the user using well-established R packages such as ‘pvclust’ for hierarchical clustering with bootstrap and ‘umap’ (uniform manifold approximation and projection) for non-linear dimensionality reduction. Clustering of pure tumour methylation rates at promoter DMRs across large cohorts by ‘umap’ may reveal histology and/or sex-driven clusters, as described in non-small cell lung cancer (Larose Cadieux et al., 2020).

+

For multi-region data, sample tree reconstruction by neighbour joining leveraging CAMDAC pure tumour methylation rates at hypermethylated DMPs in at least one sample, subset to loci confidently unmethylated in the normal cell of origin (m_n_high<0.2), can reveal inter-sample relationships, as demonstrated in non-small cell lung cancer (Larose Cadieux et al., 2020).

+

When running gene-set enrichment analysis (GSEA) on CAMDAC DMR calls, gene sets should be limited to those genes with promoters covered by RRBS. It may be desirable to subset DMR calls to hypermethylated promoter-associated CpG Islands given that methylation at these loci is most correlated with expression.

+

Users may leverage normal, deconvoluted tumour methylation rates and tumour-normal DMP calls to separate clonal mono- and bi-allelic from subclonal bi-allelic methylation changes to shed light on tumour evolutionary histories (Larose Cadieux et al., 2020). The allele-specific CAMDAC module will be made available in future releases.

+
+
+

References +

+

Larose Cadieux et al. (2020). Copy number-aware deconvolution of tumor-normal DNA methylation profiles, bioRxiv 2020.11.03.366252

+
+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.9.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/technical_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/technical_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/technical_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/authors.html b/docs/authors.html new file mode 100644 index 0000000..64004e9 --- /dev/null +++ b/docs/authors.html @@ -0,0 +1,142 @@ + +Authors and Citation • CAMDAC + + +
+
+ + + +
+
+
+ + + +
  • +

    Elizabeth Larose Cadieux. Author. +

    +
  • +
  • +

    Nana Mensah. Author, maintainer. +

    +
  • +
  • +

    Siqi Lai. Author. +

    +
  • +
  • +

    Carla Castignani. Author. +

    +
  • +
  • +

    Jonas Demeulemeester. Author. +

    +
  • +
  • +

    Peter Van Loo. Author, funder. +

    +
  • +
+
+
+

Citation

+ +
+
+ + +

Larose Cadieux E, Mensah N, Lai S, Castignani C, Demeulemeester J, Van Loo P (2025). +CAMDAC: Copy-number Aware Methylation Deconvolution and Analysis of Cancers. +R package version 0.2.0. +

+
@Manual{,
+  title = {CAMDAC: Copy-number Aware Methylation Deconvolution and Analysis of Cancers},
+  author = {Elizabeth {Larose Cadieux} and Nana Mensah and Siqi Lai and Carla Castignani and Jonas Demeulemeester and Peter {Van Loo}},
+  year = {2025},
+  note = {R package version 0.2.0},
+}
+ +
+ +
+ + + +
+ +
+

Site built with pkgdown 2.0.9.

+
+ +
+ + + + + + + + diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css new file mode 100644 index 0000000..5a85941 --- /dev/null +++ b/docs/bootstrap-toc.css @@ -0,0 +1,60 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ + +/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ + +/* All levels of nav */ +nav[data-toggle='toc'] .nav > li > a { + display: block; + padding: 4px 20px; + font-size: 13px; + font-weight: 500; + color: #767676; +} +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 19px; + color: #563d7c; + text-decoration: none; + background-color: transparent; + border-left: 1px solid #563d7c; +} +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 18px; + font-weight: bold; + color: #563d7c; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +/* Nav: second level (shown on .active) */ +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} +nav[data-toggle='toc'] .nav .nav > li > a { + padding-top: 1px; + padding-bottom: 1px; + padding-left: 30px; + font-size: 12px; + font-weight: normal; +} +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 29px; +} +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 28px; + font-weight: 500; +} + +/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 
+nav[data-toggle='toc'] .nav > .active > ul { + display: block; +} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js new file mode 100644 index 0000000..1cdd573 --- /dev/null +++ b/docs/bootstrap-toc.js @@ -0,0 +1,159 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ +(function() { + 'use strict'; + + window.Toc = { + helpers: { + // return all matching elements in the set, or their descendants + findOrFilter: function($el, selector) { + // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ + // http://stackoverflow.com/a/12731439/358804 + var $descendants = $el.find(selector); + return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); + }, + + generateUniqueIdBase: function(el) { + var text = $(el).text(); + var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); + return anchor || el.tagName.toLowerCase(); + }, + + generateUniqueId: function(el) { + var anchorBase = this.generateUniqueIdBase(el); + for (var i = 0; ; i++) { + var anchor = anchorBase; + if (i > 0) { + // add suffix + anchor += '-' + i; + } + // check if ID already exists + if (!document.getElementById(anchor)) { + return anchor; + } + } + }, + + generateAnchor: function(el) { + if (el.id) { + return el.id; + } else { + var anchor = this.generateUniqueId(el); + el.id = anchor; + return anchor; + } + }, + + createNavList: function() { + return $(''); + }, + + createChildNavList: function($parent) { + var $childList = this.createNavList(); + $parent.append($childList); + return $childList; + }, + + generateNavEl: function(anchor, text) { + var $a = $(''); + $a.attr('href', '#' + anchor); + $a.text(text); + var $li = $('
  • '); + $li.append($a); + return $li; + }, + + generateNavItem: function(headingEl) { + var anchor = this.generateAnchor(headingEl); + var $heading = $(headingEl); + var text = $heading.data('toc-text') || $heading.text(); + return this.generateNavEl(anchor, text); + }, + + // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). + getTopLevel: function($scope) { + for (var i = 1; i <= 6; i++) { + var $headings = this.findOrFilter($scope, 'h' + i); + if ($headings.length > 1) { + return i; + } + } + + return 1; + }, + + // returns the elements for the top level, and the next below it + getHeadings: function($scope, topLevel) { + var topSelector = 'h' + topLevel; + + var secondaryLevel = topLevel + 1; + var secondarySelector = 'h' + secondaryLevel; + + return this.findOrFilter($scope, topSelector + ',' + secondarySelector); + }, + + getNavLevel: function(el) { + return parseInt(el.tagName.charAt(1), 10); + }, + + populateNav: function($topContext, topLevel, $headings) { + var $context = $topContext; + var $prevNav; + + var helpers = this; + $headings.each(function(i, el) { + var $newNav = helpers.generateNavItem(el); + var navLevel = helpers.getNavLevel(el); + + // determine the proper $context + if (navLevel === topLevel) { + // use top level + $context = $topContext; + } else if ($prevNav && $context === $topContext) { + // create a new level of the tree and switch to it + $context = helpers.createChildNavList($prevNav); + } // else use the current $context + + $context.append($newNav); + + $prevNav = $newNav; + }); + }, + + parseOps: function(arg) { + var opts; + if (arg.jquery) { + opts = { + $nav: arg + }; + } else { + opts = arg; + } + opts.$scope = opts.$scope || $(document.body); + return opts; + } + }, + + // accepts a jQuery object, or an options object + init: function(opts) { + opts = this.helpers.parseOps(opts); + + // ensure that the data attribute is in place for styling + opts.$nav.attr('data-toggle', 'toc'); + + var $topContext = this.helpers.createChildNavList(opts.$nav); + var topLevel = this.helpers.getTopLevel(opts.$scope); + var $headings = this.helpers.getHeadings(opts.$scope, topLevel); + this.helpers.populateNav($topContext, topLevel, $headings); + } + }; + + $(function() { + $('nav[data-toggle="toc"]').each(function(i, el) { + var $nav = $(el); + 
Toc.init($nav); + }); + }); +})(); diff --git a/docs/docsearch.css b/docs/docsearch.css new file mode 100644 index 0000000..e5f1fe1 --- /dev/null +++ b/docs/docsearch.css @@ -0,0 +1,148 @@ +/* Docsearch -------------------------------------------------------------- */ +/* + Source: https://github.com/algolia/docsearch/ + License: MIT +*/ + +.algolia-autocomplete { + display: block; + -webkit-box-flex: 1; + -ms-flex: 1; + flex: 1 +} + +.algolia-autocomplete .ds-dropdown-menu { + width: 100%; + min-width: none; + max-width: none; + padding: .75rem 0; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, .1); + box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); +} + +@media (min-width:768px) { + .algolia-autocomplete .ds-dropdown-menu { + width: 175% + } +} + +.algolia-autocomplete .ds-dropdown-menu::before { + display: none +} + +.algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { + padding: 0; + background-color: rgb(255,255,255); + border: 0; + max-height: 80vh; +} + +.algolia-autocomplete .ds-dropdown-menu .ds-suggestions { + margin-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion { + padding: 0; + overflow: visible +} + +.algolia-autocomplete .algolia-docsearch-suggestion--category-header { + padding: .125rem 1rem; + margin-top: 0; + font-size: 1.3em; + font-weight: 500; + color: #00008B; + border-bottom: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--wrapper { + float: none; + padding-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { + float: none; + width: auto; + padding: 0; + text-align: left +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content { + float: none; + width: auto; + padding: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content::before { + display: none +} + +.algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { + padding-top: .75rem; + margin-top: .75rem; + 
border-top: 1px solid rgba(0, 0, 0, .1) +} + +.algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { + display: block; + padding: .1rem 1rem; + margin-bottom: 0.1; + font-size: 1.0em; + font-weight: 400 + /* display: none */ +} + +.algolia-autocomplete .algolia-docsearch-suggestion--title { + display: block; + padding: .25rem 1rem; + margin-bottom: 0; + font-size: 0.9em; + font-weight: 400 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--text { + padding: 0 1rem .5rem; + margin-top: -.25rem; + font-size: 0.8em; + font-weight: 400; + line-height: 1.25 +} + +.algolia-autocomplete .algolia-docsearch-footer { + width: 110px; + height: 20px; + z-index: 3; + margin-top: 10.66667px; + float: right; + font-size: 0; + line-height: 0; +} + +.algolia-autocomplete .algolia-docsearch-footer--logo { + background-image: url("data:image/svg+xml;utf8,"); + background-repeat: no-repeat; + background-position: 50%; + background-size: 100%; + overflow: hidden; + text-indent: -9000px; + width: 100%; + height: 100%; + display: block; + transform: translate(-8px); +} + +.algolia-autocomplete .algolia-docsearch-suggestion--highlight { + color: #FF8C00; + background: rgba(232, 189, 54, 0.1) +} + + +.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { + box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) +} + +.algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { + background-color: rgba(192, 192, 192, .15) +} diff --git a/docs/docsearch.js b/docs/docsearch.js new file mode 100644 index 0000000..b35504c --- /dev/null +++ b/docs/docsearch.js @@ -0,0 +1,85 @@ +$(function() { + + // register a handler to move the focus to the search bar + // upon pressing shift + "/" (i.e. 
"?") + $(document).on('keydown', function(e) { + if (e.shiftKey && e.keyCode == 191) { + e.preventDefault(); + $("#search-input").focus(); + } + }); + + $(document).ready(function() { + // do keyword highlighting + /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ + var mark = function() { + + var referrer = document.URL ; + var paramKey = "q" ; + + if (referrer.indexOf("?") !== -1) { + var qs = referrer.substr(referrer.indexOf('?') + 1); + var qs_noanchor = qs.split('#')[0]; + var qsa = qs_noanchor.split('&'); + var keyword = ""; + + for (var i = 0; i < qsa.length; i++) { + var currentParam = qsa[i].split('='); + + if (currentParam.length !== 2) { + continue; + } + + if (currentParam[0] == paramKey) { + keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); + } + } + + if (keyword !== "") { + $(".contents").unmark({ + done: function() { + $(".contents").mark(keyword); + } + }); + } + } + }; + + mark(); + }); +}); + +/* Search term highlighting ------------------------------*/ + +function matchedWords(hit) { + var words = []; + + var hierarchy = hit._highlightResult.hierarchy; + // loop to fetch from lvl0, lvl1, etc. + for (var idx in hierarchy) { + words = words.concat(hierarchy[idx].matchedWords); + } + + var content = hit._highlightResult.content; + if (content) { + words = words.concat(content.matchedWords); + } + + // return unique words + var words_uniq = [...new Set(words)]; + return words_uniq; +} + +function updateHitURL(hit) { + + var words = matchedWords(hit); + var url = ""; + + if (hit.anchor) { + url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; + } else { + url = hit.url + '?q=' + escape(words.join(" ")); + } + + return url; +} diff --git a/docs/html/404.html b/docs/html/404.html new file mode 100644 index 0000000..1caa8eb --- /dev/null +++ b/docs/html/404.html @@ -0,0 +1,129 @@ + + + + + + + +Page not found (404) • CAMDAC + + + + + + + + + + + +
    +
    + + + + +
    +
    + + +Content not found. Please use links in the navbar. + +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/DEV.html b/docs/html/DEV.html new file mode 100644 index 0000000..b0b4136 --- /dev/null +++ b/docs/html/DEV.html @@ -0,0 +1,125 @@ + +Dev • CAMDAC + + +
    +
    + + + +
    +
    + + + +
    pkgdown::build_articles(override=list(destination='docs/html'))
    +pkgdown::build_site(examples=FALSE, devel=TRUE, lazy=TRUE, preview=FALSE, override=list(destination='docs/html'))
    +
    +

    Development environment

    +

    Currently running in Docker, using the commands below.

    +
    # Start server
    +colima start --cpu 4 --memory 13
    +
    +# Build docker image
    +docker build -t camdac .
    +docker buildx build --platform linux/amd64 -t nmensah5/camdac:latest .
    +
    +# Run and enter image interactive mode
    +docker run -it -v "$(pwd):/app" camdac:latest bash
    +docker run -it -v "$(pwd):/app" --entrypoint=/bin/bash 4de139ba6ced 
    +
    +# Within the container, start R and load CAMDAC files
    +R
    +devtools::load_all()
    +
    docker buildx build --platform linux/amd64 -t nmensah5/camdac:latest .
    +docker run -it -v "$(pwd):/app" nmensah5/camdac-env:latest bash
    +
    + + +
    + + + +
    + + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/LICENSE-text.html b/docs/html/LICENSE-text.html new file mode 100644 index 0000000..bddae3a --- /dev/null +++ b/docs/html/LICENSE-text.html @@ -0,0 +1,123 @@ + +License • CAMDAC + + +
    +
    + + + +
    +
    + + +
    # MIT License
    +
    +Copyright (c) 2020 CAMDAC
    +
    +Permission is hereby granted, free of charge, to any person obtaining a copy
    +of this software and associated documentation files (the "Software"), to deal
    +in the Software without restriction, including without limitation the rights
    +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    +copies of the Software, and to permit persons to whom the Software is
    +furnished to do so, subject to the following conditions:
    +
    +The above copyright notice and this permission notice shall be included in all
    +copies or substantial portions of the Software.
    +
    +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    +SOFTWARE.
    +
    + +
    + + + +
    + + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/articles/contributing.html b/docs/html/articles/contributing.html new file mode 100644 index 0000000..ba72f72 --- /dev/null +++ b/docs/html/articles/contributing.html @@ -0,0 +1,165 @@ + + + + + + + +Contributing • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +

    To contribute to CAMDAC, fork the repository and install the development dependencies with remotes::install_dev_deps('.').

    +

    After making your changes, run the test and build commands listed below, then submit a pull request with the changes on your fork.

    +
    +

    CAMDAC test and build commands +

    +
    +library(devtools)
    +
    +# Install dev dependencies
    +devtools::install_dev_deps("VanLoo-lab/CAMDAC")
    +
    +# Update docs
    +devtools::document()
    +
    +# Run tests
    +devtools::test()
    +
    +# Build readme
    +rmarkdown::render('README.Rmd', output_format='github_document', output_file='README.md')
    +
    +# Check package builds
    +devtools::check()
    +
    +# Build documentation
    +pkgdown::build_site(examples=FALSE, devel=TRUE, lazy=TRUE, preview=FALSE)
    +pkgdown::preview_site() # To view. Or: python3 -m http.server --directory docs 8000
    +
    +# Commit changes on the docs/ folder before submitting 
    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/articles/contributing_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/html/articles/contributing_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/html/articles/contributing_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/html/articles/experimental.html b/docs/html/articles/experimental.html new file mode 100644 index 0000000..962d3db --- /dev/null +++ b/docs/html/articles/experimental.html @@ -0,0 +1,641 @@ + + + + + + + +Experimental Features • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +

    This document describes experimental features of the CAMDAC package. These features are not yet fully tested and may change in future releases. The following features are currently under development for the WGBS pipeline only:

    +
      +
    • Deconvolution only
    • +
    • Using external copy number solutions
    • +
    • Copy number calling in tumor-only mode
    • +
    • Allele-specific methylation analysis
    • +
    • Normal DNA methylation panels
    • +
    • DMR visualisation
    • +
    +
    +

    Deconvolution only +

    +

    The CAMDAC equation can be used to infer pure tumour DNA methylation rates, provided the following information is available per CpG:

    +
      +
    • Bulk tumour methylation rate (CpG-wise)
    • +
    • Tumour allele-specific copy number state (local region overlapping CpG)
    • +
    • Tumour purity (single parameter per-sample)
    • +
    +

    Here is an example for 5 CpGs from a single sample. Note: the normal copy number state is assumed diploid (2) in humans:

    +
    +
    +# Set parameters
    +bulk = c(0.3, 0.5, 0.2, 0.1, 0.9)
    +normal = c(0.3, 0.9, 0.1, 0.7, 0.5)
    +ploidy = c(2, 2, 1, 3, 4)
    +purity = 0.8
    +
    +# Deconvolve methylation rates
    +pure_meth = CAMDAC:::calculate_mt(bulk, normal, purity, ploidy)
    +
    +# Set clean rates based on threshold
    +pure_meth_clean = dplyr::case_when(
    +  pure_meth < 0 ~ 0,
    +  pure_meth > 1 ~ 1,
    +  TRUE ~ pure_meth
    +)
    +

    After deconvolution, it may be useful to estimate the CpG coverage in the deconvolved tumour sample. Additionally, the highest density interval (HDI) of the methylation rate may be informative for quality control. These metrics can be calculated given additional information on bulk methylated and unmethylated read counts:

    +
    +
    +# Optional: calculate effective coverage of the tumour
    +# # Requires coverage per CpG in the bulk sample
    +bulk_coverage = c(10, 20, 5, 15, 30)
    +pure_effective_coverage = CAMDAC:::calculate_mt_cov(bulk_coverage, purity, ploidy)
    +
    +# Optional: calculate the HDI of the pure tumour methylation rate
    +bulk_methylated_count = c(3, 10, 1, 2, 27)
    +bulk_unmethylated_count = c(7, 10, 4, 13, 3)
    +normal_methylated_count = c(3, 9, 1, 5, 2)
    +normal_unmethylated_count = c(7, 11, 3, 8, 3)
    +
    +# HDI function (fast)
    +CAMDAC:::hdi_norm_approx(
    +  bulk_methylated_count,
    +  bulk_unmethylated_count,
    +  normal_methylated_count,
    +  normal_unmethylated_count,
    +  purity,
    +  ploidy
    +)
    +
    +# HDI function (most accurate)
    +CAMDAC:::vec_HDIofMCMC_mt( 
    +  bulk_methylated_count,
    +  bulk_unmethylated_count,
    +  normal_methylated_count,
    +  normal_unmethylated_count,
    +  purity,
    +  ploidy,
    +  credMass=0.99
    +)
    +
    +
    +

    Using external copy number solutions +

    +

    The germline sample is optional because, in the absence of patient-matched methylation data, you may already have an allele-specific CNA solution for your bulk tumor. For example, this could be derived from bulk WGS of the same sample.

    +

    You can provide this data in a tab-delimited text file as shown below. Importantly:

    +
      +
    • column names are optional
    • +
    • purity and ploidy values are taken from the first data row alone
    • +
    • chromosome names may be given with or without ‘chr’ prefix
    • +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    chrom | start | end | major_cn | minor_cn | purity | ploidy
    chr1 | 1 | 400 | 2 | 1 | 0.67 | 3.5
    chr1 | 401 | 1000 | 1 | 1 | 0.67 | 3.5
    +

    To run CAMDAC with this CNA solution, attach the file to the tumor CamSample() object:

    +
    +library(CAMDAC)
    +
    +# Load test data
    +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
    +b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC")
    +cna_file <- system.file("testdata", "test.cna.txt", package = "CAMDAC")
    +
    +# Set config
    +config <- CamConfig(outdir="./results", bsseq="wgbs", lib="pe", build="hg38", n_cores=10)
    +
    +# Create tumor object and attach CNA solution
    +tumor <- CamSample(id="T", sex="XY", bam=b_tumor)
    +attach_output(tumor, config, "cna", cna_file)
    +
    +# Define normal object(s) for deconvolution or differential methylation
    +normal <- CamSample(id="N", sex="XY", bam=b_normal)
    +
    +# Run pipeline with CNA solution
    +pipeline(
    +    tumor=tumor,
    +    germline=NULL,
    +    infiltrates=normal,
    +    origin=normal,
    +    config=config
    +)
    +
    +
    +

    Copy number calling in tumor-only mode +

    +

    If no SNP file is present for the germline, CAMDAC will infer the copy number calls from the tumor sample alone. Here, the BAF is calculated by a threshold on the tumor BAF, and the LogR is calculated by taking the coverage relative to the median. These results are not as accurate as using a germline normal sample.

    +

    You may already know where heterozygous SNPs lie for your sample, obviating the need for a tumor BAF threshold. In addition, you may have a proxy of the normal coverage for your platform, which is an improvement over taking the tumor median. You can provide this information by attaching a SNPs file to the germline CamSample object. The file should contain:

    + + + + + + + + + + + + + + + + + + + + + + + +
    Field | Description
    chrom | Chromosome name
    POS | Position of SNP
    BAF | (optional) B-allele frequency at this SNP
    total_counts | (optional) Total number of reads at this SNP
    +

    POS and total_counts are used to derive the BAF and the LogR respectively. We strongly recommend that total_counts is derived from a normal sample sequenced with the same bisulfite-sequencing assay as the tumor; samples from unmatched patients are acceptable.

    +

    CAMDAC may be run to the copy number calling stage using the external heterozygous SNP file:

    +
    +library(CAMDAC)
    +
    +# Load test data
    +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
    +snps_file <- system.file("testdata", "test.to.norm_pos.csv.gz", package = "CAMDAC")
    +
    +# Set config
    +config <- CamConfig(outdir="./results", bsseq="wgbs", lib="pe", build="hg38", n_cores=10)
    +
    +# Create tumor object and attach CNA solution
    +tumor <- CamSample(id="T", sex="XY", bam=b_tumor)
    +attach_output(tumor, config, "cna", cna_file)
    +
    +# Define germline object and attach the external heterozygous SNPs file
    +germline <- CamSample(id="G", sex="XY")
    +attach_output(germline, config, "snps", snps_file)
    +
    +# Run pipeline up to the copy number calling stage (no infiltrates or origin)
    +pipeline(
    +    tumor=tumor,
    +    germline=germline,
    +    infiltrates=NULL,
    +    origin=NULL,
    +    config=config
    +)
    +

    After this, we recommend inspecting the CNA results. If all is well, the pipeline() function can be repeated with the infiltrates and origin CamSamples to complete deconvolution and differential methylation respectively.

    +
    +
    +

    Allele-specific methylation (ASM) analysis +

    +

    CAMDAC can be used to detect allele-specific methylation (ASM) by phasing CpGs to heterozygous SNPs and deconvolving bulk methylation rates per allele.

    +

    This tutorial steps through the ASM analysis pipeline (WGBS only):

    +
      +
    1. Count CpG methylation on tumor and normal at sites phased to SNP loci.
    2. +
    3. Deconvolve methylation on tumor per haplotype using the normal
    4. +
    5. Assign allele-specific copy number state per CpG using the bulk tumor solution
    6. +
    7. Call allele-specific differential methylation within samples
    8. +
    9. Call allele-specific differential methylation between samples
    10. +
    +

    Results from this pipeline are found in the results directory under ‘PATIENT/AlleleSpecific’ and ‘PATIENT/Methylation’. See output file headings below for files and their content.

    +
    +

    CAMDAC-ASM from BAM files +

    +

    The asm_pipeline() function runs CAMDAC-ASM analysis by generating the allele-specific copy number solution and heterozygous SNP loci, followed by deconvolution and differential ASM analysis:

    +
    +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
    +b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC")
    +regions <- system.file("testdata", "test_wgbs_segments.bed", package = "CAMDAC") # speed up tests
    +
    +tumor <- CamSample(id = "T", sex = "XY", bam = b_tumor)
    +normal <- CamSample(id = "N", sex = "XY", bam = b_normal)
    +config <- CamConfig(
    +  outdir = "./results", ref = "./pipeline_files", bsseq = "wgbs", lib = "pe", cores = 10,
    +  min_cov = 1, # For test data
    +  regions = regions
    +)
    +
    +asm_pipeline(
    +  tumor = tumor,
    +  germline = normal,
    +  infiltrates = normal,
    +  origin = normal,
    +  config = config
    +)
    +
    +
    +

    CAMDAC-ASM from external inputs (in_development) +

    +

    To run the ASM pipeline with externally supplied SNP loci and copy number calls (rather than deriving them from the BAM files), CAMDAC requires that: (1) each CamSample object has SNP loci; (2) the tumor CamSample object has an allele-specific CNA solution; and (3) all CamSample objects have BAM files available for phasing.

    +

    CAMDAC-ASM requires a file of heterozygous SNP loci against which CpGs will be phased. This is a tab-delimited file with a header containing four fields:

    + + + + + + + + + + + + + + + + + + + + + + + +
    Field | Description
    chrom | Chromosome name
    pos | SNP loci position
    ref | The reference allele (A/C/T/G)
    alt | The alternate SNP allele (A/C/T/G)
    +

    First, attach your SNP loci file to the tumor and normal objects with attach_output():

    +
    +# Setup CAMDAC samples
    +tumor <- CamSample(id = "tumor", sex = "XY", bam = b_tumor)
    +normal <- CamSample(id = "normal", sex = "XY", bam = b_normal)
    +config <- CamConfig(
    +  outdir = "./results", ref = "./pipeline_files", bsseq = "wgbs", lib = "pe", cores = 10,
    +  min_cov = 1, # For test data
    +  regions = regions
    +) # For rapid testing
    +
    +# Add SNPs
    +asm_snps_file <- system.file("testdata", "test_het_snps.tsv", package = "CAMDAC")
    +attach_output(tumor, config, "asm_snps", asm_snps_file)
    +attach_output(normal, config, "asm_snps", asm_snps_file)
    +

    Next, CAMDAC requires the allele-specific copy number solution from the tumor, attached as follows:

    +
    +cna_file <- system.file("testdata", "test_cna.tsv", package = "CAMDAC")
    +attach_output(tumor, config, "cna", cna_file)
    +

    Finally, run the allele-specific methylation pipeline:

    +
    +asm_pipeline(
    +  tumor = tumor,
    +  infiltrates = normal,
    +  origin = normal,
    +  config = config
    +)
    +
    +
    +

    CAMDAC-ASM using SNP calls from previous CAMDAC runs +

    +

    If you have already run the CAMDAC pipeline in tumor-normal mode, then the germline object’s SNP files will be used by default. The simplest run from BAM to ASM is shown below using matched normals for infiltrates and DMPs:

    +
    +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
    +b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC")
    +regions <- system.file("testdata", "test_wgbs_segments.bed", package = "CAMDAC") # speed up tests
    +
    +tumor <- CamSample(id = "T", sex = "XY", bam = b_tumor)
    +normal <- CamSample(id = "N", sex = "XY", bam = b_normal)
    +config <- CamConfig(
    +  outdir = "./test_results", bsseq = "wgbs", lib = "pe",
    +  build = "hg38", n_cores = 10,
    +  regions = regions,
    +  min_cov = 1, # For test data
    +  cna_caller = "ascat" # Battenberg always recommended, however ASCAT used here for rapid testing.
    +)
    +
    +# Run the main CAMDAC pipeline to generate SNP files for ASM
    +# Deconvolution skipped here for simplicity.
    +pipeline(tumor, germline = normal, infiltrates = NULL, origin = NULL, config)
    +
    +# Run ASM pipeline
    +asm_pipeline(
    +  tumor = tumor,
    +  germline = normal,
    +  infiltrates = normal,
    +  origin = normal,
    +  config = config
    +)
    +
    +
    +

    ASM output file headings +

    +

    ** Allele-specific/ **

    +
      +
    • *asm_counts.csv.gz - The number of reads supporting each allele at each CpG
    • +
    • *asm_hap_stats.csv.gz - Summary statistics for each phased SNP
    • +
    • *asm_phase_map.csv.gz - A mapping of CpG-SNP phased pairs per read
    • +
    • *snps.txt - The heterozygous SNP loci input for ASM analysis
    • +
    • *cna.csv - For the tumour, the allele-specific copy number profile. See format in vignette("pipeline").
    • +
    +

    ** Methylation/ **

    +
      +
    • *asm_meth.csv.gz - Allele-specific methylation rates for bulk samples
    • +
    • *asm_ss_dmp.csv.gz - Single sample differential allele-specific methylation
    • +
    • *asm_meth_cna.csv.gz - For the tumour, ASM rates with annotated copy number states
    • +
    • *asm_meth_pure.csv.gz - For the tumour, pure methylation rates for each allele
    • +
    • *asm_dmp.csv.gz - Differential allele-specific methylation between tumor and origin sample
    • +
    +
    +
    +
    +

    Normal DNA methylation panels +

    +

    This feature is currently described for CAMDAC-WGBS only.

    +
    +

    Create a methylation panel from multiple normal BAM files +

    +

    CAMDAC supports the use of multiple DNA methylation BAM files as a source of the normal infiltrates or normal cell of origin.

    +

    To create a panel, process your BAM files with the CAMDAC allele counter:

    +
    library(CAMDAC)
    +
    +# Get BAM files
    +b_normal1 = system.file("inst/testdata/normal.bam")
    +b_normal2 = system.file("inst/testdata/normal.bam")
    +b_normal3 = system.file("inst/testdata/normal.bam")
    +
    +# Run allele counter
    +for(file in c(b_normal1, b_normal2, b_normal3)){
    +    prefix = fs::path_ext_remove(file)
    +    outfile = paste0(prefix, ".all.SNPs.CG.csv.gz")
    +    data = cmain_count_alleles(bam_file)
    +    data.table::fwrite(data, outfile)
    +}
    +

    The allele counts files can then be merged into a single file for the panel containing methylation data for deconvolution:

    +
    +panel_counts <- fs::dir_ls(".", glob="*.SNPs.CG.csv.gz")
    +panel <- panel_meth_from_counts(panel_counts)
    +data.table::fwrite(panel, "panel.m.csv.gz")
    +

    By default, panel counts are merged by summing the methylation read counts for each CpG site. You can customise the proportion of each sample that is used in the panel by specifying the ac_props argument in panel_meth_from_counts. To get the mean across each CpG site, simply pass equal proportions for each sample.

    +

    To run CAMDAC with your newly created panel, attach your panel to a CamSample object using the meth argument.

    +
    +# Load test data
    +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC")
    +b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC")
    +
    +# Setup CAMDAC samples
    +tumor <- CamSample(id="tumor", sex="XY", bam=b_tumor)
    +normal <- CamSample(id="normal", sex="XY", bam=b_normal)
    +config <- CamConfig(outdir="./results", ref="./pipeline_files", bsseq="wgbs", lib="pe", cores=10)
    +
    +# Setup panel sample
    +panel <- CamSample(id="panel", sex="XY")
    +panel_file <- system.file("testdata", "test_panel.m.csv.gz", package = "CAMDAC")
    +attach_output(panel, config, "meth", panel_file)
    +
    +# Run CAMDAC with panel
    +pipeline(
    +    tumor=tumor,
    +    germline=normal,
    +    infiltrates=panel,
    +    origin=panel,
    +    config=config
    +)
    +
    +
    +

    Create a methylation panel from a matrix of beta values +

    +

    If you have not started from BAM files, you can create a panel using a matrix of beta values:

    + + + + + + + + + + + + + + + + + + +
    sample1 | sample2 | sample3
    0.5 | 0.6 | 0.7
    0.4 | 0.5 | 0.6
    +

    Additionally, a data frame specifying the positions of each CpG site in the beta value matrix is required. Here, start and end refer to the C and G of the CpG site respectively:

    + + + + + + + + + + + + + + + + + + +
    chrom | start | end
    chr1 | 100 | 101
    chr1 | 200 | 201
    +

    The matrix and CpG locations can be passed directly to the panel_meth_from_beta() function, along with settings.

    +
    +# Load beta values and chromosome positions
    +ex <- system.file("testdata", "test_panel_from_beta.csv", package = "CAMDAC")
    +data <- data.table::fread(ex)
    +mat = data[, 4:ncol(data)] # Beta value matrix with 3 samples
    +
    +# Create panel from beta values
    +panel_beta <- panel_meth_from_beta(
    +  mat = mat,
    +  chrom = data$chrom,
    +  start = data$start,
    +  end = data$end, 
    +  cov = 100,
    +  props = c(0.1, 0.8, 0.1), # Proportions of each sample in panel
    +  min_samples = 1,
    +  max_sd = 1
    +)
    +

    As CAMDAC requires coverage at each CpG site to estimate uncertainty, the cov value is given to all CpG sites when building a panel from beta values. Additionally, if any beta values are missing from a sample, proportions are recalculated among the remaining samples as this is the only information available to build the panel for that site.

    +

    There are two experimental arguments that can be set to filter CpG sites from the panel:

    +
      +
    • min_samples: The minimum number of samples that have to have a beta value for a CpG to be included in the panel. The idea here is if you have sparse data, you can skip sites where you aren’t confident in the panel. Set this to 1 to use any sample.

    • +
    • max_sd: Maximum standard deviation of beta values across samples a CpG must have to be included in the panel. The idea here is that when combining many bulk methylomes from the same tissue, sites with high variability reflect sample-specific differences and their averages are less reliable for use in a methylation panel.

    • +
    +
    +
    +
    +

    DMR visualisation +

    +

    CAMDAC produces several output files that visualise the copy number state. DNA methylation rates can be passed to external packages for visualisation. For a quick view of DMRs in R:

    +
    +library(data.table)
    +library(ggplot2)
    +library(CAMDAC)
    +
    +# Show DMPs around a region
    +dmr <- data.table(dmr) # Object from CAMDAC output *annotated_DMRs.fst
    +dmp <- data.table(dmp) # Object from CAMDAC *results_per_CpG.fst
    +chrome <- dmr[1, ]$chrom
    +starte <- dmr[1, ]$start
    +ende <- dmr[1, ]$end
    +offset <- 1000 # Offset 1kB either side of region
    +dmp <- data.table(dmp)
    +dm_regions <- dmp[chrom == as.character(chrome) & start >= (starte - offset) & end <= (ende + offset), ]
    +
    +# Using ggplot, plot the m_t and m_n values as points across the region
    +tplt <- ggplot(dm_regions, aes(x = start)) +
    +  geom_point(aes(y = m_t), color = "skyblue") +
    +  geom_point(aes(y = m_n), color = "grey") +
    +  geom_vline(aes(xintercept = start, color = DMP_t)) +
    +  theme_classic() +
    +  scale_color_manual(values = c("skyblue", "blue")) +
    +  scale_y_continuous(limits = c(0, 1)) +
    +  geom_vline(xintercept = c(start, end), color = "red", linetype = "dashed") +
    +  labs(x = dm_regions$chrom[[1]])
    +tplt
    +
    +

    CAMDAC DMR Visualization

    +
    +

    Here, light blue dots are the pure tumour, while grey dots are the normal. The red dashed lines mark the DMR boundaries, and the vertical lines are hypomethylated DMPs (blue) and hypermethylated DMPs (light blue).

    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/articles/experimental_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/html/articles/experimental_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/html/articles/experimental_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/html/articles/images/CAMDAC_manual_DMR_summary_plots.png b/docs/html/articles/images/CAMDAC_manual_DMR_summary_plots.png new file mode 100644 index 0000000..b359f72 Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_DMR_summary_plots.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_SNP_data.png b/docs/html/articles/images/CAMDAC_manual_SNP_data.png new file mode 100644 index 0000000..22c7423 Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_SNP_data.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_fig1.png b/docs/html/articles/images/CAMDAC_manual_fig1.png new file mode 100644 index 0000000..1f3e4e8 Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_fig1.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_fig2.png b/docs/html/articles/images/CAMDAC_manual_fig2.png new file mode 100644 index 0000000..872b72f Binary files /dev/null and 
b/docs/html/articles/images/CAMDAC_manual_fig2.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_formatted_allele_counts_output.png b/docs/html/articles/images/CAMDAC_manual_formatted_allele_counts_output.png new file mode 100644 index 0000000..20104bd Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_formatted_allele_counts_output.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_fragment_length_histogram.png b/docs/html/articles/images/CAMDAC_manual_fragment_length_histogram.png new file mode 100644 index 0000000..431c52b Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_fragment_length_histogram.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_normal_SNP_data.png b/docs/html/articles/images/CAMDAC_manual_normal_SNP_data.png new file mode 100644 index 0000000..48926b4 Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_normal_SNP_data.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_normal_methylation_output.png b/docs/html/articles/images/CAMDAC_manual_normal_methylation_output.png new file mode 100644 index 0000000..113023d Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_normal_methylation_output.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_normal_methylation_rate_summary.png b/docs/html/articles/images/CAMDAC_manual_normal_methylation_rate_summary.png new file mode 100644 index 0000000..77c21e8 Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_normal_methylation_rate_summary.png differ diff --git a/docs/html/articles/images/CAMDAC_manual_tumour_versus_normal_methylomes.png b/docs/html/articles/images/CAMDAC_manual_tumour_versus_normal_methylomes.png new file mode 100644 index 0000000..71ed4d9 Binary files /dev/null and b/docs/html/articles/images/CAMDAC_manual_tumour_versus_normal_methylomes.png differ diff --git a/docs/html/articles/images/camdac_dmr_vis.png 
b/docs/html/articles/images/camdac_dmr_vis.png new file mode 100644 index 0000000..b64ad76 Binary files /dev/null and b/docs/html/articles/images/camdac_dmr_vis.png differ diff --git a/docs/html/articles/index.html b/docs/html/articles/index.html new file mode 100644 index 0000000..bd57d07 --- /dev/null +++ b/docs/html/articles/index.html @@ -0,0 +1,125 @@ + +Articles • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Pipeline

    +

    + +
    Introduction
    +
    +
    Installation
    +
    +
    CAMDAC pipeline
    +
    +
    Results
    +
    +
    +
    +

    Experimental

    +

    + +
    Experimental Features
    +
    +
    +
    +

    Extra

    +

    + +
    FAQs
    +
    +
    Technical Note
    +
    +
    Contributing
    +
    +
    +
    +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/articles/introduction.html b/docs/html/articles/introduction.html new file mode 100644 index 0000000..10aaaef --- /dev/null +++ b/docs/html/articles/introduction.html @@ -0,0 +1,154 @@ + + + + + + + +Introduction • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +
    +

    Introduction +

    +

    Solid tumours typically contain both cancer and admixed normal contaminating cells, which confounds the analysis of bulk cancer methylomes from bisulfite sequencing. To address these issues we present CAMDAC, a tool for Copy-number Aware Methylation Deconvolution Analysis of Cancer.

    +

    In brief, we show that the bulk tumour methylation rate (\(m_b\)) can be expressed as a weighted sum of the methylation rates of the tumour cells and normal contaminants, accounting for tumour purity and copy number (Figure 1). We derive purity and copy number estimates directly from bulk tumour RRBS data, leveraging somatic copy number aberration calls from ASCAT or Battenberg. We use bulk tissue- and sex-matched normal samples as proxy for the normal tumour-infiltrating cells (\(m_{n,i}\)), and obtain \(m_b\) from the bulk tumour data itself. This provides all the necessary information to extract the pure tumour methylation rate (\(m_t\)).

    +
    +
    +

    Figure 1. CAMDAC principles and key variables. Adapted from Larose Cadieux et al., 2020.

    +
    +
    +


    In Larose Cadieux et al., 2020, we obtained bulk tumour RRBS data from surgically resected lung cancers and patient-matched tumour-adjacent normal lung samples. Normal samples may be used for copy number profiling, as a proxy for the normal tumour-infiltrating cells (\(m_{n,i}\)), and as a proxy for the tumour cell of origin (\(m_{n,o}\)). Here, \(m_{n,i}\) is needed for bulk tumour methylation rate deconvolution and \(m_{n,o}\) is required for differential methylation analyses (Figure 2). In non-small cell lung cancer, we demonstrate that patient-matched tumour-adjacent normal is a suitable proxy for all normals, i.e. \(m_{n,i} \approx m_{n,o}\) (Larose Cadieux et al., 2020).

    +
    +
    +

    Figure 2. Key input and output data for CAMDAC

    +
    +
    +


    If the patient-matched tumour-adjacent normal tissue is not available, a tissue- and sex-matched normal may provide a substitute for the tumour-infiltrating normal cells (Figure 2). If the tissue-matched normal is a poor representative of the cell of origin, a different proxy may be used for differential methylation analysis.

    +

    The purified tumour methylation rates allow for accurate differential methylation analysis, both between tumour and normal cells and, in the case of multi-region sequencing, between different tumour samples. The deconvoluted methylation profiles accurately inform inter- and intra-tumour sample relationships and could enable the timing of copy number gains and (epi)mutations in tumour evolution. This is explained in more detail in Larose Cadieux et al., 2020.

    +

    At the time of writing, CAMDAC is compatible with human MspI-digested single-end directional reduced representation bisulfite sequencing (RRBS) data and whole genome bisulfite sequencing (WGBS) data. The input must be in binary alignment map (BAM) format. Bases should be quality and adapter trimmed and PCR duplicates should be removed. BAM files may be aligned to the hg19, hg38, GRCh37 or GRCh38 human reference genome builds.

    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/html/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/html/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/html/articles/output.html b/docs/html/articles/output.html new file mode 100644 index 0000000..220f0d8 --- /dev/null +++ b/docs/html/articles/output.html @@ -0,0 +1,298 @@ + + + + + + + +Results • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +

    The CAMDAC pipeline returns a structured directory at the outdir from the CamConfig() object. The pipeline returns files unique to the RRBS and WGBS modules with the general structure:

    +
    └── <CamSample.patient_id>
    +    ├── Allelecounts
    +    │   ├── <CamSample.id>
    +    ├── Copynumber
    +    │   ├── <CamSample.id>
    +    └── Methylation
    +        └── <CamSample.id>
    +

    The sections below describe each results file in more detail. Next, see vignette("questions") for frequently asked questions or vignette("experimental") for details on experimental CAMDAC features.

    +
    +

    RRBS pipeline output +

    +
    results/              
    +└── P                                           
    +    ├── Allelecounts
    +    │   ├── N                  
    +    │   │   └── P.N.SNPs.CpGs.all.sorted.RData
    +    │   └── T                               
    +    │       └── P.T.SNPs.CpGs.all.sorted.RData
    +    ├── Copy_number                             
    +    │   ├── N                        
    +    │   │   ├── fragment_length_histogram.pdf      
    +    │   │   ├── msp1_fragments_RRBS.RData
    +    │   │   ├── P_N_normal_SNP_data.pdf
    +    │   │   ├── P.N.SNPs.RData
    +    │   │   └── Rplots.pdf
    +    │   └── T
    +    │       ├── fragment_length_histogram.pdf
    +    │       ├── msp1_fragments_RRBS.RData
    +    │       ├── P_T_SNP_data.pdf
    +    │       ├── P.T.ACF.and.ploidy.txt
    +    │       ├── P.T.ascat.bc.RData
    +    │       ├── P.T.ascat.frag.RData
    +    │       ├── P.T.ascat.output.RData
    +    │       ├── P.T.ASCATprofile.png
    +    │       ├── P.T.ASPCF.png
    +    │       ├── P.T.BAF.PCFed.txt
    +    │       ├── P.T.germline.png
    +    │       ├── P.T.LogR.PCFed.txt
    +    │       ├── P.T.rawprofile.png
    +    │       ├── P.T.SNPs.RData
    +    │       ├── P.T.sunrise.png
    +    │       ├── P.T.tumour.png
    +    │       └── Rplots.pdf
    +    └── Methylation
    +      ├── N
    +      │   ├── dt_normal_m.RData
    +      │   └── P_N_methylation_rate_summary.pdf
    +      └── T
    +          ├── CAMDAC_DMPs.bed
    +          ├── CAMDAC_purified_tumour.bed
    +          ├── CAMDAC_results_per_CpG.RData
    +          ├── P_T_DMP_stats.txt
    +          ├── P_T_methylation_rate_summary.pdf
    +          ├── purified_tumour.RData
    +          └── tumour_versus_normal_methylomes.pdf  
    + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    File | Description
    P.T.SNPs.CpGs.all.sorted.RData | Allele counts for a sample. Generated by processing BAM file
    P.T.ascat.output.RData | ASCAT copy number results
    P.T.ASCATprofile.png | ASCAT copy number profile
    dt_normal_m.RData | Bulk normal DNA methylation data
    purified_tumour.RData | CAMDAC-purified DNA methylation rates
    CAMDAC_results_per_CpG.fst | CAMDAC deconvolution and differential methylation results
    +
    +
    +

    WGBS pipeline output +

    +

    CAMDAC outputs are written in the directory given by config$outdir in the format PATIENT/DATASET/SAMPLE/:

    +
    └── P
    +    ├── Allelecounts
    +    │   ├── N
    +    │   │   └── P.N.SNPs.CpGs.all.sorted.csv.gz
    +    │   └── T
    +    │       └── P.T.SNPs.CpGs.all.sorted.csv.gz
    +    ├── Copynumber
    +    │   ├── N
    +    │   │   └── P.N.SNPs.csv.gz
    +    │   └── T
    +    │       ├── ascat
    +    │       ├── battenberg
    +    │       ├── P.T.cna.txt
    +    │       ├── P.T.SNPs.csv.gz
    +    │       └── P.T.tnSNP.csv.gz
    +    └── Methylation
    +        ├── N
    +        │   └── P.N.m.csv.gz
    +        └── T
    +            ├── P.T.CAMDAC_annotated_DMRs.fst
    +            ├── P.T.CAMDAC_results_per_CpG.fst
    +            ├── P.T.m.csv.gz
    +            └── P.T.pure.csv.gz
    + ++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FileDescription
    P.T.SNPs.CpGs.all.sorted.csv.gzAllele counts for a sample. Generated by processing BAM file
    P.T.SNPs.csv.gzSNP counts for a sample.
    P.T.cna.txtCAMDAC CNA result
    P.T.m.csv.gzBulk methylation data
    P.T.m.pure.csv.gzCAMDAC-deconvolved methylation data
    P.T.CAMDAC_results_per_CpG.fstCAMDAC differentially methylated cytosines
    P.T.CAMDAC_annotated_DMRs.fstCAMDAC differentially methylated regions
    +

    It is possible to manually override outputs for runs. See vignette("questions") for more details.

    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/articles/output_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/html/articles/output_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/html/articles/output_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/html/articles/pipeline.html b/docs/html/articles/pipeline.html new file mode 100644 index 0000000..9db662a --- /dev/null +++ b/docs/html/articles/pipeline.html @@ -0,0 +1,168 @@ + + + + + + + +CAMDAC pipeline • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +

    The entry-point to CAMDAC is the pipeline() function which expects a CamConfig() object and four CamSample() objects representing:

    +
      +
    • +tumor : The bulk tumor sample to deconvolve
    • +
    • +germline : The germline normal data for copy number calling
    • +
    • +infiltrates : A proxy for the normal infiltrating cells
    • +
    • +origin : A proxy for the normal cell from which the tumour originated
    • +
    +

    The same normal sample may be passed repeatedly for the germline, infiltrates or origin, depending on your experimental design. See ?pipeline for more details.

    +
    +library(CAMDAC)
    +
    +# Path to BAM files
    +tumor_bam <- system.file("testdata", "tumor.bam", package = "CAMDAC")
    +normal_bam <- system.file("testdata", "normal.bam", package = "CAMDAC")
    +
    +# Select samples for basic tumor-normal analysis
    +tumor <- CamSample(id = "T", sex = "XY", bam = tumor_bam)
    +normal <- CamSample(id = "N", sex = "XY", bam = normal_bam)
    +
    +# Configure pipeline
    +config <- CamConfig(
    +  outdir = "./results", bsseq = "rrbs", lib = "pe",
    +  build = "hg38", refs = "./refs", n_cores = 1, cna_caller = 'ascat'
    +)
    +
    +# Run CAMDAC
    +CAMDAC::pipeline(
    +  tumor, germline = normal, infiltrates = normal, origin = normal, config
    +)
    +

    Next, see vignette("output") for a detailed summary of CAMDAC results files.

    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/articles/questions.html b/docs/html/articles/questions.html new file mode 100644 index 0000000..b39fbb7 --- /dev/null +++ b/docs/html/articles/questions.html @@ -0,0 +1,200 @@ + + + + + + + +FAQs • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +
    +

    General FAQ +

    +
    +

    What if I don’t have a CNA profile or matched germline sample? (WGBS) +

    +

    Ideally, CAMDAC is run with a matched normal sample from which to derive heterozygous germline SNPs for copy number estimation. In the absence of matched normals, a panel of sex- and tissue-matched normal samples may be used by averaging DNA methylation rates from multiple patients. See vignette("experimental") for more information.

    +
    +
    +

    I want to run CAMDAC on something other than hg19 or hg38 (WGBS) +

    +

    Please raise an issue on GitHub to request files for a new reference genome.

    +
    +
    +

    Can I skip steps of the analysis? (WGBS) +

    +

    When calling pipeline, if you do not give a normal infiltrate or cell of origin, the pipeline skips deconvolution and differential methylation, respectively. This may be useful to run a quick first-pass to find and refit copy number solutions. When CAMDAC has found a solution and is rerun with the same tumor, config, and normal, the infiltrates and origin arguments will continue the pipeline where it left off. The entire pipeline can be re-run by deleting the output directory or setting overwrite=TRUE in the CamConfig.

    +
    +
    +

    How do I run individual steps of the CAMDAC pipeline? (WGBS) +

    +

    The simplest way is to call pipeline with overwrite=FALSE in your config, giving the right normal sample for your step. Additionally, your CamConfig must use the same output directory.

    +

    If, for any reason, you have changed the output directory structure from a previous run, you can initiate CAMDAC by manually passing outputs to CamSample objects. See vignette("output") for more information.

    +

    Finally, you can run the cmain_* functions used by pipeline() directly. For example, to run the deconvolution step, you can call cmain_deconvolve_methylation().

    +
    +
    +

    My CNA solution wasn’t right. How can I refit with different purity and ploidy values? (WGBS) +

    +

    If you want to use an external purity and ploidy solution, simply pass a CNA file that has only the purity and ploidy fields. Additionally, set refit=TRUE in the CamConfig and CAMDAC will use this to refit the sample.

    +
    +
    +

    Can I limit my analysis to specific regions of interest? +

    +

    To analyse specific genomic regions, you may pass a BED file to CAMDAC config:

    +
    +CamConfig(outdir=".", ref="./pipeline_files", regions="regions.bed")
    +

    CAMDAC will merge any overlapping regions prior to analysis.

    +
    +
    +

    How can I manually replace pipeline outputs? (WGBS) +

    +

    If you have outputs from a previous run, you can manually assign them to a CAMDAC object. This overwrites the expected path for that output type, allowing the pipeline to run with this data instead of computing it. Use the attach_output function, passing one of the following output types:

    +
      +
    • +counts: CAMDAC allele counts *.SNPs.CpGs.all.sorted.csv.gz file
    • +
    • +snps: CAMDAC sample SNP counts *.SNPs.csv.gz file
    • +
    • +meth: CAMDAC bulk methylation *.m.csv.gz file
    • +
    • +cna: CAMDAC CNA *.cna.txt file
    • +
    • +pure: CAMDAC deconvolved methylation *.m.pure.csv.gz file
    • +
    +

    For example, to attach a previous counts file to a CAMDAC object:

    +
    +library(CAMDAC)
    +tumor <- CamSample(id = "T", sex = "XY", bam = NULL)
    +config <- CamConfig(outdir = tempdir(), build="hg38", bsseq="wgbs", lib="pe")
    +counts_file <- system.file("testdata", "test.SNPs.CpGs.all.sorted.csv.gz", package = "CAMDAC")
    +tumor <- attach_output(tumor, config, "counts", counts_file)
    +

    The CAMDAC pipeline can now access the file in the expected location at config$outdir.

    +
    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/articles/questions_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/html/articles/questions_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/html/articles/questions_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/html/articles/setup.html b/docs/html/articles/setup.html new file mode 100644 index 0000000..a3ea413 --- /dev/null +++ b/docs/html/articles/setup.html @@ -0,0 +1,181 @@ + + + + + + + +Installation • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +
    +

    Install CAMDAC +

    +

    From the R console, install CAMDAC from github:

    +
    +install.packages("remotes")
    +remotes::install_github("VanLoo-lab/CAMDAC")
    +
    +
    +

    Download pipeline reference files +

    +

    CAMDAC requires custom annotation files for RRBS and WGBS analysis, available at the Zenodo repository: (10565423). An R convenience function is provided to download these files:

    +
    +CAMDAC::download_pipeline_files(bsseq = "rrbs", directory = "./refs")
    +CAMDAC::download_pipeline_files(bsseq = "wgbs", directory = "./refs")
    +

    Now, you’re ready to run CAMDAC! Next, see vignette("pipeline").

    +
    +

    Reference file search priority +

    +

    CAMDAC searches for pipeline files in the following order:

    +
      +
    1. A directory passed when creating the config object (see CamConfig())
    2. +
    3. The location defined by the environment variable CAMDAC_PIPELINE_FILES.
    4. +
    5. The current working directory
    6. +
    +

    We recommend that you set the environment variable CAMDAC_PIPELINE_FILES to the directory where you downloaded the files. This will allow CAMDAC to find the files automatically whenever you load R.

    +

    From a Unix terminal:

    +
    +

    echo "CAMDAC_PIPELINE_FILES=$(realpath ./refs)" >> ~/.Renviron

    +
    +
    +
    +
    +

    External dependencies +

    +

    CAMDAC-RRBS

    +
      +
    • None
    • +
    +

    CAMDAC WGBS

    +
      +
    • +java: To run CAMDAC on WGBS data, we leverage Battenberg which requires the java command-line utility. Download Java from https://openjdk.org/.
    • +
    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/articles/setup_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/html/articles/setup_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/html/articles/setup_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/html/articles/technical.html b/docs/html/articles/technical.html new file mode 100644 index 0000000..faab236 --- /dev/null +++ b/docs/html/articles/technical.html @@ -0,0 +1,355 @@ + + + + + + + +Technical Note • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    + + + + +


    In this section, we provide a high-level summary of the CAMDAC pipeline, which covers five key steps:

    +
      +
    1. +Allele Counting: Obtain allele counts at SNP and CpG loci.
      +
    2. +
    3. +Copy-number calling: Obtain allele-specific copy number profiles, tumour purity and SNP plot data.
    4. +
    5. +Methylation Processing: Filter, format and plot methylation data.
    6. +
    7. +Deconvolution: Deconvolve the pure tumour methylation rates from bulk tumour RRBS data.
    8. +
    9. +Differential methylation: Perform differential tumour-normal methylation analysis.
    10. +
    +

    For a full outline and validation of CAMDAC, please see Larose Cadieux et al. (2020) bioRxiv.

    +
    +

    Allele counting +

    +

    Take a hypothetical female patient with primary tumour sample ID “T1” and normal-adjacent sample ID “N1”. First, CAMDAC takes the sequencing alignment files from each sample via the CamSample() function. Users should provide the full path and file name of the RRBS or WGBS binary alignment map (.bam) file for each input sample, and use the CamConfig() object to indicate whether reads are aligned to hg19, hg38, GRCH37 or GRCH38. Bases should be quality and adapter trimmed and PCR duplicates should be removed. Please ensure that each bam file is sorted and indexed.

    +

    CAMDAC employs an allele counter module to count SNP and CpG (methylation) alleles for downstream analysis. SNP counts are performed at 1000 Genomes SNP positions, and CpG alleles are counted using dinucleotides. To speed up the computation, we leverage reference RRBS and WGBS genome files listing all genomic regions supported by the respective platforms.

    +

    By default, the read mapping quality filter is set to mq>=0 in CamConfig(). Mapping quality scores from bisulfite sequencing aligners may be biased against the alternate allele for reads with polymorphisms. Please review the mapping quality distribution of your data to determine if it is appropriate to increase this setting.

    +

    If the function is successful, a single output file is written with the suffix “SNPs.CpGs”. This file carries compiled SNP and methylation information with the following columns:

    +
    +
    +

    Figure. Formatted SNP and methylation information

    +
    +
    +

    Each row is either a CG locus (and CCGG for RRBS) and/or a 1000g SNP position. These can be distinguished by the width column. While polymorphic CG/CCGG have the same width as their non-polymorphic counterpart, they are easily identified by looking at the POS, ref, alt and other SNP-informative columns.

    +

    For each SNP locus, the 1000 Genomes genomic coordinate and the reference and alternate alleles are listed under the POS, ref and alt columns. The total_counts is the sum of alt_counts and ref_counts, which includes all informative strand-specific allele counts. For example, at \(C>T\) SNPs, only the reverse strand distinguishes between the (un)methylated reference and the alternate allele, and thus all forward read counts are excluded from the total_counts column but included in the total_depth. The SNP type column is only added to the patient-matched normal, which is used to assign SNP genotypes as either Homozygous or Heterozygous based on internal B-allele frequency (BAF) cut-offs.

    +

    M, UM, total_counts_m, and m are the counts methylated, counts unmethylated, the total counts (un)methylated and the methylation rate, respectively. Methylation rates are calculated per CG allele, meaning that at polymorphic CpGs, only the CG-forming allele counts are considered. CAMDAC methylation rates are therefore polymorphism-independent.

    +

    For CCGG loci found in RRBS, the CCGG column indicates the number of fragments with a 5’ end at this CCGG locus. This number may be 0 at polymorphic CCGG loci homozygous for the CCGG-destroying allele. Furthermore, for RRBS, MspI fragment boundaries are determined from the aligned reads and the MspI fragment size distribution is visualised for quality assessment in the file fragment_length_histogram.pdf. You should observe 3 distinct peaks in the fragment length distribution. This is characteristic of human RRBS libraries and originates from MspI containing micro-satellite repeats of distinct lengths. The MspI fragment boundaries and their GC content are saved as an .RData object and used downstream in RRBS copy number profiling.

    +
    +
    +

    Figure. MspI fragment size distribution

    +
    +
    +


    +
    +
    +

    Copy number calling +

    +

    B-allele frequencies at heterozygous SNPs are leveraged to calculate pure tumour copy number aberrations using either ASCAT.m for RRBS or Battenberg.m for WGBS. These tools are inspired by ASCAT (Van Loo et al., 2010) and Battenberg (Nik-Zainal et al., 2012). If successful, CAMDAC writes copy number output to the “Copy_number” directory.

    +

    A SNPs file lists the heterozygous SNPs selected for copy number analysis, resulting in a table where each row is a 1000g SNP position with a minimum coverage in the germline sample set by the min_normal argument. The total_counts column is the total informative read counts. For example, at C\(>\)T SNPs, only the reverse strand distinguishes between the unmethylated reference and the alternate allele and thus, forward read counts would not contribute to the total_counts and the BAF (B-allele frequency) calculation. rBAF is randomly assigned BAF or 1-BAF to remove biases against the alternate allele in downstream tumour copy number profiling. All read counts however contribute to the total_depth which is used for LogR calculation, a measure of total coverage. Genotyping is performed and assignments stored under type.

    +

    For the RRBS pipeline, we provide an experimental feature to visualise the magnitude of biases against alternate (B) alleles. The ratio of homozygous to heterozygous SNPs is depicted and any biases in coverage against the latter can be evaluated. Because RRBS is biased towards CpG-rich genomic regions, a typical RRBS sample should show a high proportion of C\(>\)T SNPs. We note that C\(>\)T and A\(>\)G germline heterozygous SNPs will have roughly half the coverage of the other 4 types of SNPs.

    +
    +
    +

    Figure Normal SNP data QC

    +
    +
    +


    +

    In addition to the above-mentioned columns, we also adjust for biases in the tumour LogR. The LogR is a normalised measure of tumour coverage used by ASCAT.m and Battenberg.m for copy number profiling together with the BAF. The covariates used for LogR correction are:

    +
      +
    • +GC_content: The GC content of fragments leads to sequencing biases, namely at the PCR amplification step.
    • +
    • +replic: The local genomic replication timing affects the number of copies present at a given locus in cells undergoing S phase.
    • +
    • +msp1_length: RRBS only. The MspI fragment length is highly variable and we observe sequencing biases against fragments at the extremes of the fragment size distribution.
    • +
    +

    Next, the standard ASCAT or Battenberg outputs are generated. All files have the dot-separated patient and sample IDs as prefix. In addition, we plot the BAF and LogR. In the BAF profiles, heterozygous SNPs are highlighted in red. The BAF and LogR tracks are then segmented by the respective tools. The segmentation is then analysed to determine the optimal tumour purity and ploidy solution via a grid search (see sunrise plot). Raw and rounded allele-specific copy number segments are provided as output png images.

    +

    Finally, the purity, ploidy, number of heterozygous and homozygous 1000g SNP positions and median tumour and normal SNP depth are saved for each tumour sample. For RRBS, summary SNP data is plotted and saved as a pdf with filename "*_SNP_data.pdf*" and may help you troubleshoot your data.

    +
    +
    +

    Figure. Tumour SNP data summary

    +
    +
    +


    +
    +
    +

    Methylation processing +

    +

    As part of the allele counting step, CAMDAC calculates bulk DNA methylation rates for each input sample. For the patient- and tissue-matched normal sample “N1”, the methylation data columns have the suffix \(x = n\), since \(m_{n,i} \sim m_{n,o}\). Where \(m_{n,i} \neq m_{n,o}\), the suffix is set to \(x = n\_i\) for the normal infiltrates and \(x = n\_o\) for the normal cell of origin proxy sample. The uncertainty on \(m_{x}\) is computed as the lower and upper boundaries of the 99% Highest Density Interval (HDI), which are stored under columns \(m_{x,low}\) and \(m_{x,high}\).

    +
      +
    • +CHR: Chromosome name with ‘chr’ prefix
    • +
    • +start: First base of CG/CCGG
    • +
    • +end: Last base of CG/CCGG
    • +
    • +M_x: Counts methylated
    • +
    • +UM_x: Counts unmethylated
    • +
    • +m_x: Methylation rate
    • +
    • +m_x_low: Lower boundary of the 99% HDI for \(m_{x}\) +
    • +
    • +m_x_high: Upper boundary of the 99% HDI for \(m_{x}\) +
    • +
    +
    +
    +

    Figure. Normal methylation output.

    +
    +
    +

    In the normal sample methylation output directory, you will find a pdf with methylation data summary and QC (RRBS only). We expect DNA methylation rates to sit near 0 and 1. CAMDAC calculates DNA methylation rates in a polymorphism-independent manner, meaning that the CG-destroying allele at a heterozygous CpG does not contribute to its methylation rate. The minimum coverage threshold applied to CpG sites is based on the CpG allele read depth, so any heterozygous SNPs present at the CG location may be removed due to insufficient coverage.

    +
    +
    +

    Figure. Normal methylation rate QC.

    +
    +
    +


    +
    +
    +

    Deconvolution +

    +

    At this stage, CAMDAC has obtained methylation rates for both the normal infiltrates and bulk tumour, as well as tumour copy number and purity estimates. The DNA methylation profile of the normal-adjacent samples may be used as a proxy for the methylation rate of tumour-infiltrating normal cells (\(m_{n,i}\)). We have all the necessary information to obtain CAMDAC pure tumour methylation rates, \(m_t\).

    +

    In the Methylation/ output directory, CpG copy number and purified tumour methylation data are written to output CSV files. Header fields include:

    +
      +
    • +nA: Major allele copy number
    • +
    • +nB: Minor allele copy number
    • +
    • +CN: Total allele copy number
    • +
    • +seg_start: Copy number segment start point
    • +
    • +seg_end: Copy number segment end point
    • +
    • +CG_CN: CpG allele total copy number (this differs from CN at polymorphic CpGs)
    • +
    • +m_t_raw: Raw CAMDAC purified tumour methylation rate
    • +
    • +m_t_corr: Corrected CAMDAC purified tumour methylation rate
    • +
    • +cov_t: CAMDAC purified tumour effective read coverage
    • +
    • +m_t_low: CAMDAC purified tumour 99% HDI lower boundary
    • +
    • +m_t_high: CAMDAC purified tumour 99% HDI upper boundary
    • +
    +

    CAMDAC-deconvoluted methylation rate can have any values between 0 and 1 while the range of bulk tumour methylation rates is driven by tumour DNA content. In the bulk tumour profiles, bi-allelic tumour-normal differentially methylated positions appear at intermediate methylation values while after purification, they form a peak near 0 or 1 for hypo- and hypermethylated positions, respectively.

    +
    +
    +

    Figure. Tumour versus normal methylation rates from before and after CAMDAC.

    +
    +
    +


    +
    +
    +

    Differential methylation +

    +

    For tumour-normal differential methylation analysis, CAMDAC expects a DNA methylation profile representing the tumour cell of origin (\(m_{n,o}\)). In this hypothetical example, we set the normal sample N1 as the cell of origin. Leveraging CAMDAC purified methylomes, we then obtain differentially methylated positions and regions.

    +

    Differential DNA methylation is detected with a minimum tumour-normal methylation rate difference (effect size, where \(\delta\beta\) >= 0.2) and a probability threshold, representing the probability that the tumour and normal beta distributions do not overlap. Both variables are used for calling differentially methylated positions (DMPs).

    +

    Next, CAMDAC builds on DMP calls to call DMRs. To identify differentially methylated regions (DMRs), we group CpGs into bins and look for clusters with at least 5 DMPs (min_DMP_counts_in_DMR=5), 4 of which must be consecutive (min_consec_DMP_in_DMR=4). After completion, this function generates a pure tumor methylation file (CAMDAC_results_per_CpG.RData for RRBS or pure.csv.gz for WGBS) in the CAMDAC methylation output directory. This R object is a combination of all CAMDAC results per CpG with DMP information included:

    +
      +
    • +cluster_id: RRBS CpG cluster
    • +
    • +chrom: Chromosome name (i.e. 1, 2, …, X)
    • +
    • +start: First base of CG/CCGG
    • +
    • +end: Last base of CG/CCGG
    • +
    • +m_n: normal methylation rate
    • +
    • +m_n_low: normal methylation rate HDI99 lower boundary
    • +
    • +m_n_high: normal methylation rate HDI99 upper boundary
    • +
    • +m_t: CAMDAC pure tumour methylation rate
    • +
    • +m_t_low: CAMDAC pure tumour methylation rate HDI99 lower boundary
    • +
    • +m_t_high: CAMDAC pure tumour methylation rate HDI99 upper boundary
    • +
    • +prob: Tumour-normal DMP probability
    • +
    • +CG_CN: CpG allele total copy number
    • +
    • +nA: Major allele copy number
    • +
    • +nB: Minor allele copy number
    • +
    • +segment: Copy number segment endpoints
    • +
    • +DMR_type: “hyper”, “hypo” or “mixed”
    • +
    • +CpG_counts: Number of CpGs in a given bin
    • +
    • +DMP_counts: Number of DMPs
    • +
    • +consec_DMPs: Number of consecutive DMPs
    • +
    • +DMR: “DMR” if differentially methylated, NA otherwise.
    • +
    • +m_diff_tn: CAMDAC-purified tumour \(-\) normal methylation rates
    • +
    • +prob_DMP: DMP probability
    • +
    • +DMP_t: DMP calls based on CAMDAC-purified tumour versus normal methylation rates
    • +
    +

    The ratio of hyper- to hypomethylated DMRs varies across genomic regions and is reflected by the tumour-normal methylation rate difference.

    +
    +
    +

    Figure. DMR summary data.

    +
    +
    +


    +
    +

    Leveraging CAMDAC outputs +

    +

    CAMDAC outputs will be stored at the user-defined project outdir variable given to the configuration (CamConfig()). A patient folder is created at this path with directory name set to patient_id. This will contain 3 subdirectories: Allelecounts, Copy_number and Methylation, with further sub-directories created for each of a given patient’s samples.

    +

    With CAMDAC differential methylation calls in hand, users may choose to look for recurrently aberrated loci across their cohort. Note that tumour-tumour DMPs can be easily identified by looking for overlap between the 99% HDIs for CAMDAC pure tumour methylation rates between samples (99% HDI \(\subseteq\) [m_t_low,m_t_high]).

    +

    Clustering analyses can also easily be performed by the user using well-established R packages such as ‘pvclust’ for hierarchical clustering with bootstrap and ‘umap’ (uniform manifold approximation and projection) for non-linear dimensionality reduction. Clustering of pure tumour methylation rates at promoter DMRs across large cohorts by ‘umap’ may reveal histology and/or sex-driven clusters as described in non-small cell lung cancer (Larose Cadieux et al., 2020).

    +

    For multi-region data, sample tree reconstruction by neighbour joining leveraging CAMDAC pure tumour methylation rates at hypermethylated DMPs in at least one sample, subset to loci confidently unmethylated in the normal cell of origin (m_n_high<0.2), can reveal inter-sample relationships, as demonstrated in non-small cell lung cancer (Larose Cadieux et al., 2020).

    +

    When running gene-set enrichment analysis (GSEA) on CAMDAC DMR calls, gene sets should be limited to those genes with promoters covered by RRBS. It may be desirable to subset DMR calls to hypermethylated promoter-associated CpG Islands given that methylation at these loci is most correlated with expression.

    +

    Users may leverage normal, deconvoluted tumour methylation rates and tumour-normal DMP calls to separate clonal mono- and bi-allelic from subclonal bi-allelic methylation changes to shed light on tumour evolutionary histories (Larose Cadieux et al., 2020). The allele-specific CAMDAC module will be made available in future releases.

    +
    +
    +

    References +

    +

    Larose Cadieux et al. (2020). Copy number-aware deconvolution of tumor-normal DNA methylation profiles, bioRxiv 2020.11.03.366252

    +
    +
    +
    + + + +
    + + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/articles/technical_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/html/articles/technical_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/html/articles/technical_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/html/authors.html b/docs/html/authors.html new file mode 100644 index 0000000..a1d20a2 --- /dev/null +++ b/docs/html/authors.html @@ -0,0 +1,142 @@ + +Authors and Citation • CAMDAC + + +
    +
    + + + +
    +
    +
    + + + +
    • +

      Elizabeth Larose Cadieux. Author, maintainer. +

      +
    • +
    • +

      Nana Mensah. Author, maintainer. +

      +
    • +
    • +

      Siqi Lai. Author. +

      +
    • +
    • +

      Carla Castignani. Author. +

      +
    • +
    • +

      Jonas Demeulemeester. Author. +

      +
    • +
    • +

      Peter Van Loo. Author, funder. +

      +
    • +
    +
    +
    +

    Citation

    + +
    +
    + + +

    Larose Cadieux E, Mensah N, Lai S, Castignani C, Demeulemeester J, Van Loo P (2025). +CAMDAC: Copy-number Aware Methylation Deconvolution and Analysis of Cancers. +R package version 0.2.0. +

    +
    @Manual{,
    +  title = {CAMDAC: Copy-number Aware Methylation Deconvolution and Analysis of Cancers},
    +  author = {Elizabeth {Larose Cadieux} and Nana Mensah and Siqi Lai and Carla Castignani and Jonas Demeulemeester and Peter {Van Loo}},
    +  year = {2025},
    +  note = {R package version 0.2.0},
    +}
    + +
    + +
    + + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/bootstrap-toc.css b/docs/html/bootstrap-toc.css new file mode 100644 index 0000000..5a85941 --- /dev/null +++ b/docs/html/bootstrap-toc.css @@ -0,0 +1,60 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ + +/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ + +/* All levels of nav */ +nav[data-toggle='toc'] .nav > li > a { + display: block; + padding: 4px 20px; + font-size: 13px; + font-weight: 500; + color: #767676; +} +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 19px; + color: #563d7c; + text-decoration: none; + background-color: transparent; + border-left: 1px solid #563d7c; +} +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 18px; + font-weight: bold; + color: #563d7c; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +/* Nav: second level (shown on .active) */ +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} +nav[data-toggle='toc'] .nav .nav > li > a { + padding-top: 1px; + padding-bottom: 1px; + padding-left: 30px; + font-size: 12px; + font-weight: normal; +} +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 29px; +} +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 28px; + font-weight: 500; +} + +/* from 
https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ +nav[data-toggle='toc'] .nav > .active > ul { + display: block; +} diff --git a/docs/html/bootstrap-toc.js b/docs/html/bootstrap-toc.js new file mode 100644 index 0000000..1cdd573 --- /dev/null +++ b/docs/html/bootstrap-toc.js @@ -0,0 +1,159 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ +(function() { + 'use strict'; + + window.Toc = { + helpers: { + // return all matching elements in the set, or their descendants + findOrFilter: function($el, selector) { + // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ + // http://stackoverflow.com/a/12731439/358804 + var $descendants = $el.find(selector); + return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); + }, + + generateUniqueIdBase: function(el) { + var text = $(el).text(); + var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); + return anchor || el.tagName.toLowerCase(); + }, + + generateUniqueId: function(el) { + var anchorBase = this.generateUniqueIdBase(el); + for (var i = 0; ; i++) { + var anchor = anchorBase; + if (i > 0) { + // add suffix + anchor += '-' + i; + } + // check if ID already exists + if (!document.getElementById(anchor)) { + return anchor; + } + } + }, + + generateAnchor: function(el) { + if (el.id) { + return el.id; + } else { + var anchor = this.generateUniqueId(el); + el.id = anchor; + return anchor; + } + }, + + createNavList: function() { + return $(''); + }, + + createChildNavList: function($parent) { + var $childList = this.createNavList(); + $parent.append($childList); + return $childList; + }, + + generateNavEl: function(anchor, text) { + var $a = $(''); + $a.attr('href', '#' + anchor); + $a.text(text); + var $li = $('
  • '); + $li.append($a); + return $li; + }, + + generateNavItem: function(headingEl) { + var anchor = this.generateAnchor(headingEl); + var $heading = $(headingEl); + var text = $heading.data('toc-text') || $heading.text(); + return this.generateNavEl(anchor, text); + }, + + // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). + getTopLevel: function($scope) { + for (var i = 1; i <= 6; i++) { + var $headings = this.findOrFilter($scope, 'h' + i); + if ($headings.length > 1) { + return i; + } + } + + return 1; + }, + + // returns the elements for the top level, and the next below it + getHeadings: function($scope, topLevel) { + var topSelector = 'h' + topLevel; + + var secondaryLevel = topLevel + 1; + var secondarySelector = 'h' + secondaryLevel; + + return this.findOrFilter($scope, topSelector + ',' + secondarySelector); + }, + + getNavLevel: function(el) { + return parseInt(el.tagName.charAt(1), 10); + }, + + populateNav: function($topContext, topLevel, $headings) { + var $context = $topContext; + var $prevNav; + + var helpers = this; + $headings.each(function(i, el) { + var $newNav = helpers.generateNavItem(el); + var navLevel = helpers.getNavLevel(el); + + // determine the proper $context + if (navLevel === topLevel) { + // use top level + $context = $topContext; + } else if ($prevNav && $context === $topContext) { + // create a new level of the tree and switch to it + $context = helpers.createChildNavList($prevNav); + } // else use the current $context + + $context.append($newNav); + + $prevNav = $newNav; + }); + }, + + parseOps: function(arg) { + var opts; + if (arg.jquery) { + opts = { + $nav: arg + }; + } else { + opts = arg; + } + opts.$scope = opts.$scope || $(document.body); + return opts; + } + }, + + // accepts a jQuery object, or an options object + init: function(opts) { + opts = this.helpers.parseOps(opts); + + // ensure that the data attribute is in place for styling + opts.$nav.attr('data-toggle', 'toc'); + + var $topContext = this.helpers.createChildNavList(opts.$nav); + var topLevel = this.helpers.getTopLevel(opts.$scope); + var $headings = this.helpers.getHeadings(opts.$scope, topLevel); + this.helpers.populateNav($topContext, topLevel, $headings); + } + }; + + $(function() { + $('nav[data-toggle="toc"]').each(function(i, el) { + var $nav = $(el); + 
Toc.init($nav); + }); + }); +})(); diff --git a/docs/html/docsearch.css b/docs/html/docsearch.css new file mode 100644 index 0000000..e5f1fe1 --- /dev/null +++ b/docs/html/docsearch.css @@ -0,0 +1,148 @@ +/* Docsearch -------------------------------------------------------------- */ +/* + Source: https://github.com/algolia/docsearch/ + License: MIT +*/ + +.algolia-autocomplete { + display: block; + -webkit-box-flex: 1; + -ms-flex: 1; + flex: 1 +} + +.algolia-autocomplete .ds-dropdown-menu { + width: 100%; + min-width: none; + max-width: none; + padding: .75rem 0; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, .1); + box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); +} + +@media (min-width:768px) { + .algolia-autocomplete .ds-dropdown-menu { + width: 175% + } +} + +.algolia-autocomplete .ds-dropdown-menu::before { + display: none +} + +.algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { + padding: 0; + background-color: rgb(255,255,255); + border: 0; + max-height: 80vh; +} + +.algolia-autocomplete .ds-dropdown-menu .ds-suggestions { + margin-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion { + padding: 0; + overflow: visible +} + +.algolia-autocomplete .algolia-docsearch-suggestion--category-header { + padding: .125rem 1rem; + margin-top: 0; + font-size: 1.3em; + font-weight: 500; + color: #00008B; + border-bottom: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--wrapper { + float: none; + padding-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--subcategory-column { + float: none; + width: auto; + padding: 0; + text-align: left +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content { + float: none; + width: auto; + padding: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content::before { + display: none +} + +.algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { + padding-top: .75rem; + 
margin-top: .75rem; + border-top: 1px solid rgba(0, 0, 0, .1) +} + +.algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { + display: block; + padding: .1rem 1rem; + margin-bottom: 0.1; + font-size: 1.0em; + font-weight: 400 + /* display: none */ +} + +.algolia-autocomplete .algolia-docsearch-suggestion--title { + display: block; + padding: .25rem 1rem; + margin-bottom: 0; + font-size: 0.9em; + font-weight: 400 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--text { + padding: 0 1rem .5rem; + margin-top: -.25rem; + font-size: 0.8em; + font-weight: 400; + line-height: 1.25 +} + +.algolia-autocomplete .algolia-docsearch-footer { + width: 110px; + height: 20px; + z-index: 3; + margin-top: 10.66667px; + float: right; + font-size: 0; + line-height: 0; +} + +.algolia-autocomplete .algolia-docsearch-footer--logo { + background-image: url("data:image/svg+xml;utf8,"); + background-repeat: no-repeat; + background-position: 50%; + background-size: 100%; + overflow: hidden; + text-indent: -9000px; + width: 100%; + height: 100%; + display: block; + transform: translate(-8px); +} + +.algolia-autocomplete .algolia-docsearch-suggestion--highlight { + color: #FF8C00; + background: rgba(232, 189, 54, 0.1) +} + + +.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { + box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) +} + +.algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { + background-color: rgba(192, 192, 192, .15) +} diff --git a/docs/html/docsearch.js b/docs/html/docsearch.js new file mode 100644 index 0000000..b35504c --- /dev/null +++ b/docs/html/docsearch.js @@ -0,0 +1,85 @@ +$(function() { + + // register a handler to move the focus to the search bar + // upon pressing shift + "/" (i.e. 
"?") + $(document).on('keydown', function(e) { + if (e.shiftKey && e.keyCode == 191) { + e.preventDefault(); + $("#search-input").focus(); + } + }); + + $(document).ready(function() { + // do keyword highlighting + /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ + var mark = function() { + + var referrer = document.URL ; + var paramKey = "q" ; + + if (referrer.indexOf("?") !== -1) { + var qs = referrer.substr(referrer.indexOf('?') + 1); + var qs_noanchor = qs.split('#')[0]; + var qsa = qs_noanchor.split('&'); + var keyword = ""; + + for (var i = 0; i < qsa.length; i++) { + var currentParam = qsa[i].split('='); + + if (currentParam.length !== 2) { + continue; + } + + if (currentParam[0] == paramKey) { + keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); + } + } + + if (keyword !== "") { + $(".contents").unmark({ + done: function() { + $(".contents").mark(keyword); + } + }); + } + } + }; + + mark(); + }); +}); + +/* Search term highlighting ------------------------------*/ + +function matchedWords(hit) { + var words = []; + + var hierarchy = hit._highlightResult.hierarchy; + // loop to fetch from lvl0, lvl1, etc. 
+ for (var idx in hierarchy) { + words = words.concat(hierarchy[idx].matchedWords); + } + + var content = hit._highlightResult.content; + if (content) { + words = words.concat(content.matchedWords); + } + + // return unique words + var words_uniq = [...new Set(words)]; + return words_uniq; +} + +function updateHitURL(hit) { + + var words = matchedWords(hit); + var url = ""; + + if (hit.anchor) { + url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; + } else { + url = hit.url + '?q=' + escape(words.join(" ")); + } + + return url; +} diff --git a/docs/html/index.html b/docs/html/index.html new file mode 100644 index 0000000..2583f8b --- /dev/null +++ b/docs/html/index.html @@ -0,0 +1,204 @@ + + + + + + + +Copy-number Aware Methylation Deconvolution and Analysis of Cancers • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    +
    + +

    Copy-number Aware Methylation Deconvolution and Analysis of Cancers (CAMDAC) is an R package for deconvolving bulk tumor DNA methylation (bisulfite) sequencing data (Larose Cadieux et al., 2022, bioRxiv).

    +
    +

    Documentation +

    +

    Visit https://vanloo-lab.github.io/CAMDAC/.

    +
    +
    +

    Quickstart +

    +

    CAMDAC can be installed from an R console:

    +
    +install.packages("remotes")
    +remotes::install_github("VanLoo-lab/CAMDAC")
    +

    Download the reference datasets required to run CAMDAC for RRBS and/or WGBS analysis from the Zenodo repository (record 10565423). An R helper function is provided for convenience:

    +
    +CAMDAC::download_pipeline_files(bsseq = "rrbs", directory = "./refs")
    +CAMDAC::download_pipeline_files(bsseq = "wgbs", directory = "./refs")
    +

    Run the tumor-normal deconvolution pipeline with test data:

    +

    Note:
    +We provide downsampled BAM files for testing the pipeline. For representative results, please use your own BAM files.

    +
    +library(CAMDAC)
    +
    +tumor_bam <- system.file("testdata", "tumor.bam", package = "CAMDAC")
    +normal_bam <- system.file("testdata", "normal.bam", package = "CAMDAC")
    +
    +# Select samples for basic tumor-normal analysis
    +tumor <- CamSample(id = "T", sex = "XY", bam = tumor_bam)
    +normal <- CamSample(id = "N", sex = "XY", bam = normal_bam)
    +
    +# Configure pipeline
    +config <- CamConfig(
    +  outdir = "./results", bsseq = "rrbs", lib = "pe",
    +  build = "hg38", refs = "./refs", n_cores = 1, cna_caller='ascat'
    +)
    +
    +# Run CAMDAC
    +CAMDAC::pipeline(
    +  tumor, germline = normal, infiltrates = normal, origin = normal, config
    +)
    +

    For a more detailed walkthrough with test data, see vignette("pipeline").

    +
    +
    +

    Contributing +

    +

    To contribute to CAMDAC, fork the repository and install the development dependencies with remotes::install_dev_deps('.').

    +

    After making your changes, run the build and test commands listed in vignette("contributing").

    +

    Finally, submit a pull request with the changes on your fork.

    +
    +
    +
    + + +
    + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/html/link.svg b/docs/html/link.svg new file mode 100644 index 0000000..88ad827 --- /dev/null +++ b/docs/html/link.svg @@ -0,0 +1,12 @@ + + + + + + diff --git a/docs/html/news/index.html b/docs/html/news/index.html new file mode 100644 index 0000000..270b913 --- /dev/null +++ b/docs/html/news/index.html @@ -0,0 +1,108 @@ + +Changelog • CAMDAC + + +
    +
    + + + +
    +
    + + +
    + +
    • Integrated RRBS and WGBS analysis under a single call to the pipeline() function.
    • +
    • Added an option for paired-end reads.
    • +
    +
    + +
    • Minor documentation updates.
    +
    + + + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/pkgdown.css b/docs/html/pkgdown.css new file mode 100644 index 0000000..80ea5b8 --- /dev/null +++ b/docs/html/pkgdown.css @@ -0,0 +1,384 @@ +/* Sticky footer */ + +/** + * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ + * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css + * + * .Site -> body > .container + * .Site-content -> body > .container .row + * .footer -> footer + * + * Key idea seems to be to ensure that .container and __all its parents__ + * have height set to 100% + * + */ + +html, body { + height: 100%; +} + +body { + position: relative; +} + +body > .container { + display: flex; + height: 100%; + flex-direction: column; +} + +body > .container .row { + flex: 1 0 auto; +} + +footer { + margin-top: 45px; + padding: 35px 0 36px; + border-top: 1px solid #e5e5e5; + color: #666; + display: flex; + flex-shrink: 0; +} +footer p { + margin-bottom: 0; +} +footer div { + flex: 1; +} +footer .pkgdown { + text-align: right; +} +footer p { + margin-bottom: 0; +} + +img.icon { + float: right; +} + +/* Ensure in-page images don't run outside their container */ +.contents img { + max-width: 100%; + height: auto; +} + +/* Fix bug in bootstrap (only seen in firefox) */ +summary { + display: list-item; +} + +/* Typographic tweaking ---------------------------------*/ + +.contents .page-header { + margin-top: calc(-60px + 1em); +} + +dd { + margin-left: 3em; +} + +/* Section anchors ---------------------------------*/ + +a.anchor { + display: none; + margin-left: 5px; + width: 20px; + height: 20px; + + background-image: url(./link.svg); + background-repeat: no-repeat; + background-size: 20px 20px; + background-position: center center; +} + +h1:hover .anchor, +h2:hover .anchor, +h3:hover .anchor, +h4:hover .anchor, +h5:hover .anchor, +h6:hover .anchor { + display: inline-block; +} + +/* Fixes for fixed navbar --------------------------*/ + 
+.contents h1, .contents h2, .contents h3, .contents h4 { + padding-top: 60px; + margin-top: -40px; +} + +/* Navbar submenu --------------------------*/ + +.dropdown-submenu { + position: relative; +} + +.dropdown-submenu>.dropdown-menu { + top: 0; + left: 100%; + margin-top: -6px; + margin-left: -1px; + border-radius: 0 6px 6px 6px; +} + +.dropdown-submenu:hover>.dropdown-menu { + display: block; +} + +.dropdown-submenu>a:after { + display: block; + content: " "; + float: right; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; + border-width: 5px 0 5px 5px; + border-left-color: #cccccc; + margin-top: 5px; + margin-right: -10px; +} + +.dropdown-submenu:hover>a:after { + border-left-color: #ffffff; +} + +.dropdown-submenu.pull-left { + float: none; +} + +.dropdown-submenu.pull-left>.dropdown-menu { + left: -100%; + margin-left: 10px; + border-radius: 6px 0 6px 6px; +} + +/* Sidebar --------------------------*/ + +#pkgdown-sidebar { + margin-top: 30px; + position: -webkit-sticky; + position: sticky; + top: 70px; +} + +#pkgdown-sidebar h2 { + font-size: 1.5em; + margin-top: 1em; +} + +#pkgdown-sidebar h2:first-child { + margin-top: 0; +} + +#pkgdown-sidebar .list-unstyled li { + margin-bottom: 0.5em; +} + +/* bootstrap-toc tweaks ------------------------------------------------------*/ + +/* All levels of nav */ + +nav[data-toggle='toc'] .nav > li > a { + padding: 4px 20px 4px 6px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; +} + +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 5px; + color: inherit; + border-left: 1px solid #878787; +} + +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 5px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; + border-left: 2px solid #878787; +} + +/* Nav: second level (shown on .active) */ + +nav[data-toggle='toc'] 
.nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} + +nav[data-toggle='toc'] .nav .nav > li > a { + padding-left: 16px; + font-size: 1.35rem; +} + +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 15px; +} + +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 15px; + font-weight: 500; + font-size: 1.35rem; +} + +/* orcid ------------------------------------------------------------------- */ + +.orcid { + font-size: 16px; + color: #A6CE39; + /* margins are required by official ORCID trademark and display guidelines */ + margin-left:4px; + margin-right:4px; + vertical-align: middle; +} + +/* Reference index & topics ----------------------------------------------- */ + +.ref-index th {font-weight: normal;} + +.ref-index td {vertical-align: top; min-width: 100px} +.ref-index .icon {width: 40px;} +.ref-index .alias {width: 40%;} +.ref-index-icons .alias {width: calc(40% - 40px);} +.ref-index .title {width: 60%;} + +.ref-arguments th {text-align: right; padding-right: 10px;} +.ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} +.ref-arguments .name {width: 20%;} +.ref-arguments .desc {width: 80%;} + +/* Nice scrolling for wide elements --------------------------------------- */ + +table { + display: block; + overflow: auto; +} + +/* Syntax highlighting ---------------------------------------------------- */ + +pre, code, pre code { + background-color: #f8f8f8; + color: #333; +} +pre, pre code { + white-space: pre-wrap; + word-break: break-all; + overflow-wrap: break-word; +} + +pre { + border: 1px solid #eee; +} + +pre .img, pre .r-plt { + margin: 5px 0; +} + +pre .img img, pre .r-plt img { + background-color: #fff; +} + +code a, pre a { + color: #375f84; +} + +a.sourceLine:hover { + text-decoration: none; +} 
+ +.fl {color: #1514b5;} +.fu {color: #000000;} /* function */ +.ch,.st {color: #036a07;} /* string */ +.kw {color: #264D66;} /* keyword */ +.co {color: #888888;} /* comment */ + +.error {font-weight: bolder;} +.warning {font-weight: bolder;} + +/* Clipboard --------------------------*/ + +.hasCopyButton { + position: relative; +} + +.btn-copy-ex { + position: absolute; + right: 0; + top: 0; + visibility: hidden; +} + +.hasCopyButton:hover button.btn-copy-ex { + visibility: visible; +} + +/* headroom.js ------------------------ */ + +.headroom { + will-change: transform; + transition: transform 200ms linear; +} +.headroom--pinned { + transform: translateY(0%); +} +.headroom--unpinned { + transform: translateY(-100%); +} + +/* mark.js ----------------------------*/ + +mark { + background-color: rgba(255, 255, 51, 0.5); + border-bottom: 2px solid rgba(255, 153, 51, 0.3); + padding: 1px; +} + +/* vertical spacing after htmlwidgets */ +.html-widget { + margin-bottom: 10px; +} + +/* fontawesome ------------------------ */ + +.fab { + font-family: "Font Awesome 5 Brands" !important; +} + +/* don't display links in code chunks when printing */ +/* source: https://stackoverflow.com/a/10781533 */ +@media print { + code a:link:after, code a:visited:after { + content: ""; + } +} + +/* Section anchors --------------------------------- + Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 +*/ + +div.csl-bib-body { } +div.csl-entry { + clear: both; +} +.hanging-indent div.csl-entry { + margin-left:2em; + text-indent:-2em; +} +div.csl-left-margin { + min-width:2em; + float:left; +} +div.csl-right-inline { + margin-left:2em; + padding-left:1em; +} +div.csl-indent { + margin-left: 2em; +} diff --git a/docs/html/pkgdown.js b/docs/html/pkgdown.js new file mode 100644 index 0000000..6f0eee4 --- /dev/null +++ b/docs/html/pkgdown.js @@ -0,0 +1,108 @@ +/* http://gregfranko.com/blog/jquery-best-practices/ */ +(function($) { + $(function() { + + 
$('.navbar-fixed-top').headroom(); + + $('body').css('padding-top', $('.navbar').height() + 10); + $(window).resize(function(){ + $('body').css('padding-top', $('.navbar').height() + 10); + }); + + $('[data-toggle="tooltip"]').tooltip(); + + var cur_path = paths(location.pathname); + var links = $("#navbar ul li a"); + var max_length = -1; + var pos = -1; + for (var i = 0; i < links.length; i++) { + if (links[i].getAttribute("href") === "#") + continue; + // Ignore external links + if (links[i].host !== location.host) + continue; + + var nav_path = paths(links[i].pathname); + + var length = prefix_length(nav_path, cur_path); + if (length > max_length) { + max_length = length; + pos = i; + } + } + + // Add class to parent
  • , and enclosing
  • if in dropdown + if (pos >= 0) { + var menu_anchor = $(links[pos]); + menu_anchor.parent().addClass("active"); + menu_anchor.closest("li.dropdown").addClass("active"); + } + }); + + function paths(pathname) { + var pieces = pathname.split("/"); + pieces.shift(); // always starts with / + + var end = pieces[pieces.length - 1]; + if (end === "index.html" || end === "") + pieces.pop(); + return(pieces); + } + + // Returns -1 if not found + function prefix_length(needle, haystack) { + if (needle.length > haystack.length) + return(-1); + + // Special case for length-0 haystack, since for loop won't run + if (haystack.length === 0) { + return(needle.length === 0 ? 0 : -1); + } + + for (var i = 0; i < haystack.length; i++) { + if (needle[i] != haystack[i]) + return(i); + } + + return(haystack.length); + } + + /* Clipboard --------------------------*/ + + function changeTooltipMessage(element, msg) { + var tooltipOriginalTitle=element.getAttribute('data-original-title'); + element.setAttribute('data-original-title', msg); + $(element).tooltip('show'); + element.setAttribute('data-original-title', tooltipOriginalTitle); + } + + if(ClipboardJS.isSupported()) { + $(document).ready(function() { + var copyButton = ""; + + $("div.sourceCode").addClass("hasCopyButton"); + + // Insert copy buttons: + $(copyButton).prependTo(".hasCopyButton"); + + // Initialize tooltips: + $('.btn-copy-ex').tooltip({container: 'body'}); + + // Initialize clipboard: + var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { + text: function(trigger) { + return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); + } + }); + + clipboardBtnCopies.on('success', function(e) { + changeTooltipMessage(e.trigger, 'Copied!'); + e.clearSelection(); + }); + + clipboardBtnCopies.on('error', function() { + changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); + }); + }); + } +})(window.jQuery || window.$) diff --git a/docs/html/pkgdown.yml b/docs/html/pkgdown.yml new file mode 
100644 index 0000000..def29a6 --- /dev/null +++ b/docs/html/pkgdown.yml @@ -0,0 +1,14 @@ +pandoc: 2.9.2.1 +pkgdown: 2.0.9 +pkgdown_sha: ~ +articles: + contributing: contributing.html + experimental: experimental.html + introduction: introduction.html + output: output.html + pipeline: pipeline.html + questions: questions.html + setup: setup.html + technical: technical.html +last_built: 2025-05-27T23:48Z + diff --git a/docs/html/reference/CamConfig.html b/docs/html/reference/CamConfig.html new file mode 100644 index 0000000..5d0a3e1 --- /dev/null +++ b/docs/html/reference/CamConfig.html @@ -0,0 +1,176 @@ + +Set CAMDAC configuration — CamConfig • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Set CAMDAC configuration

    +
    + +
    +
    CamConfig(
    +  outdir,
    +  bsseq,
    +  lib,
    +  build,
    +  n_cores = 1,
    +  regions = NULL,
    +  refs = NULL,
    +  n_seg_split = 50,
    +  min_mapq = 1,
    +  min_cov = 1,
    +  min_normal_cov = 10,
    +  overwrite = FALSE,
    +  cna_caller = "battenberg",
    +  cna_settings = NULL
    +)
    +
    + +
    +

    Arguments

    +
    outdir
    +

    A path to save CAMDAC results. The results folder structure +follows the format PATIENT/DATASET/SAMPLE/.

    + + +
    bsseq
    +

    Bisulfite sequencing platform. Choose between "wgbs" or "rrbs".

    + + +
    lib
    +

    Bisulfite sequencing library type. Choose "pe" for paired-end or "se" for single-end.

    + + +
    build
    +

    Reference genome build. Choose "hg38" or "hg19".

    + + +
    n_cores
    +

    Number of cores to process CAMDAC data in parallel wherever possible.

    + + +
    regions
    +

    A BED file of regions to which the analysis is restricted.

    + + +
    refs
    +

    Path to CAMDAC reference files. If this is not given, CAMDAC searches the +environment variable CAMDAC_PIPELINE_FILES. If this is not set, CAMDAC searches recursively in the current +working directory.

    + + +
    min_mapq
    +

    Minimum mapping quality filter used in cmain_allele_counts().

    + + +
    min_cov
    +

    Minimum coverage filter for: DNA methylation, Normal SNP selection.

    + + +
    overwrite
    +

    Whether to overwrite files if they already exist.

    + + +
    cna_caller
    +

    The CNA caller to use: "ascat" or "battenberg". Default is "battenberg".

    + + +
    cna_settings
    +

    A list of settings to pass to the CNA caller. Recognised names include rho, psi, java and beaglemaxmem.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/CamSample.html b/docs/html/reference/CamSample.html new file mode 100644 index 0000000..e18f996 --- /dev/null +++ b/docs/html/reference/CamSample.html @@ -0,0 +1,126 @@ + +Build CAMDAC sample object — CamSample • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Build CAMDAC sample object

    +
    + +
    +
    CamSample(id, sex, bam = NULL, patient_id = "P")
    +
    + +
    +

    Arguments

    +
    id
    +

    Unique identifier for the sample

    + + +
    sex
    +

    The sex of the patient, "XX" or "XY". Required for CNA calling.

    + + +
    bam
    +

    Sample BAM file. If not given, CAMDAC expects files linked with attach_output.

    + + +
    patient_id
    +

    An identifier for the patient

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/HDIofICDF.html b/docs/html/reference/HDIofICDF.html new file mode 100644 index 0000000..b730bab --- /dev/null +++ b/docs/html/reference/HDIofICDF.html @@ -0,0 +1,135 @@ + +HDI of ICDF — HDIofICDF • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    HDI of ICDF

    +
    + +
    +
    HDIofICDF(ICDFname, credMass = 0.99, tol = 0.0001, ...)
    +
    + +
    +

    Arguments

    +
    ICDFname
    +

    The inverse cumulative density function of the distribution.

    + + +
    credMass
    +

    The desired mass of the HDI region.

    + + +
    tol
    +

    Tolerance parameter for optimisation. The lower the tolerance, the +longer the optimisation, but the higher the accuracy. +According to CAMDAC RRBS comments, tol=1e-4 gives values +of the same accuracy as our max resolution. +This function is adapted from Greg Snow's TeachingDemos package. +E.g. determine the HDI of a CpG with M=30 and UM=12. +Adding 1 to each shape parameter ensures the uniform beta(1,1) prior is updated with our counts: +HDIofICDF(qbeta, shape1 = 30+1, shape2 = 12+1)

    + +
    +
    +

    Value

    + + +

    Highest density interval (HDI) limits in a vector.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/HDIofMCMC.html b/docs/html/reference/HDIofMCMC.html new file mode 100644 index 0000000..33850b7 --- /dev/null +++ b/docs/html/reference/HDIofMCMC.html @@ -0,0 +1,150 @@ + +HDI of MCMC — HDIofMCMC • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    HDI of MCMC

    +
    + +
    +
    HDIofMCMC(M_b, UM_b, M_n, UM_n, p, CN, CN_n, credMass = 0.99)
    +
    + +
    +

    Arguments

    +
    M_b
    +

    counts methylated in the tumour

    + + +
    UM_b
    +

    counts unmethylated in the tumour

    + + +
    M_n
    +

    counts methylated in the normal

    + + +
    UM_n
    +

    counts unmethylated in the normal

    + + +
    p
    +

    tumour purity

    + + +
    CN
    +

    total tumour copy number

    + + +
    CN_n
    +

    total normal copy number

    + + +
    credMass
    +

    Default is 0.99. +credMass is a scalar between 0 and 1, indicating the mass within the +credible interval that is to be estimated.

    + +
    +
    +

    Value

    + + +

    HDIlim, a vector containing the limits of the HDI.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/HDIofMCMC_mt.html b/docs/html/reference/HDIofMCMC_mt.html new file mode 100644 index 0000000..2321853 --- /dev/null +++ b/docs/html/reference/HDIofMCMC_mt.html @@ -0,0 +1,152 @@ + +Calculate HDI by simulation — HDIofMCMC_mt • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Computes highest density interval from a sample of representative values, +estimated as shortest credible interval for a unimodal distribution

    +
    + +
    +
    HDIofMCMC_mt(M_b, UM_b, M_n, UM_n, p, CN, credMass = 0.99)
    +
    + +
    +

    Arguments

    +
    M_b
    +

    counts methylated in the tumour

    + + +
    UM_b
    +

    counts unmethylated in the tumour

    + + +
    M_n
    +

    counts methylated in the normal

    + + +
    UM_n
    +

    counts unmethylated in the normal

    + + +
    p
    +

    tumour purity

    + + +
    CN
    +

    total tumour copy number

    + + +
    credMass
    +

    default is 0.99 +credMass is a scalar between 0 and 1, indicating the mass within the +credible interval that is to be estimated.

    + + +
    CN_n
    +

    total normal copy number

    + +
    +
    +

    Value

    + + +

    Value: HDIlim is a vector containing the limits of the HDI

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/LogR_correction.html b/docs/html/reference/LogR_correction.html new file mode 100644 index 0000000..ded5e60 --- /dev/null +++ b/docs/html/reference/LogR_correction.html @@ -0,0 +1,155 @@ + +LogR_correction — LogR_correction • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Correct logR for MspI fragment size bias and GC content

    +
    + +
    +
    LogR_correction(
    +  dt_sample,
    +  dt_SNPs,
    +  build,
    +  chr_names,
    +  min_normal,
    +  fragments_file,
    +  replic_timing_file_prefix,
    +  n_cores
    +)
    +
    + +
    +

    Arguments

    +
    dt_sample
    +

    Allelecounts output as a data.table

    + + +
    dt_SNPs
    +

    Allelecounts output subset to QC'ed SNP positions

    + + +
    build
    +

    Character variable corresponding to the reference genome version used for alignment

    + + +
    chr_names
    +

    Character variable with the seqlevels.

    + + +
    min_normal
    +

    Numerical with the minimum normal coverage threshold

    + + +
    fragments_file
    +

    CAMDAC reference MspI fragments file

    + + +
    replic_timing_file_prefix
    +

    CAMDAC reference replication timing files path and file name prefix

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores for parallel processing

    + +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/annotate_copy_number.html b/docs/html/reference/annotate_copy_number.html new file mode 100644 index 0000000..716c43d --- /dev/null +++ b/docs/html/reference/annotate_copy_number.html @@ -0,0 +1,128 @@ + +Assign copy number calls — annotate_copy_number • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    annotate_copy_number returns the data.table dt_sample annotated with allele-specific copy numbers

    +
    + +
    +
    annotate_copy_number(dt_sample, seg, rm_sex_chrom = FALSE)
    +
    + +
    +

    Arguments

    +
    dt_sample
    +

    data.table object with each CpG and their coverage, counts methylated and methylation rate

    + + +
    seg
    +

    ASCAT.m copy number segments object

    + + +
    rm_sex_chrom
    +

    Logical indicating if you would like to remove sex chrom from downstream analyses

    + +
    +
    +

    Value

    + + +

    A dataframe for each sample_id with the copy number calls added

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/ascat.m.plotRawData.html b/docs/html/reference/ascat.m.plotRawData.html new file mode 100644 index 0000000..82f66fe --- /dev/null +++ b/docs/html/reference/ascat.m.plotRawData.html @@ -0,0 +1,140 @@ + +ascat.m.plotRawData — ascat.m.plotRawData • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot tumour and germline BAF and LogR

    +
    + +
    +
    ascat.m.plotRawData(ASCATobj, raw_LogR, pch, cex, lim_logR)
    +
    + +
    +

    Arguments

    +
    ASCATobj
    +

    an ASCAT object (e.g. data structure from ascat.loadData)

    + + +
    raw_LogR
    +

    vector with the LogR values before correction

    + + +
    pch
    +

    type of data points in plot

    + + +
    cex
    +

    size of data points in plot

    + + +
    lim_logR
    +

    y-axis limits on logR plot

    + +
    +
    +

    Value

    + + +

    Produces png files showing the logR and BAF values for tumour and germline samples

    +
    +
    +

    Author

    +

    Peter Van Loo

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/ascat.m.plotSegmentedData.html b/docs/html/reference/ascat.m.plotSegmentedData.html new file mode 100644 index 0000000..bafe911 --- /dev/null +++ b/docs/html/reference/ascat.m.plotSegmentedData.html @@ -0,0 +1,124 @@ + +ascat.m.plotSegmentedData — ascat.m.plotSegmentedData • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot segmented BAF and LogR

    +
    + +
    +
    ascat.m.plotSegmentedData(ASCATobj, lim_logR = 2)
    +
    + +
    +

    Arguments

    +
    ASCATobj
    +

    an ASCAT object (e.g. data structure from ascat.loadData)

    + +
    +
    +

    Value

    + + +

    Produces png files showing the logR and BAF values for tumour and germline samples

    +
    +
    +

    Author

    +

    Peter Van Loo

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/ascat.plotRawData.flags.html b/docs/html/reference/ascat.plotRawData.flags.html new file mode 100644 index 0000000..1d085e2 --- /dev/null +++ b/docs/html/reference/ascat.plotRawData.flags.html @@ -0,0 +1,136 @@ + +ascat.plotRawData — ascat.plotRawData.flags • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot BAF LogR

    +
    + +
    +
    ascat.plotRawData.flags(ASCATobj, pch, cex, lim_logR)
    +
    + +
    +

    Arguments

    +
    ASCATobj
    +

    an ASCAT object (e.g. data structure from ascat.loadData)

    + + +
    pch
    +

    type of data points in plot

    + + +
    cex
    +

    size of data points in plot

    + + +
    lim_logR
    +

    y-axis limits on logR plot

    + +
    +
    +

    Value

    + + +

    Produces png files showing the logR and BAF values for tumour and germline samples

    +
    +
    +

    Author

    +

    Peter Van Loo

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/asm_pipeline.html b/docs/html/reference/asm_pipeline.html new file mode 100644 index 0000000..3c0c339 --- /dev/null +++ b/docs/html/reference/asm_pipeline.html @@ -0,0 +1,130 @@ + +Run allele-specific methylation analysis pipeline — asm_pipeline • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Run allele-specific methylation analysis pipeline

    +
    + +
    +
    asm_pipeline(tumor, germline = NULL, infiltrates = NULL, origin = NULL, config)
    +
    + +
    +

    Arguments

    +
    tumor
    +

    CamSample object for tumor sample.

    + + +
    germline
    +

    CamSample object for germline sample. Used for CNA calling.

    + + +
    infiltrates
    +

    CamSample object for infiltrating normal sample. Used for deconvolution.

    + + +
    origin
    +

    CamSample object for cell of origin sample. Used for differential methylation.

    + + +
    config
    +

    CamConfig object.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/attach_output.html b/docs/html/reference/attach_output.html new file mode 100644 index 0000000..4f66171 --- /dev/null +++ b/docs/html/reference/attach_output.html @@ -0,0 +1,126 @@ + +Manually assign output file to CAMDAC sample — attach_output • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Manually assign output file to CAMDAC sample

    +
    + +
    +
    attach_output(sample, config, code, file)
    +
    + +
    +

    Arguments

    +
    sample
    +

    CamSample object

    + + +
    config
    +

    CamConfig object

    + + +
    code
    +

    Code for output file. See vignettes("output") for descriptions.

    + + +
    file
    +

    Path to file to copy to expected location

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/bin_CpGs.html b/docs/html/reference/bin_CpGs.html new file mode 100644 index 0000000..a587541 --- /dev/null +++ b/docs/html/reference/bin_CpGs.html @@ -0,0 +1,141 @@ + +Cluster CpGs into annotated bins — bin_CpGs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    bin_CpGs returns the df with the annotation for each CpG

    +
    + +
    +
    bin_CpGs(path, patient_id, sample_id, dt, anno_list, n_cores)
    +
    + +
    +

    Arguments

    +
    path
    +

    Character string of the output directory

    + + +
    patient_id
    +

    Character string containing the patient ID

    + + +
    sample_id
    +

    Character string containing the sample ID.

    + + +
    dt
    +

    data.table where each CG is a row with DMP info.

    + + +
    anno_list
    +

    A data.table object containing annotated genomic bins including +genes, exons, introns, UTRs, CGI, CGI shores, CGI shelves, promoters or enhancers

    + + +
    n_cores
    +

    number of cores for parallel processing

    + +
    +
    +

    Value

    + + +

    A dataframe for each sample_id with the copy number calls added

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/calculate_m_t_hdi.html b/docs/html/reference/calculate_m_t_hdi.html new file mode 100644 index 0000000..032bea1 --- /dev/null +++ b/docs/html/reference/calculate_m_t_hdi.html @@ -0,0 +1,108 @@ + +Calculate HDI by simulation — calculate_m_t_hdi • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Calculate HDI by simulation

    +
    + +
    +
    calculate_m_t_hdi(meth_c, n_cores, itersplit = 100000)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/call_dmps.html b/docs/html/reference/call_dmps.html new file mode 100644 index 0000000..30e5eaf --- /dev/null +++ b/docs/html/reference/call_dmps.html @@ -0,0 +1,115 @@ + +Call differentially methylated positions — call_dmps • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Call differentially methylated positions

    +
    + +
    +
    call_dmps(
    +  pmeth,
    +  nmeth,
    +  effect_size = 0.2,
    +  prob = 0.99,
    +  itersplit = 500000,
    +  ncores = 5
    +)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/call_dmr_routine.html b/docs/html/reference/call_dmr_routine.html new file mode 100644 index 0000000..ac296ff --- /dev/null +++ b/docs/html/reference/call_dmr_routine.html @@ -0,0 +1,113 @@ + +Function to call DMRs on a camdac dmp dataset — call_dmr_routine • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Function to call DMRs on a camdac dmp dataset

    +
    + +
    +
    call_dmr_routine(
    +  tmeth_dmps,
    +  regions_annotations,
    +  min_DMP_counts,
    +  min_consec_DMP
    +)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/camdac_to_battenberg_prepare_wgbs.html b/docs/html/reference/camdac_to_battenberg_prepare_wgbs.html new file mode 100644 index 0000000..a11f642 --- /dev/null +++ b/docs/html/reference/camdac_to_battenberg_prepare_wgbs.html @@ -0,0 +1,137 @@ + +Generate alleleCounter file from CAMDAC — camdac_to_battenberg_prepare_wgbs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    camdac_to_battenberg_prepare_wgbs converts CAMDAC allele counter results to a format for processing.

    +
    + +
    +
    camdac_to_battenberg_prepare_wgbs(
    +  tumour_prefix,
    +  normal_prefix,
    +  camdac_tsnps,
    +  outdir
    +)
    +
    + +
    +

    Arguments

    +
    camdac_tumour_ac
    +

    CAMDAC tumour allele counts filepath. Expected *.gz

    + + +
    camdac_normal_ac
    +

    CAMDAC normal allele counts filepath. Expected *.gz

    + + +
    camdac_tsnps
    +

    CAMDAC tumour-normal-snps object. Expected *.gz

    + + +
    output_file
    +

    allelecounter formatted-file output directory.

    + +
    +
    +

    Value

    + + +

    File handle for allele counter file generated

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_bind_snps.html b/docs/html/reference/cmain_bind_snps.html new file mode 100644 index 0000000..ea3330d --- /dev/null +++ b/docs/html/reference/cmain_bind_snps.html @@ -0,0 +1,122 @@ + +Bind SNPs — cmain_bind_snps • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Combine tumour-normal SNP file for CNA analysis (ASCAT or BATTENBERG)

    +
    + +
    +
    cmain_bind_snps(tumour, normal, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    normal
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_call_cna.html b/docs/html/reference/cmain_call_cna.html new file mode 100644 index 0000000..f4ad912 --- /dev/null +++ b/docs/html/reference/cmain_call_cna.html @@ -0,0 +1,122 @@ + +Call CNA — cmain_call_cna • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Config determines whether ASCAT or Battenberg is used

    +
    + +
    +
    cmain_call_cna(tumour, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + + +
    normal
    +

    A camdac sample object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_call_dmps.html b/docs/html/reference/cmain_call_dmps.html new file mode 100644 index 0000000..388c61a --- /dev/null +++ b/docs/html/reference/cmain_call_dmps.html @@ -0,0 +1,122 @@ + +Call tumour-normal DMPs — cmain_call_dmps • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Single-sample DMP calling on CAMDAC-deconvolved data

    +
    + +
    +
    cmain_call_dmps(tumour, normal, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    normal
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_call_dmrs.html b/docs/html/reference/cmain_call_dmrs.html new file mode 100644 index 0000000..c2c7742 --- /dev/null +++ b/docs/html/reference/cmain_call_dmrs.html @@ -0,0 +1,122 @@ + +Call tumour-normal DMRs — cmain_call_dmrs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Single-sample DMR calling on CAMDAC DMP data

    +
    + +
    +
    cmain_call_dmrs(tumour, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + + +
    normal
    +

    A camdac sample object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_count_alleles.html b/docs/html/reference/cmain_count_alleles.html new file mode 100644 index 0000000..3c12114 --- /dev/null +++ b/docs/html/reference/cmain_count_alleles.html @@ -0,0 +1,118 @@ + +Count alleles — cmain_count_alleles • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Count alleles

    +
    + +
    +
    cmain_count_alleles(sample, config)
    +
    + +
    +

    Arguments

    +
    sample
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_deconvolve_methylation.html b/docs/html/reference/cmain_deconvolve_methylation.html new file mode 100644 index 0000000..1ca56ec --- /dev/null +++ b/docs/html/reference/cmain_deconvolve_methylation.html @@ -0,0 +1,122 @@ + +Deconvolve methylation — cmain_deconvolve_methylation • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Deconvolve methylation

    +
    + +
    +
    cmain_deconvolve_methylation(tumour, normal, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    normal
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_make_methylation_profile.html b/docs/html/reference/cmain_make_methylation_profile.html new file mode 100644 index 0000000..5f85d1d --- /dev/null +++ b/docs/html/reference/cmain_make_methylation_profile.html @@ -0,0 +1,118 @@ + +Make methylation — cmain_make_methylation_profile • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Pre-process methylation from allele counts for CAMDAC deconvolution

    +
    + +
    +
    cmain_make_methylation_profile(sample, config)
    +
    + +
    +

    Arguments

    +
    sample
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_make_snps.html b/docs/html/reference/cmain_make_snps.html new file mode 100644 index 0000000..e6de204 --- /dev/null +++ b/docs/html/reference/cmain_make_snps.html @@ -0,0 +1,118 @@ + +Make SNPs — cmain_make_snps • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Format and save SNP file for CNA analysis (ASCAT or BATTENBERG)

    +
    + +
    +
    cmain_make_snps(sample, config)
    +
    + +
    +

    Arguments

    +
    sample
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_run_ascat.html b/docs/html/reference/cmain_run_ascat.html new file mode 100644 index 0000000..701b415 --- /dev/null +++ b/docs/html/reference/cmain_run_ascat.html @@ -0,0 +1,122 @@ + +Run ASCAT.m — cmain_run_ascat • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Expects SNP profiles to have been created using cmain_make_snps

    +
    + +
    +
    cmain_run_ascat(tumour, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + + +
    normal
    +

    A camdac sample object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cmain_run_battenberg.html b/docs/html/reference/cmain_run_battenberg.html new file mode 100644 index 0000000..2e17525 --- /dev/null +++ b/docs/html/reference/cmain_run_battenberg.html @@ -0,0 +1,122 @@ + +Run battenberg — cmain_run_battenberg • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Expects SNP profiles to have been created using cmain_make_snps

    +
    + +
    +
    cmain_run_battenberg(tumour, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + + +
    normal
    +

    A camdac sample object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/collapse_cpg_to_dmr.html b/docs/html/reference/collapse_cpg_to_dmr.html new file mode 100644 index 0000000..ac92a5f --- /dev/null +++ b/docs/html/reference/collapse_cpg_to_dmr.html @@ -0,0 +1,108 @@ + +Summarise CG stats per DMR — collapse_cpg_to_dmr • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Summarise CG stats per DMR

    +
    + +
    +
    collapse_cpg_to_dmr(dt)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/compute_tumour_methylome.html b/docs/html/reference/compute_tumour_methylome.html new file mode 100644 index 0000000..484d1ac --- /dev/null +++ b/docs/html/reference/compute_tumour_methylome.html @@ -0,0 +1,139 @@ + +Compute the tumour methylation rate — compute_tumour_methylome • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    compute_tumour_methylome returns the data.table dt annotated with +CAMDAC pure tumour methylation rates

    +
    + +
    +
    compute_tumour_methylome(dt, p, min_cov_t = 3, sex, build)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table object with each CpG and their coverage, counts methylated, +methylation rate and copy number and matched normal methylation info

    + + +
    p
    +

    Numerical - Sample purity estimates

    + + +
    min_cov_t
    +

    Numerical - Minimum tumour coverage

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for female or "XY" for male.

    + + +
    build
    +

    Character variable corresponding to the reference genome used for alignment.

    + +
    +
    +

    Value

    + + +

    A dataframe for each sample_id with the tumour methylome added

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/cwrap_asm_get_allele_counts.html b/docs/html/reference/cwrap_asm_get_allele_counts.html new file mode 100644 index 0000000..88dcf6b --- /dev/null +++ b/docs/html/reference/cwrap_asm_get_allele_counts.html @@ -0,0 +1,153 @@ + +Count alleles for reads phased to SNPs in a BAM file — cwrap_asm_get_allele_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Count alleles for reads phased to SNPs in a BAM file

    +
    + +
    +
    cwrap_asm_get_allele_counts(
    +  bam_file,
    +  snps_gr,
    +  loci_dt,
    +  paired_end,
    +  drop_ccgg,
    +  min_mapq = min_mapq,
    +  min_cov = min_cov
    +)
    +
    + +
    +

    Arguments

    +
    bam_file
    +

    Path to BAM file

    + + +
    snps_gr
    +

    GRanges object with heterozygous SNP loci for phasing

    + + +
    loci_dt
    +

    Data table with CAMDAC CpG loci from reference files

    + + +
    paired_end
    +

    Logical indicating if BAM is paired end

    + + +
    drop_ccgg
    +

    Logical indicating if CCGG should be dropped (i.e. rrbs mode)

    + + +
    min_mapq
    +

    Minimum mapping quality to consider a read

    + + +
    min_cov
    +

    Minimum coverage to consider a read

    + +
    +
    +

    Value

    + + +

    A list with three slots: stats, qnames and asm_cg. stats describes counts of reads phased, +qnames determines which SNPs each read was phased to and asm_cg is the data table with read counts

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/download_pipeline_files.html b/docs/html/reference/download_pipeline_files.html new file mode 100644 index 0000000..5bb4977 --- /dev/null +++ b/docs/html/reference/download_pipeline_files.html @@ -0,0 +1,130 @@ + +Download CAMDAC pipeline files — download_pipeline_files • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    CAMDAC pipeline files are required for analysis. This function downloads the files to +the output directory and unpacks them. By default, CAMDAC searches for the files in the +environment variable CAMDAC_PIPELINE_FILES. If this is missing, the current directory is used.

    +

    CAMDAC pipeline files are required for analysis. This function downloads the files to +the output directory and unpacks them. By default, CAMDAC searches for the files in the +environment variable CAMDAC_PIPELINE_FILES. If this is missing, the current directory is used.

    +
    + +
    +
    download_pipeline_files(bsseq, directory = NULL, quiet = TRUE)
    +
    +download_pipeline_files(bsseq, directory = NULL, quiet = TRUE)
    +
    + +
    +

    Arguments

    +
    directory
    +

    Optional. Directory to download files to.

    + + +
    assay
    +

    Sequencing assay. Either wgbs or rrbs.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/format_methylation_df.html b/docs/html/reference/format_methylation_df.html new file mode 100644 index 0000000..29e43e8 --- /dev/null +++ b/docs/html/reference/format_methylation_df.html @@ -0,0 +1,156 @@ + +Format methylation rates format_methylation_df — format_methylation_df • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Format methylation rates +format_methylation_df

    +
    + +
    +
    format_methylation_df(
    +  dt,
    +  sample_id,
    +  normal_ids,
    +  path_output,
    +  n_cores,
    +  suffix,
    +  trim = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table containing the methylation information for each CpG

    + + +
    sample_id
    +

    sample ID

    + + +
    normal_ids
    +

    sample ID of normal sample(s)

    + + +
    path_output
    +

    output directory

    + + +
    n_cores
    +

    number of threads for HDI calculation

    + + +
    suffix
    +

    string containing the column names suffix for normal samples +This is to distinguish between the proxy supplied for the normal infiltrates +for use in deconvolution and the normal cell of origin for use in DMP/DMR calling

    + + +
    trim
    +

    Logical value establishing whether regions with extremely high coverage be trimmed or not

    + +
    +
    +

    Value

    + + +

    A GRanges object with all the CpG loci, their coverage, counts methylated and methylation rate

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/format_output.html b/docs/html/reference/format_output.html new file mode 100644 index 0000000..c0b8a60 --- /dev/null +++ b/docs/html/reference/format_output.html @@ -0,0 +1,159 @@ + +Format output nucleotide counts format_output — format_output • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Format output nucleotide counts +format_output

    +
    + +
    +
    format_output(
    +  patient_id,
    +  sample_id,
    +  sex,
    +  is_normal = FALSE,
    +  path,
    +  path_to_CAMDAC,
    +  build
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient id number

    + + +
    sample_id
    +

    Character variable with the sample ID

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for female or "XY" for male.

    + + +
    is_normal
    +

    Logical flag indicating whether the sample to be formatted is normal (TRUE) or tumour (FALSE, the default)

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored and should be constant for all CAMDAC functions. +Do not alter the output directory structure while running CAMDAC.

    + + +
    path_to_CAMDAC
    +

    Character variable containing the path to the CAMDAC directory +including dir name (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome used for alignment. +CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38". +is desired in addition to GRanges object in .RData file

    + +
    +
    +

    Value

    + + +

    Concatenated SNP and CpG information

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/get_DMPs.html b/docs/html/reference/get_DMPs.html new file mode 100644 index 0000000..9d47112 --- /dev/null +++ b/docs/html/reference/get_DMPs.html @@ -0,0 +1,143 @@ + +Get DMPs — get_DMPs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get_DMPs returns a df with annotated statistics for each CpG

    +
    + +
    +
    get_DMPs(path, patient_id, sample_id, df, prob = 0.99, n_cores)
    +
    + +
    +

    Arguments

    +
    path
    +

    Complete path to the CAMDAC methylation output directory +for this sample

    + + +
    patient_id
    +

    Character string containing the patient number

    + + +
    sample_id
    +

    Character variable with the tumour sample_id

    + + +
    df
    +

    A data.table with pure, bulk and normal methylation info

    + + +
    prob
    +

    Numerical value representing the threshold for statistically +significant DMP (default is p=0.99)

    + + +
    n_cores
    +

    Number of cores to do the statistical testing over

    + +
    +
    +

    Value

    + + +

    A data.table object with all the CpG loci, their coverage, counts +methylated and methylation rate

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/get_DMRs.html b/docs/html/reference/get_DMRs.html new file mode 100644 index 0000000..5ff26d6 --- /dev/null +++ b/docs/html/reference/get_DMRs.html @@ -0,0 +1,159 @@ + +Assign bins — get_DMRs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get_DMRs returns the df with the annotation for each CpG

    +
    + +
    +
    get_DMRs(
    +  path,
    +  patient_id,
    +  sample_id,
    +  dt,
    +  anno_list,
    +  min_DMP_counts,
    +  min_consec_DMP,
    +  n_cores,
    +  bulk = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Character string of the output directory

    + + +
    patient_id
    +

    Character string containing the patient ID

    + + +
    sample_id
    +

    Character string containing the sample ID.

    + + +
    dt
    +

    dataframe where each CG is a row with DMP info.

    + + +
    anno_list
    +

    A data.table object containing annotated genomic bins including +genes, exons, introns, UTRs, CGI, CGI shores, CGI shelves, promoters or enhancers

    + + +
    min_DMP_counts
    +

    Numerical - number of DMPs required in a DMR

    + + +
    min_consec_DMP
    +

    Numerical - number of consecutive DMPs required in a DMR

    + + +
    n_cores
    +

    number of cores for parallel processing

    + +
    +
    +

    Value

    + + +

    A dataframe for each sample_id with the copy number calls added

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/get_allele_counts.html b/docs/html/reference/get_allele_counts.html new file mode 100644 index 0000000..a54f341 --- /dev/null +++ b/docs/html/reference/get_allele_counts.html @@ -0,0 +1,184 @@ + +Compile allele counts at SNPs and at CpGs for bisulfite sequencing data get_allele_counts — get_allele_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Compile allele counts at SNPs and at CpGs for bisulfite sequencing data +get_allele_counts

    +
    + +
    +
    get_allele_counts(
    +  i,
    +  patient_id,
    +  sample_id,
    +  sex,
    +  bam_file,
    +  mq = 0,
    +  path,
    +  path_to_CAMDAC,
    +  build = NULL,
    +  n_cores,
    +  test = FALSE,
    +  paired_end = TRUE
    +)
    +
    + +
    +

    Arguments

    +
    i
    +

    Integer loop index. The function must be run with all values from 1 to 25, each containing +1/25th of the RRBS covered genome.

    + + +
    patient_id
    +

    Character variable containing the patient id

    + + +
    sample_id
    +

    Character variable with the sample id

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for female or "XY" for male.

    + + +
    bam_file
    +

    Character variable with the full bam file name and path

    + + +
    mq
    +

    Character variable or numeric containing the mapping quality threshold to be used. +For RRBS, set mq=0. Read mapping validity is based on read start site and nucleotides rather than mq.

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored and should be constant for all CAMDAC functions. +Do not alter the output directory structure while running CAMDAC. +The output of this function will be a sub-directory of the path variable under +"./Allelecounts/sample_id/". Do not change the directory structure as subsequent functions will +look for files in this directory.

    + + +
    path_to_CAMDAC
    +

    Character variable containing the CAMDAC installation path (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome used for alignment. +CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38".

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores for parallel processing

    + + +
    test
    +

    Logical value indicating whether this is a quick test run with data subsampling

    + +
    +
    +

    Value

    + + +

    One .fst file including methylation info at CpGs and BAF and depth of coverage at +SNPs for the ith subset of RRBS loci

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/get_cluster_counts.html b/docs/html/reference/get_cluster_counts.html new file mode 100644 index 0000000..6d69b28 --- /dev/null +++ b/docs/html/reference/get_cluster_counts.html @@ -0,0 +1,108 @@ + +Count CpGs within DMP annotations — get_cluster_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Count CpGs within DMP annotations

    +
    + +
    +
    get_cluster_counts(dt)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/get_differential_methylation.html b/docs/html/reference/get_differential_methylation.html new file mode 100644 index 0000000..9d184ab --- /dev/null +++ b/docs/html/reference/get_differential_methylation.html @@ -0,0 +1,205 @@ + +Perform differential methylation analysis on deconvolute tumour methylation rates — get_differential_methylation • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get_differential_methylation

    +
    + +
    +
    get_differential_methylation(
    +  patient_id,
    +  sample_id,
    +  sex,
    +  normal_origin_proxy_id,
    +  path,
    +  path_to_CAMDAC,
    +  build,
    +  effect_size = 0.2,
    +  prob = 0.99,
    +  min_DMP_counts_in_DMR = 5,
    +  min_consec_DMP_in_DMR = 4,
    +  n_cores,
    +  reseg = FALSE,
    +  bulk = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient id number

    + + +
    sample_id
    +

    Character variable with the tumour sample_id

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for +female or "XY" for male.

    + + +
    normal_origin_proxy_id
    +

    Character variable with the sample ID +of the normal to be used as a proxy for the tumour cell of origin in +the differential methylation analysis

    + + +
    path
    +

    Character path variable pointing to the desired working +directory. This is where the output will be stored.

    + + +
    path_to_CAMDAC
    +

    Character variable containing the path to the CAMDAC +directory including dir name (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome +used for alignment. CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38".

    + + +
    effect_size
    +

    Numerical value containing the minimum tumour-normal methylation +difference (default is 0.2)

    + + +
    prob
    +

    Numerical value representing the threshold for statistically +significant DMP (default is p=0.99)

    + + +
    min_DMP_counts_in_DMR
    +

    Numerical value representing the number of +DMPs required in a DMR

    + + +
    min_consec_DMP_in_DMR
    +

    Numerical value representing the number of +consecutive DMPs required in a DMR

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores +for parallel processing

    + + +
    reseg
    +

    Logical value should be set to FALSE. Multi-sample re-segmentation of +the copy number profiles will be available in future versions of CAMDAC.

    + + +
    bulk
    +

    Default is FALSE unless you want bulk DMP/DMR calls in addition +to CAMDAC pure tumour differential methylation analysis

    +

    Note: +Annotations include: +CGI (including shore and shelves) +gene body (intragenic, 5UTR, 3UTR, intron, exon) +promoter (2kb upstream and 500 downstream any UCSC annotated gene) +enhancer (vista and FANTOM5 annotation)

    + +
    +
    +

    Value

    + + +

    Biologically significant DMPs, DMRs

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/get_msp1_fragments.html b/docs/html/reference/get_msp1_fragments.html new file mode 100644 index 0000000..b79408f --- /dev/null +++ b/docs/html/reference/get_msp1_fragments.html @@ -0,0 +1,131 @@ + +get_msp1_fragments — get_msp1_fragments • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get msp1 fragments

    +
    + +
    +
    get_msp1_fragments(dt, build, path_to_CAMDAC, outfile)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table object containing all covered CCGGs in the sample

    + + +
    build
    +

    Character, Either "hg19", "hg38", "GRCH37","GRCH38"

    + + +
    path_to_CAMDAC
    +

    Character string containing the path to the CAMDAC dir including +dir name e.g. "~/CAMDAC/"

    + + +
    outfile
    +

    character string with output filename

    + +
    +
    +

    Author

    +

    elizabeth larose cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/get_pure_tumour_methylation.html b/docs/html/reference/get_pure_tumour_methylation.html new file mode 100644 index 0000000..75dc72e --- /dev/null +++ b/docs/html/reference/get_pure_tumour_methylation.html @@ -0,0 +1,174 @@ + +Deconvolve the pure tumour methylation rate from bisulfite sequencing data — get_pure_tumour_methylation • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get_pure_tumour_methylation

    +
    + +
    +
    get_pure_tumour_methylation(
    +  patient_id,
    +  sample_id,
    +  sex,
    +  normal_infiltrates_proxy_id,
    +  path,
    +  path_to_CAMDAC,
    +  build,
    +  n_cores,
    +  reseg = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient id number

    + + +
    sample_id
    +

    Character variable with the (control or tumour) sample_id

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for +female or "XY" for male.

    + + +
    normal_infiltrates_proxy_id
    +

    Sample ID of the matched normal control

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored and should be constant for all CAMDAC functions.

    + + +
    path_to_CAMDAC
    +

    Character variable containing the path to the CAMDAC +directory including dir name (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome +used for alignment. CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38".

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores +for parallel processing

    + + +
    reseg
    +

    Logical value should be set to FALSE. Multi-sample re-segmentation of +the copy number profiles will be available in future versions of CAMDAC.

    +

    Note: +Annotations include: +CGI (including shore and shelves) +gene body (intragenic, 5UTR, 3UTR, intron, exon) +promoter (2kb upstream and 500 downstream any UCSC annotated gene) +enhancer (vista and FANTOM5 annotation)

    + +
    +
    +

    Value

    + + +

    CAMDAC purified tumour methylation rates

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/get_reference_files.html b/docs/html/reference/get_reference_files.html new file mode 100644 index 0000000..0af3dd1 --- /dev/null +++ b/docs/html/reference/get_reference_files.html @@ -0,0 +1,108 @@ + +Get CAMDAC reference files from config — get_reference_files • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Get CAMDAC reference files from config

    +
    + +
    +
    get_reference_files(config, type_folder, glob = NULL)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/helper_camdac_pileup.html b/docs/html/reference/helper_camdac_pileup.html new file mode 100644 index 0000000..8adea9b --- /dev/null +++ b/docs/html/reference/helper_camdac_pileup.html @@ -0,0 +1,114 @@ + +Cache existing CAMDAC results into a sub-directory so that the current ones can be overwritten by the refitting pipeline Decided this is unnecessary as the initial results were so wrong. Exported only for development — helper_camdac_pileup • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Cache existing CAMDAC results into a sub-directory so that the current ones can be +overwritten by the refitting pipeline +Decided this is unnecessary as the initial results were so wrong. +Exported only for development

    +
    + +
    +
    helper_camdac_pileup(bam_file, seg, loci_dt)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/index.html b/docs/html/reference/index.html new file mode 100644 index 0000000..4a7d0fa --- /dev/null +++ b/docs/html/reference/index.html @@ -0,0 +1,194 @@ + +Function reference • CAMDAC + + +
    +
    + + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    All functions

    +

    +
    +

    CamConfig()

    +

    Set CAMDAC configuration

    +

    CamSample()

    +

    Build CAMDAC sample object

    +

    attach_output()

    +

    Manually assign output file to CAMDAC sample

    +

    cmain_bind_snps()

    +

    Bind SNPs

    +

    cmain_call_cna()

    +

    Call CNA

    +

    cmain_call_dmps()

    +

    Call tumour-normal DMPs

    +

    cmain_call_dmrs()

    +

    Call tumour-normal DMRs

    +

    cmain_count_alleles()

    +

    Count alleles

    +

    cmain_deconvolve_methylation()

    +

    Deconvolve methylation

    +

    cmain_make_methylation_profile()

    +

    Make methylation

    +

    cmain_make_snps()

    +

    Make SNPs

    +

    cmain_run_ascat()

    +

    Run ASCAT.m

    +

    cmain_run_battenberg()

    +

    Run battenberg

    +

    download_pipeline_files()

    +

    Download CAMDAC pipeline files

    +

    get_reference_files()

    +

    Get CAMDAC reference files from config

    +

    load_cna_data()

    +

    Parse ASCAT and Battenberg output directories to load CNA data

    +

    load_panel_ac_files()

    +

    Load allele count files

    +

    panel_asm_from_counts()

    +

    Panel ASM from counts Basic function to create an ASM methylation panel from allele count or ASM meth files WARNING: In active development.

    +

    panel_meth_from_beta()

    +

    Make CAMDAC methylation panel from a matrix of beta values

    +

    panel_meth_from_counts()

    +

    Make CAMDAC methylation panel from allele counts Methylation fractions are obtained by summing M and UM reads across samples

    +

    pipeline()

    +

    CAMDAC analysis pipeline

    +

    preprocess_asm()

    +

    Preprocess a list of CamSample objects for ASM analysis

    +

    preprocess_wgbs()

    +

    Preprocess a list of CamSample objects for analysis

    + + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/intervalWidth_r.html b/docs/html/reference/intervalWidth_r.html new file mode 100644 index 0000000..5f30b99 --- /dev/null +++ b/docs/html/reference/intervalWidth_r.html @@ -0,0 +1,133 @@ + +Calculate intervalWidth_r — intervalWidth_r • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Calculate intervalWidth_r

    +
    + +
    +
    intervalWidth_r(lowTailPr, ICDFname, credMass, ...)
    +
    + +
    +

    Arguments

    +
    ICDFname
    +

    is R's name for the inverse cumulative density function +of the distribution.

    + + +
    credMass
    +

    is the desired mass of the HDI region.

    + + +
    tol
    +

    is passed to R's optimize function, +the lower the tolerance, the longer the optimisation, but the higher the accuracy. +tol=1e-4 gives values of the same accuracy as our max resolution +Return value: +Highest density interval (HDI) limits in a vector. +Example of use: For determining HDI of a beta(30,12) distribution, type +HDIofICDF( qbeta , shape1 = 30+1 , shape2 = 12+1 ) +Notice that the parameters of the ICDFname must be explicitly named; +e.g., HDIofICDF( qbeta , 30+1 , 12+1 ) does not work. +Adapted and corrected from Greg Snow's TeachingDemos package. +Source fct outside of loop to speed up code

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/load_cna_data.html b/docs/html/reference/load_cna_data.html new file mode 100644 index 0000000..dfc854c --- /dev/null +++ b/docs/html/reference/load_cna_data.html @@ -0,0 +1,120 @@ + +Parse ASCAT and Battenberg output directories to load CNA data — load_cna_data • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    See "annotate_copy_number" func +A function required to load copy number for a tumour sample from camdac, either ascat or bb, +result should be: chrom, start, end, nA, nB, CN (total), seg_min and seg_max. +This should also include the purity and ploidy. As a separate list? +note that seg_min and seg_max are actually duplicates of the start and end columns, required to +keep track of the ascat segment positions after overlap +WARN: This drops sex chromosome but not implemented. Also should drop CN=0 (hom del) regions

    +
    + +
    +
    load_cna_data(tumour, config, data_type)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/load_panel_ac_files.html b/docs/html/reference/load_panel_ac_files.html new file mode 100644 index 0000000..1abd099 --- /dev/null +++ b/docs/html/reference/load_panel_ac_files.html @@ -0,0 +1,120 @@ + +Load allele count files — load_panel_ac_files • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Load allele count files

    +
    + +
    +
    load_panel_ac_files(ac_files, cores = 5)
    +
    + +
    +

    Arguments

    +
    ac_files
    +

    Allele count files from CAMDAC

    + +
    +
    +

    Value

    + + +

    List of data tables for each allele counts file

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/panel_asm_from_counts.html b/docs/html/reference/panel_asm_from_counts.html new file mode 100644 index 0000000..e0e3149 --- /dev/null +++ b/docs/html/reference/panel_asm_from_counts.html @@ -0,0 +1,122 @@ + +Panel ASM from counts Basic function to create an ASM methylation panel from allele count or ASM meth files WARNING: In active development. — panel_asm_from_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Panel ASM from counts +Basic function to create an ASM methylation panel from allele count or ASM meth files +WARNING: In active development.

    +
    + +
    +
    panel_asm_from_counts(c1, c2)
    +
    + +
    +

    Arguments

    +
    c1
    +

    First ASM allele counts file to merge

    + + +
    c2
    +

    Second ASM allele counts file to merge

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/panel_meth_from_beta.html b/docs/html/reference/panel_meth_from_beta.html new file mode 100644 index 0000000..90c3d1e --- /dev/null +++ b/docs/html/reference/panel_meth_from_beta.html @@ -0,0 +1,152 @@ + +Make CAMDAC methylation panel from a matrix of beta values — panel_meth_from_beta • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Make CAMDAC methylation panel from a matrix of beta values

    +
    + +
    +
    panel_meth_from_beta(
    +  mat,
    +  chrom,
    +  start,
    +  end,
    +  cov,
    +  props,
    +  cores,
    +  min_samples = 1,
    +  max_sd = 1
    +)
    +
    + +
    +

    Arguments

    +
    mat
    +

    Matrix of beta values. Rows are CpGs, columns are samples

    + + +
    chrom
    +

    Vector of chromosome names

    + + +
    start
    +

    Vector of CpG start positions

    + + +
    end
    +

    Vector of CpG end positions

    + + +
    cov
    +

    Vector of coverage values to give each CpG site. If a matrix is provided, coverage is calculated as the sum of reads for each site.

    + + +
    cores
    +

    Number of cores to use for calculating HDI

    + + +
    min_samples
    +

    Minimum number of samples that must have a non-NA value for a CpG site to be included in panel

    + + +
    max_sd
    +

    Maximum standard deviation of methylation for a site to be included in panel.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/panel_meth_from_counts.html b/docs/html/reference/panel_meth_from_counts.html new file mode 100644 index 0000000..a3dd576 --- /dev/null +++ b/docs/html/reference/panel_meth_from_counts.html @@ -0,0 +1,150 @@ + +Make CAMDAC methylation panel from allele counts Methylation fractions are obtained by summing M and UM reads across samples — panel_meth_from_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Make CAMDAC methylation panel from allele counts +Methylation fractions are obtained by summing M and UM reads across samples

    +
    + +
    +
    panel_meth_from_counts(
    +  ac_files,
    +  ac_props = NULL,
    +  min_coverage = 3,
    +  min_samples = 1,
    +  max_sd = 1,
    +  drop_snps = FALSE,
    +  cores = 5
    +)
    +
    + +
    +

    Arguments

    +
    ac_files
    +

    Allele count files from CAMDAC

    + + +
    ac_props
    +

    Proportions of each sample to use in panel. If NULL, samples are weighted by their +total number of reads, which equals the sum of M and UM counts. If samples are NA, then +proportions are redistributed.

    + + +
    min_coverage
    +

    Minimum coverage for a sample's site to be included in panel

    + + +
    min_samples
    +

    Minimum number of samples with coverage for a site to be included in panel

    + + +
    max_sd
    +

    Maximum standard deviation of methylation for a site to be included in panel

    + + +
    drop_snps
    +

    Boolean. If TRUE, drop per-sample CG-SNPs (BAF < 0.1 or BAF > 0.9) from panel

    + + +
    cores
    +

    Number of cores to use for calculating HDI

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/pipeline.html b/docs/html/reference/pipeline.html new file mode 100644 index 0000000..54c96f7 --- /dev/null +++ b/docs/html/reference/pipeline.html @@ -0,0 +1,130 @@ + +CAMDAC analysis pipeline — pipeline • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    CAMDAC analysis pipeline

    +
    + +
    +
    pipeline(tumor, germline, infiltrates, origin, config)
    +
    + +
    +

    Arguments

    +
    tumor
    +

    Tumor CamSample() object for deconvolution.

    + + +
    germline
    +

    Patient-matched normal CamSample() object. May be NULL if tumor has CNA calls already.

    + + +
    infiltrates
    +

    Normal CamSample() as a proxy for infiltrating normal methylation.

    + + +
    origin
    +

    Normal CamSample() representing cell of origin for tumor-normal differential methylation.

    + + +
    config
    +

    Configuration built with CamConfig().

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/pipeline_rrbs.html b/docs/html/reference/pipeline_rrbs.html new file mode 100644 index 0000000..2136265 --- /dev/null +++ b/docs/html/reference/pipeline_rrbs.html @@ -0,0 +1,130 @@ + +Call CAMDAC for a tumor and patient-matched normal sample — pipeline_rrbs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Call CAMDAC for a tumor and patient-matched normal sample

    +
    + +
    +
    pipeline_rrbs(tumor, germline, infiltrates, origin, config)
    +
    + +
    +

    Arguments

    +
    tumor
    +

    Tumor CamSample object for deconvolution.

    + + +
    germline
    +

    Patient-matched normal CamSample object. May be NULL if tumor has CNA calls already.

    + + +
    infiltrates
    +

    Normal CamSample as a proxy for infiltrating normal methylation.

    + + +
    origin
    +

    Normal CamSample representing cell of origin for tumor-normal differential methylation.

    + + +
    config
    +

    Configuration built with CamConfig().

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/pipeline_wgbs.html b/docs/html/reference/pipeline_wgbs.html new file mode 100644 index 0000000..9635fb1 --- /dev/null +++ b/docs/html/reference/pipeline_wgbs.html @@ -0,0 +1,136 @@ + +Run CAMDAC WGBS analysis on a bulk tumor and patient-matched tissue-matched tumor-adjacent normal sample. — pipeline_wgbs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Run CAMDAC WGBS analysis on a bulk tumor and patient-matched tissue-matched tumor-adjacent normal sample.

    +
    + +
    +
    pipeline_wgbs(
    +  tumor,
    +  germline = NULL,
    +  infiltrates = NULL,
    +  origin = NULL,
    +  config
    +)
    +
    + +
    +

    Arguments

    +
    tumor
    +

    Tumor CamSample object for deconvolution.

    + + +
    germline
    +

    Patient-matched normal CamSample object. May be NULL if tumor has CNA calls already.

    + + +
    infiltrates
    +

    Normal CamSample as a proxy for infiltrating normal methylation.

    + + +
    origin
    +

    Normal CamSample representing cell of origin for tumor-normal differential methylation.

    + + +
    config
    +

    Configuration built with CamConfig().

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/plot_2d_density.html b/docs/html/reference/plot_2d_density.html new file mode 100644 index 0000000..7e8d75a --- /dev/null +++ b/docs/html/reference/plot_2d_density.html @@ -0,0 +1,119 @@ + +plot_2d_density — plot_2d_density • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    plot_2d_density

    +
    + +
    +
    plot_2d_density(dt, path)
    +
    + +
    +

    Arguments

    +
    dt
    +

    Data table with methylation information per CpG

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored and should be constant for all CAMDAC functions.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/plot_BAF_and_LogR.html b/docs/html/reference/plot_BAF_and_LogR.html new file mode 100644 index 0000000..9f1768b --- /dev/null +++ b/docs/html/reference/plot_BAF_and_LogR.html @@ -0,0 +1,124 @@ + +Plot BAF and logR profiles with ggplot — plot_BAF_and_LogR • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot BAF and logR profiles with ggplot

    +
    + +
    +
    plot_BAF_and_LogR(dt, outfile, downsample = 100000)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.frame with methylation info

    + + +
    outfile
    +

    character string with output pdf filename +Saves a pdf w/ methylation rate distribution, biases at polymorphic and +non-polymorphic CG/CCGG and coverage distribution

    + +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/plot_SNP_info.html b/docs/html/reference/plot_SNP_info.html new file mode 100644 index 0000000..72486dd --- /dev/null +++ b/docs/html/reference/plot_SNP_info.html @@ -0,0 +1,128 @@ + +Plot SNP data summary and QC — plot_SNP_info • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    plot_SNP_info plots SNP QC

    +
    + +
    +
    plot_SNP_info(dt, outfile, min)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table with SNP info

    + + +
    outfile
    +

    character string with output pdf filename

    + +
    +
    +

    Value

    + + +

    pdf

    +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/plot_methylation_info.html b/docs/html/reference/plot_methylation_info.html new file mode 100644 index 0000000..ab0d020 --- /dev/null +++ b/docs/html/reference/plot_methylation_info.html @@ -0,0 +1,140 @@ + +Plot Methylation — plot_methylation_info • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Creates table grob in format that is most common for my usage.

    +
    + +
    +
    plot_methylation_info(df_sample, outfile)
    +
    + +
    +

    Arguments

    +
    df_sample
    +

    data.frame with methylation info

    + + +
    outfile
    +

    character string with output pdf filename

    + + +
    dt
    +

    Data.table that the grob will be made out of

    + + +
    title_v
    +

    Title for display

    + + +
    fontsize_v
    +

    Fontsize for title. Default is 14 (goes well with my_theme)

    + +
    +
    +

    Value

    + + +

    pdf w/ methylation rate distribution, biases at polymorphic and non-polymorphic CG/CCGG and coverage distribution

    +
    +
    +

    Details

    +

    plot_methylation_info returns the df_sample with annotated q-value for each CpG

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/plot_methylation_info_with_anno.html b/docs/html/reference/plot_methylation_info_with_anno.html new file mode 100644 index 0000000..fbe4d55 --- /dev/null +++ b/docs/html/reference/plot_methylation_info_with_anno.html @@ -0,0 +1,122 @@ + +Plot methylation information — plot_methylation_info_with_anno • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot methylation information

    +
    + +
    +
    plot_methylation_info_with_anno(dt, path, bulk)
    +
    + +
    +

    Arguments

    +
    dt
    +

    Data table with methylation information per CpG

    + + +
    path
    +

    Character path variable pointing to the desired working directory.

    + + +
    bulk
    +

    Logical determining whether the bulk or purified tumour is to be plotted

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/plot_normal_SNP_info.html b/docs/html/reference/plot_normal_SNP_info.html new file mode 100644 index 0000000..b22d60e --- /dev/null +++ b/docs/html/reference/plot_normal_SNP_info.html @@ -0,0 +1,128 @@ + +Plot plots SNP QC — plot_normal_SNP_info • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot SNP QC

    +
    + +
    +
    plot_normal_SNP_info(dt, outfile, min)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table with SNP info

    + + +
    outfile
    +

    character string with output pdf filename

    + +
    +
    +

    Value

    + + +

    pdf

    +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/preprocess_asm.html b/docs/html/reference/preprocess_asm.html new file mode 100644 index 0000000..e791170 --- /dev/null +++ b/docs/html/reference/preprocess_asm.html @@ -0,0 +1,118 @@ + +Preprocess a list of CamSample objects for ASM analysis — preprocess_asm • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Preprocess a list of CamSample objects for ASM analysis

    +
    + +
    +
    preprocess_asm(sample_list, config)
    +
    + +
    +

    Arguments

    +
    sample_list
    +

    List of CamSample objects.

    + + +
    config
    +

    CamConfig object.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/preprocess_wgbs.html b/docs/html/reference/preprocess_wgbs.html new file mode 100644 index 0000000..987628b --- /dev/null +++ b/docs/html/reference/preprocess_wgbs.html @@ -0,0 +1,118 @@ + +Preprocess a list of CamSample objects for analysis — preprocess_wgbs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Preprocess a list of CamSample objects for analysis

    +
    + +
    +
    preprocess_wgbs(sample_list, config)
    +
    + +
    +

    Arguments

    +
    sample_list
    +

    List of CamSample objects.

    + + +
    config
    +

    CamConfig object.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/remove_low_cov_singletons.html b/docs/html/reference/remove_low_cov_singletons.html new file mode 100644 index 0000000..28e68cc --- /dev/null +++ b/docs/html/reference/remove_low_cov_singletons.html @@ -0,0 +1,112 @@ + +remove_low_cov_singletons — remove_low_cov_singletons • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Remove low coverage singletons outliers

    +
    + +
    +
    remove_low_cov_singletons(dt_sample_SNPs, min)
    +
    + +
    +

    Author

    +

    Elizabeth larose cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/round2.html b/docs/html/reference/round2.html new file mode 100644 index 0000000..51b4cf8 --- /dev/null +++ b/docs/html/reference/round2.html @@ -0,0 +1,145 @@ + +Round2 — round2 • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Round numerical values to 'n' digits

    +

    Round numerical values to 'n' digits

    +

    Round numerical values to 'n' digits

    +

    Round numerical values to 'n' digits

    +
    + +
    +
    round2(x, digits)
    +
    +round2(x, digits)
    +
    +round2(x, digits)
    +
    +round2(x, digits)
    +
    + +
    +

    Arguments

    +
    x
    +

    Numerical vector containing the numbers to round

    + + +
    digits
    +

    Numerical value representing the number of decimal digits to retain

    + +
    +
    +

    Value

    + + +

    rounded numerical vector

    + + +

    rounded numerical vector

    + + +

    rounded numerical vector

    + + +

    rounded numerical vector

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/run_ASCAT.m.html b/docs/html/reference/run_ASCAT.m.html new file mode 100644 index 0000000..b374b69 --- /dev/null +++ b/docs/html/reference/run_ASCAT.m.html @@ -0,0 +1,183 @@ + +Obtain allele-specific copy number profiles, tumour purity and plot SNP data — run_ASCAT.m • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    run_ASCAT.m

    +
    + +
    +
    run_ASCAT.m(
    +  patient_id,
    +  sample_id,
    +  sex,
    +  patient_matched_normal_id = NULL,
    +  path,
    +  path_to_CAMDAC,
    +  build,
    +  min_normal = 10,
    +  min_tumour = 1,
    +  n_cores = 1,
    +  reference_panel_coverage = NULL
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient ID number

    + + +
    sample_id
    +

    Character variable with the (control or tumour) sample_id

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for female +or "XY" for male. +This is important for copy number profiling. If sex is unknown, put "XY" for now, +then look at the allelic imbalance (BAF) on X in the germline outside pseudo- +autosomal regions. If there are few to no heterozygous SNPs, the sample is likely male.

    + + +
    patient_matched_normal_id
    +

    Character variable with the sample ID of the matched normal control

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored. +IMPORTANT: The function output directory will be in the path variable working +directory under "./Copy_number/sample_id/".

    + + +
    path_to_CAMDAC
    +

    Character variable containing the path to the CAMDAC dir +including dir name (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome used for alignment. +CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38".

    + + +
    min_normal
    +

    Numerical value corresponding to the minimum counts for germline +SNPs to be included (default: 10)

    + + +
    min_tumour
    +

    Numerical value corresponding to the minimum counts in the tumour +sample for germline SNPs to be included (default: 1)

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores for parallel processing

    + + +
    reference_panel_coverage
    +

    Path to the reference panel for the coverage.

    + +
    +
    +

    Value

    + + +

    Three text files with all the CpG loci and their SNP and/or CpG methylation info

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/run_methylation_data_processing.html b/docs/html/reference/run_methylation_data_processing.html new file mode 100644 index 0000000..7145d5c --- /dev/null +++ b/docs/html/reference/run_methylation_data_processing.html @@ -0,0 +1,192 @@ + +Filter bulk tumour and normal methylation data, get methylation rate highest density interval (HDI) and plot raw methylation info run_methylation_data_processing — run_methylation_data_processing • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Filter bulk tumour and normal methylation data, get methylation rate highest density interval (HDI) +and plot raw methylation info +run_methylation_data_processing

    +
    + +
    +
    run_methylation_data_processing(
    +  patient_id,
    +  sample_id,
    +  normal_infiltrates_proxy_id,
    +  normal_origin_proxy_id,
    +  path,
    +  min_normal = 10,
    +  min_tumour = 3,
    +  n_cores,
    +  reference_panel_normal_infiltrates = NULL,
    +  reference_panel_normal_origin = NULL
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient ID

    + + +
    sample_id
    +

    Character variable with the (control or tumour) sample ID

    + + +
    normal_infiltrates_proxy_id
    +

    Character variable with the sample ID of +the tissue-matched normal acting as proxy for the tumour infiltrating +normal cells. Ideally, this is a patient and tissue-matched tumour adjacent normal sample.

    + + +
    normal_origin_proxy_id
    +

    Character variable with the sample ID +of the normal to be used as a proxy for the tumour cell of origin in +differential methylation analyses.

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored.

    + + +
    min_normal
    +

    Numerical value corresponding to the minimum counts threshold for +the normal CpGs to be included

    + + +
    min_tumour
    +

    Numerical value corresponding to the minimum counts threshold for +the tumour sample CpGs to be included

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores for parallel processing

    + + +
    reference_panel_normal_infiltrates
    +

    Default is NULL. Character string with the complete +path to a reference methylation profile for the tumour normal infiltrates as a .fst file.

    + + +
    reference_panel_normal_origin
    +

    Default is NULL. Character string with the complete +path to your reference methylation profile for the tumour cell of origin as a .fst file.

    +

    If a patient-matched proxy for the normal infiltrates and/or the normal cell of origin is not +available, a reference panel may be constructed from different individuals and used as a substitute.

    +

    The reference samples should be at the very least sex-matched.

    +

    The reference should be saved as a .fst file with the following columns: +CHR start end M_n UM_n m_n cov_n +

    +

    where each row is a CpG or CCpGG with coordinates CHR:start-end. +The start and end columns correspond to the 5'-C and 3'-G coordinates, respectively. +M_n is the number of reads supporting the methylated allele. +UM_n is the number of reads supporting the unmethylated allele. +m_n is the normal methylation rate (M_n / (M_n+UM_n)). +cov_n is the total count of CpG methylation-informative reads (M_n+UM_n).

    + +
    +
    +

    Value

    + + +

    GRanges object in .RData file

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/sort_genomic_dt.html b/docs/html/reference/sort_genomic_dt.html new file mode 100644 index 0000000..e60d63a --- /dev/null +++ b/docs/html/reference/sort_genomic_dt.html @@ -0,0 +1,122 @@ + +sort_genomic_dt — sort_genomic_dt • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    sort_genomic_dt

    +

    Sort a data table with genomic coordinates

    +
    + +
    +
    sort_genomic_dt(dt, with_chr = F)
    +
    +sort_genomic_dt(dt, with_chr = F)
    +
    + +
    +

    Arguments

    +
    dt
    +

    An object that is a data.table

    + + +
    with_chr
    +

    A boolean to indicate whether the chrom field has UCSC (TRUE) or NCBI (FALSE) format

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/reference/split_segments_gr.html b/docs/html/reference/split_segments_gr.html new file mode 100644 index 0000000..bdf3dd0 --- /dev/null +++ b/docs/html/reference/split_segments_gr.html @@ -0,0 +1,118 @@ + +Split genome into segments for allele counting — split_segments_gr • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Split genome into segments for allele counting

    +
    + +
    +
    split_segments_gr(segments_file, n_seg_split)
    +
    + +
    +

    Arguments

    +
    segments_file
    +

    An RDS file containing a GRanges object with each chromosome region to split

    + + +
    n_seg_split
    +

    An integer to split each chromosome segment

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/html/sitemap.xml b/docs/html/sitemap.xml new file mode 100644 index 0000000..35f63bf --- /dev/null +++ b/docs/html/sitemap.xml @@ -0,0 +1,246 @@ + + + + /404.html + + + /DEV.html + + + /LICENSE-text.html + + + /articles/contributing.html + + + /articles/experimental.html + + + /articles/index.html + + + /articles/introduction.html + + + /articles/output.html + + + /articles/pipeline.html + + + /articles/questions.html + + + /articles/setup.html + + + /articles/technical.html + + + /authors.html + + + /index.html + + + /news/index.html + + + /reference/CamConfig.html + + + /reference/CamSample.html + + + /reference/HDIofICDF.html + + + /reference/HDIofMCMC.html + + + /reference/HDIofMCMC_mt.html + + + /reference/LogR_correction.html + + + /reference/annotate_copy_number.html + + + /reference/ascat.m.plotRawData.html + + + /reference/ascat.m.plotSegmentedData.html + + + /reference/ascat.plotRawData.flags.html + + + /reference/asm_pipeline.html + + + /reference/attach_output.html + + + /reference/bin_CpGs.html + + + /reference/calculate_m_t_hdi.html + + + /reference/call_dmps.html + + + /reference/call_dmr_routine.html + + + /reference/camdac_to_battenberg_prepare_wgbs.html + + + /reference/cmain_bind_snps.html + + + /reference/cmain_call_cna.html + + + /reference/cmain_call_dmps.html + + + /reference/cmain_call_dmrs.html + + + /reference/cmain_count_alleles.html + + + /reference/cmain_deconvolve_methylation.html + + + /reference/cmain_make_methylation_profile.html + + + /reference/cmain_make_snps.html + + + /reference/cmain_run_ascat.html + + + /reference/cmain_run_battenberg.html + + + /reference/collapse_cpg_to_dmr.html + + + /reference/compute_tumour_methylome.html + + + /reference/cwrap_asm_get_allele_counts.html + + + /reference/download_pipeline_files.html + + + /reference/format_methylation_df.html + + + /reference/format_output.html + + + /reference/get_DMPs.html + + + /reference/get_DMRs.html + + + 
/reference/get_allele_counts.html + + + /reference/get_cluster_counts.html + + + /reference/get_differential_methylation.html + + + /reference/get_msp1_fragments.html + + + /reference/get_pure_tumour_methylation.html + + + /reference/get_reference_files.html + + + /reference/helper_camdac_pileup.html + + + /reference/index.html + + + /reference/intervalWidth_r.html + + + /reference/load_cna_data.html + + + /reference/load_panel_ac_files.html + + + /reference/panel_asm_from_counts.html + + + /reference/panel_meth_from_beta.html + + + /reference/panel_meth_from_counts.html + + + /reference/pipeline.html + + + /reference/pipeline_rrbs.html + + + /reference/pipeline_wgbs.html + + + /reference/plot_2d_density.html + + + /reference/plot_BAF_and_LogR.html + + + /reference/plot_SNP_info.html + + + /reference/plot_methylation_info.html + + + /reference/plot_methylation_info_with_anno.html + + + /reference/plot_normal_SNP_info.html + + + /reference/preprocess_asm.html + + + /reference/preprocess_wgbs.html + + + /reference/remove_low_cov_singletons.html + + + /reference/round2.html + + + /reference/run_ASCAT.m.html + + + /reference/run_methylation_data_processing.html + + + /reference/sort_genomic_dt.html + + + /reference/split_segments_gr.html + + diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..010f60c --- /dev/null +++ b/docs/index.html @@ -0,0 +1,220 @@ + + + + + + + +Copy-number Aware Methylation Deconvolution and Analysis of Cancers • CAMDAC + + + + + + + + + + + + +
    +
    + + + + +
    +
    +
    + +

    Copy-number Aware Methylation Deconvolution and Analysis of Cancers (CAMDAC) is an R library for deconvolving bulk tumor DNA methylation (bisulfite) sequencing data (Larose Cadieux et al., 2022, bioRxiv).

    +
    +

    Documentation +

    +

    Visit https://vanloo-lab.github.io/CAMDAC/.

    +
    +
    +

    Installation : Dockerhub +

    +

    A CAMDAC container is available on dockerhub for use with Docker, Singularity or Apptainer:

    +
    docker pull nmensah5/camdac:latest
    +echo "library(CAMDAC)" > commands.R
    +docker run -v $(pwd):/app nmensah5/camdac:latest Rscript commands.R
    +
    +
    +

    Installation : Github +

    +

    You can install CAMDAC and its dependencies from an R console:

    +
    +install.packages("remotes")
    +remotes::install_github("VanLoo-lab/CAMDAC")
    +
    +
    +

    Quickstart +

    +

    We provide pre-built reference datasets for hg38 and hg19. These files are required to run CAMDAC for either RRBS or WGBS analysis and are available from the Zenodo repository (10565423). An R getter function is provided for convenience:

    +
    +CAMDAC::download_pipeline_files(bsseq = "rrbs", directory = "./refs")
    +CAMDAC::download_pipeline_files(bsseq = "wgbs", directory = "./refs")
    +

    For WGBS analysis, CAMDAC requires the java command line utility to be available in the system PATH.

    +

    With reference files downloaded, run the tumor-normal deconvolution pipeline with test data:

    +

    Note:
    +We provide downsampled BAM files for testing the pipeline. For representative results, please use your own BAM files.

    +
    +library(CAMDAC)
    +
    +tumor_bam <- system.file("testdata", "tumour_beds_min.sorted.bam", package = "CAMDAC")
    +normal_bam <- system.file("testdata", "normal_beds_min.sorted.bam", package = "CAMDAC")
    +
    +# Select samples for basic tumor-normal analysis
    +tumor <- CamSample(id = "T", sex = "XY", bam = tumor_bam, patient_id="readme")
    +normal <- CamSample(id = "N", sex = "XY", bam = normal_bam, patient_id="readme")
    +
    +# Configure pipeline
    +config <- CamConfig(
    +  outdir = "./validation/results/test_readme/", bsseq = "rrbs", lib = "pe",
    +  build = "hg38", refs = "./refs", n_cores = 1, cna_caller='ascat',
    +  min_cov=1, # Minimum tumour coverage at 1 for testing.
    +  min_normal_cov=1, # Minimum normal coverage at 1 for testing. 
    +  min_mapq=1 # Minimum MAPQ at 1 for testing.
    +)
    +
    +# Run CAMDAC
    +CAMDAC::pipeline(
    +  tumor, germline = normal, infiltrates = normal, origin = normal, config
    +)
    +

    For a more detailed walkthrough with test data, see vignette("pipeline").

    +
    +
    +

    Contributing +

    +

    To contribute to CAMDAC, fork the repository and install the development dependencies with remotes::install_dev_deps('.').

    +

    After making your changes, run the build and test commands listed in vignette("contributing").

    +

    Finally, submit a pull request with the changes on your fork.

    +
    +
    +
    + + +
    + + +
    + +
    +

    +

    Site built with pkgdown 2.0.9.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/link.svg b/docs/link.svg new file mode 100644 index 0000000..88ad827 --- /dev/null +++ b/docs/link.svg @@ -0,0 +1,12 @@ + + + + + + diff --git a/docs/news/index.html b/docs/news/index.html new file mode 100644 index 0000000..4c60cb1 --- /dev/null +++ b/docs/news/index.html @@ -0,0 +1,109 @@ + +Changelog • CAMDAC + + +
    +
    + + + +
    +
    + + +
    + +
    • Integrated RRBS and WGBS analysis under a single call to the pipeline() function.
    • +
    • Added option for RRBS paired end analysis
    • +
    • Upgraded ASCAT to version 3.2.0 (commit: 44ddd3080723a2c3640d1cfead13437a093c21d1)
    • +
    +
    + +
    • Minor documentation updates.
    +
    + + + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/pkgdown.css b/docs/pkgdown.css new file mode 100644 index 0000000..80ea5b8 --- /dev/null +++ b/docs/pkgdown.css @@ -0,0 +1,384 @@ +/* Sticky footer */ + +/** + * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ + * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css + * + * .Site -> body > .container + * .Site-content -> body > .container .row + * .footer -> footer + * + * Key idea seems to be to ensure that .container and __all its parents__ + * have height set to 100% + * + */ + +html, body { + height: 100%; +} + +body { + position: relative; +} + +body > .container { + display: flex; + height: 100%; + flex-direction: column; +} + +body > .container .row { + flex: 1 0 auto; +} + +footer { + margin-top: 45px; + padding: 35px 0 36px; + border-top: 1px solid #e5e5e5; + color: #666; + display: flex; + flex-shrink: 0; +} +footer p { + margin-bottom: 0; +} +footer div { + flex: 1; +} +footer .pkgdown { + text-align: right; +} +footer p { + margin-bottom: 0; +} + +img.icon { + float: right; +} + +/* Ensure in-page images don't run outside their container */ +.contents img { + max-width: 100%; + height: auto; +} + +/* Fix bug in bootstrap (only seen in firefox) */ +summary { + display: list-item; +} + +/* Typographic tweaking ---------------------------------*/ + +.contents .page-header { + margin-top: calc(-60px + 1em); +} + +dd { + margin-left: 3em; +} + +/* Section anchors ---------------------------------*/ + +a.anchor { + display: none; + margin-left: 5px; + width: 20px; + height: 20px; + + background-image: url(./link.svg); + background-repeat: no-repeat; + background-size: 20px 20px; + background-position: center center; +} + +h1:hover .anchor, +h2:hover .anchor, +h3:hover .anchor, +h4:hover .anchor, +h5:hover .anchor, +h6:hover .anchor { + display: inline-block; +} + +/* Fixes for fixed navbar --------------------------*/ + +.contents h1, 
.contents h2, .contents h3, .contents h4 { + padding-top: 60px; + margin-top: -40px; +} + +/* Navbar submenu --------------------------*/ + +.dropdown-submenu { + position: relative; +} + +.dropdown-submenu>.dropdown-menu { + top: 0; + left: 100%; + margin-top: -6px; + margin-left: -1px; + border-radius: 0 6px 6px 6px; +} + +.dropdown-submenu:hover>.dropdown-menu { + display: block; +} + +.dropdown-submenu>a:after { + display: block; + content: " "; + float: right; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; + border-width: 5px 0 5px 5px; + border-left-color: #cccccc; + margin-top: 5px; + margin-right: -10px; +} + +.dropdown-submenu:hover>a:after { + border-left-color: #ffffff; +} + +.dropdown-submenu.pull-left { + float: none; +} + +.dropdown-submenu.pull-left>.dropdown-menu { + left: -100%; + margin-left: 10px; + border-radius: 6px 0 6px 6px; +} + +/* Sidebar --------------------------*/ + +#pkgdown-sidebar { + margin-top: 30px; + position: -webkit-sticky; + position: sticky; + top: 70px; +} + +#pkgdown-sidebar h2 { + font-size: 1.5em; + margin-top: 1em; +} + +#pkgdown-sidebar h2:first-child { + margin-top: 0; +} + +#pkgdown-sidebar .list-unstyled li { + margin-bottom: 0.5em; +} + +/* bootstrap-toc tweaks ------------------------------------------------------*/ + +/* All levels of nav */ + +nav[data-toggle='toc'] .nav > li > a { + padding: 4px 20px 4px 6px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; +} + +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 5px; + color: inherit; + border-left: 1px solid #878787; +} + +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 5px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; + border-left: 2px solid #878787; +} + +/* Nav: second level (shown on .active) */ + +nav[data-toggle='toc'] .nav .nav { + 
display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} + +nav[data-toggle='toc'] .nav .nav > li > a { + padding-left: 16px; + font-size: 1.35rem; +} + +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 15px; +} + +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 15px; + font-weight: 500; + font-size: 1.35rem; +} + +/* orcid ------------------------------------------------------------------- */ + +.orcid { + font-size: 16px; + color: #A6CE39; + /* margins are required by official ORCID trademark and display guidelines */ + margin-left:4px; + margin-right:4px; + vertical-align: middle; +} + +/* Reference index & topics ----------------------------------------------- */ + +.ref-index th {font-weight: normal;} + +.ref-index td {vertical-align: top; min-width: 100px} +.ref-index .icon {width: 40px;} +.ref-index .alias {width: 40%;} +.ref-index-icons .alias {width: calc(40% - 40px);} +.ref-index .title {width: 60%;} + +.ref-arguments th {text-align: right; padding-right: 10px;} +.ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} +.ref-arguments .name {width: 20%;} +.ref-arguments .desc {width: 80%;} + +/* Nice scrolling for wide elements --------------------------------------- */ + +table { + display: block; + overflow: auto; +} + +/* Syntax highlighting ---------------------------------------------------- */ + +pre, code, pre code { + background-color: #f8f8f8; + color: #333; +} +pre, pre code { + white-space: pre-wrap; + word-break: break-all; + overflow-wrap: break-word; +} + +pre { + border: 1px solid #eee; +} + +pre .img, pre .r-plt { + margin: 5px 0; +} + +pre .img img, pre .r-plt img { + background-color: #fff; +} + +code a, pre a { + color: #375f84; +} + +a.sourceLine:hover { + text-decoration: none; +} + +.fl 
{color: #1514b5;} +.fu {color: #000000;} /* function */ +.ch,.st {color: #036a07;} /* string */ +.kw {color: #264D66;} /* keyword */ +.co {color: #888888;} /* comment */ + +.error {font-weight: bolder;} +.warning {font-weight: bolder;} + +/* Clipboard --------------------------*/ + +.hasCopyButton { + position: relative; +} + +.btn-copy-ex { + position: absolute; + right: 0; + top: 0; + visibility: hidden; +} + +.hasCopyButton:hover button.btn-copy-ex { + visibility: visible; +} + +/* headroom.js ------------------------ */ + +.headroom { + will-change: transform; + transition: transform 200ms linear; +} +.headroom--pinned { + transform: translateY(0%); +} +.headroom--unpinned { + transform: translateY(-100%); +} + +/* mark.js ----------------------------*/ + +mark { + background-color: rgba(255, 255, 51, 0.5); + border-bottom: 2px solid rgba(255, 153, 51, 0.3); + padding: 1px; +} + +/* vertical spacing after htmlwidgets */ +.html-widget { + margin-bottom: 10px; +} + +/* fontawesome ------------------------ */ + +.fab { + font-family: "Font Awesome 5 Brands" !important; +} + +/* don't display links in code chunks when printing */ +/* source: https://stackoverflow.com/a/10781533 */ +@media print { + code a:link:after, code a:visited:after { + content: ""; + } +} + +/* Section anchors --------------------------------- + Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 +*/ + +div.csl-bib-body { } +div.csl-entry { + clear: both; +} +.hanging-indent div.csl-entry { + margin-left:2em; + text-indent:-2em; +} +div.csl-left-margin { + min-width:2em; + float:left; +} +div.csl-right-inline { + margin-left:2em; + padding-left:1em; +} +div.csl-indent { + margin-left: 2em; +} diff --git a/docs/pkgdown.js b/docs/pkgdown.js new file mode 100644 index 0000000..6f0eee4 --- /dev/null +++ b/docs/pkgdown.js @@ -0,0 +1,108 @@ +/* http://gregfranko.com/blog/jquery-best-practices/ */ +(function($) { + $(function() { + + $('.navbar-fixed-top').headroom(); + + 
$('body').css('padding-top', $('.navbar').height() + 10); + $(window).resize(function(){ + $('body').css('padding-top', $('.navbar').height() + 10); + }); + + $('[data-toggle="tooltip"]').tooltip(); + + var cur_path = paths(location.pathname); + var links = $("#navbar ul li a"); + var max_length = -1; + var pos = -1; + for (var i = 0; i < links.length; i++) { + if (links[i].getAttribute("href") === "#") + continue; + // Ignore external links + if (links[i].host !== location.host) + continue; + + var nav_path = paths(links[i].pathname); + + var length = prefix_length(nav_path, cur_path); + if (length > max_length) { + max_length = length; + pos = i; + } + } + + // Add class to parent
  • , and enclosing
  • if in dropdown + if (pos >= 0) { + var menu_anchor = $(links[pos]); + menu_anchor.parent().addClass("active"); + menu_anchor.closest("li.dropdown").addClass("active"); + } + }); + + function paths(pathname) { + var pieces = pathname.split("/"); + pieces.shift(); // always starts with / + + var end = pieces[pieces.length - 1]; + if (end === "index.html" || end === "") + pieces.pop(); + return(pieces); + } + + // Returns -1 if not found + function prefix_length(needle, haystack) { + if (needle.length > haystack.length) + return(-1); + + // Special case for length-0 haystack, since for loop won't run + if (haystack.length === 0) { + return(needle.length === 0 ? 0 : -1); + } + + for (var i = 0; i < haystack.length; i++) { + if (needle[i] != haystack[i]) + return(i); + } + + return(haystack.length); + } + + /* Clipboard --------------------------*/ + + function changeTooltipMessage(element, msg) { + var tooltipOriginalTitle=element.getAttribute('data-original-title'); + element.setAttribute('data-original-title', msg); + $(element).tooltip('show'); + element.setAttribute('data-original-title', tooltipOriginalTitle); + } + + if(ClipboardJS.isSupported()) { + $(document).ready(function() { + var copyButton = ""; + + $("div.sourceCode").addClass("hasCopyButton"); + + // Insert copy buttons: + $(copyButton).prependTo(".hasCopyButton"); + + // Initialize tooltips: + $('.btn-copy-ex').tooltip({container: 'body'}); + + // Initialize clipboard: + var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { + text: function(trigger) { + return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); + } + }); + + clipboardBtnCopies.on('success', function(e) { + changeTooltipMessage(e.trigger, 'Copied!'); + e.clearSelection(); + }); + + clipboardBtnCopies.on('error', function() { + changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); + }); + }); + } +})(window.jQuery || window.$) diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml new file mode 100644 
index 0000000..318f131 --- /dev/null +++ b/docs/pkgdown.yml @@ -0,0 +1,14 @@ +pandoc: 2.9.2.1 +pkgdown: 2.0.9 +pkgdown_sha: ~ +articles: + contributing: contributing.html + experimental: experimental.html + introduction: introduction.html + output: output.html + pipeline: pipeline.html + questions: questions.html + setup: setup.html + technical: technical.html +last_built: 2025-09-22T21:53Z + diff --git a/docs/reference/CamConfig.html b/docs/reference/CamConfig.html new file mode 100644 index 0000000..5d0a3e1 --- /dev/null +++ b/docs/reference/CamConfig.html @@ -0,0 +1,176 @@ + +Set CAMDAC configuration — CamConfig • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Set CAMDAC configuration

    +
    + +
    +
    CamConfig(
    +  outdir,
    +  bsseq,
    +  lib,
    +  build,
    +  n_cores = 1,
    +  regions = NULL,
    +  refs = NULL,
    +  n_seg_split = 50,
    +  min_mapq = 1,
    +  min_cov = 1,
    +  min_normal_cov = 10,
    +  overwrite = FALSE,
    +  cna_caller = "battenberg",
    +  cna_settings = NULL
    +)
    +
    + +
    +

    Arguments

    +
    outdir
    +

    A path to save CAMDAC results. The results folder structure +follows the format PATIENT/DATASET/SAMPLE/.

    + + +
    bsseq
    +

    Bisulfite sequencing platform. Choose between "wgbs" or "rrbs".

    + + +
    lib
    +

    Bisulfite sequencing library. Choose "pe" for paired end, "se" for single end.

    + + +
    build
    +

    Reference genome build. Choose "hg38" or "hg19".

    + + +
    n_cores
    +

    Number of cores to process CAMDAC data in parallel wherever possible.

    + + +
    regions
    +

    A BED file with regions to restrict the analysis to

    + + +
    refs
    +

    Path to CAMDAC reference files. If this is not given, CAMDAC searches the +environment variable CAMDAC_PIPELINE_FILES. If this is not set, CAMDAC searches recursively in the current +working directory.

    + + +
    min_mapq
    +

    Minimum mapping quality filter used in cmain_count_alleles().

    + + +
    min_cov
    +

    Minimum coverage filter for: DNA methylation, Normal SNP selection.

    + + +
    overwrite
    +

    Config to overwrite files if they already exist.

    + + +
    cna_caller
    +

    The CNA caller to use. "ascat" or "battenberg". Default is "battenberg"

    + + +
    cna_settings
    +

    A list of settings to pass to the CNA caller. rho, psi, java, beaglemaxmem

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/CamSample.html b/docs/reference/CamSample.html new file mode 100644 index 0000000..e18f996 --- /dev/null +++ b/docs/reference/CamSample.html @@ -0,0 +1,126 @@ + +Build CAMDAC sample object — CamSample • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Build CAMDAC sample object

    +
    + +
    +
    CamSample(id, sex, bam = NULL, patient_id = "P")
    +
    + +
    +

    Arguments

    +
    id
    +

    Unique identifier for the sample

    + + +
    sex
    +

    The sex of the patient, "XX" or "XY". Required for CNA calling.

    + + +
    bam
    +

    Sample BAM file. If not given, CAMDAC expects files linked with attach_output.

    + + +
    patient_id
    +

    An identifier for the patient

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/HDIofICDF.html b/docs/reference/HDIofICDF.html new file mode 100644 index 0000000..b730bab --- /dev/null +++ b/docs/reference/HDIofICDF.html @@ -0,0 +1,135 @@ + +HDI of ICDF — HDIofICDF • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    HDI of ICDF

    +
    + +
    +
    HDIofICDF(ICDFname, credMass = 0.99, tol = 0.0001, ...)
    +
    + +
    +

    Arguments

    +
    ICDFname
    +

    The inverse cumulative density function of the distribution.

    + + +
    credMass
    +

    The desired mass of the HDI region.

    + + +
    tol
    +

    Tolerance parameter for optimisation. The lower the tolerance, the +longer the optimisation, but the higher the accuracy. +According to CAMDAC RRBS comments, tol=1e-4 gives values +of the same accuracy as our max resolution. +This function is adapted from Greg Snow's TeachingDemos package. +E.g. determine the HDI of a CpG with M=30 and UM=12. +Adding 1 to each shape parameter ensures the uniform beta(1,1) prior is updated with our counts: +HDIofICDF(qbeta,shape1 = 30+1 , shape2 = 12+1 )

    + +
    +
    +

    Value

    + + +

    Highest density interval (HDI) limits in a vector.

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/HDIofMCMC.html b/docs/reference/HDIofMCMC.html new file mode 100644 index 0000000..33850b7 --- /dev/null +++ b/docs/reference/HDIofMCMC.html @@ -0,0 +1,150 @@ + +HDI of MCMC — HDIofMCMC • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    HDI of MCMC

    +
    + +
    +
    HDIofMCMC(M_b, UM_b, M_n, UM_n, p, CN, CN_n, credMass = 0.99)
    +
    + +
    +

    Arguments

    +
    M_b
    +

    counts methylated in the tumour

    + + +
    UM_b
    +

    counts unmethylated in the tumour

    + + +
    M_n
    +

    counts methylated in the normal

    + + +
    UM_n
    +

    counts unmethylated in the normal

    + + +
    p
    +

    tumour purity

    + + +
    CN
    +

    total tumour copy number

    + + +
    CN_n
    +

    total normal copy number

    + + +
    credMass
    +

    default is 0.99 +credMass is a scalar between 0 and 1, indicating the mass within the +credible interval that is to be estimated.

    + +
    +
    +

    Value

    + + +

    Value: HDIlim is a vector containing the limits of the HDI

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/HDIofMCMC_mt.html b/docs/reference/HDIofMCMC_mt.html new file mode 100644 index 0000000..2321853 --- /dev/null +++ b/docs/reference/HDIofMCMC_mt.html @@ -0,0 +1,152 @@ + +Calculate HDI by simulation — HDIofMCMC_mt • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Computes highest density interval from a sample of representative values, +estimated as shortest credible interval for a unimodal distribution

    +
    + +
    +
    HDIofMCMC_mt(M_b, UM_b, M_n, UM_n, p, CN, credMass = 0.99)
    +
    + +
    +

    Arguments

    +
    M_b
    +

    counts methylated in the tumour

    + + +
    UM_b
    +

    counts unmethylated in the tumour

    + + +
    M_n
    +

    counts methylated in the normal

    + + +
    UM_n
    +

    counts unmethylated in the normal

    + + +
    p
    +

    tumour purity

    + + +
    CN
    +

    total tumour copy number

    + + +
    credMass
    +

    default is 0.99 +credMass is a scalar between 0 and 1, indicating the mass within the +credible interval that is to be estimated.

    + + +
    CN_n
    +

    total normal copy number

    + +
    +
    +

    Value

    + + +

    Value: HDIlim is a vector containing the limits of the HDI

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/LogR_correction.html b/docs/reference/LogR_correction.html new file mode 100644 index 0000000..ded5e60 --- /dev/null +++ b/docs/reference/LogR_correction.html @@ -0,0 +1,155 @@ + +LogR_correction — LogR_correction • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Correct logR for msp1 fragment size bias and GC content

    +
    + +
    +
    LogR_correction(
    +  dt_sample,
    +  dt_SNPs,
    +  build,
    +  chr_names,
    +  min_normal,
    +  fragments_file,
    +  replic_timing_file_prefix,
    +  n_cores
    +)
    +
    + +
    +

    Arguments

    +
    dt_sample
    +

    Allelecounts output as a data.table

    + + +
    dt_SNPs
    +

    Allelecounts output subset to QC'ed SNP positions

    + + +
    build
    +

    Character variable corresponding to the reference genome version used for alignment

    + + +
    chr_names
    +

    Character variable with the seqlevels.

    + + +
    min_normal
    +

    Numerical with the minimum normal coverage threshold

    + + +
    fragments_file
    +

    CAMDAC reference MspI fragments file

    + + +
    replic_timing_file_prefix
    +

    CAMDAC reference replication timing files path and file name prefix

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores for parallel processing

    + +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/annotate_copy_number.html b/docs/reference/annotate_copy_number.html new file mode 100644 index 0000000..716c43d --- /dev/null +++ b/docs/reference/annotate_copy_number.html @@ -0,0 +1,128 @@ + +Assign copy number calls — annotate_copy_number • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    annotate_copy_number returns the data.table dt_sample annotated with allele-specific copy numbers

    +
    + +
    +
    annotate_copy_number(dt_sample, seg, rm_sex_chrom = FALSE)
    +
    + +
    +

    Arguments

    +
    dt_sample
    +

    data.table object with each CpG and their coverage, counts methylated and methylation rate

    + + +
    seg
    +

    ASCAT.m copy number segments object

    + + +
    rm_sex_chrom
    +

    Logical indicating if you would like to remove sex chrom from downstream analyses

    + +
    +
    +

    Value

    + + +

    A dataframe for each sample_id with the copy number calls added

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/ascat.m.plotRawData.html b/docs/reference/ascat.m.plotRawData.html new file mode 100644 index 0000000..82f66fe --- /dev/null +++ b/docs/reference/ascat.m.plotRawData.html @@ -0,0 +1,140 @@ + +ascat.m.plotRawData — ascat.m.plotRawData • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot tumour and germline BAF and LogR

    +
    + +
    +
    ascat.m.plotRawData(ASCATobj, raw_LogR, pch, cex, lim_logR)
    +
    + +
    +

    Arguments

    +
    ASCATobj
    +

    an ASCAT object (e.g. data structure from ascat.loadData)

    + + +
    raw_LogR
    +

    vector with the LogR values before correction

    + + +
    pch
    +

    type of data points in plot

    + + +
    cex
    +

    size of data points in plot

    + + +
    lim_logR
    +

    y-axis limits on logR plot

    + +
    +
    +

    Value

    + + +

    Produces png files showing the logR and BAF values for tumour and germline samples

    +
    +
    +

    Author

    +

    Peter Van Loo

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/ascat.m.plotSegmentedData.html b/docs/reference/ascat.m.plotSegmentedData.html new file mode 100644 index 0000000..bafe911 --- /dev/null +++ b/docs/reference/ascat.m.plotSegmentedData.html @@ -0,0 +1,124 @@ + +ascat.m.plotSegmentedData — ascat.m.plotSegmentedData • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot segmented BAF and LogR

    +
    + +
    +
    ascat.m.plotSegmentedData(ASCATobj, lim_logR = 2)
    +
    + +
    +

    Arguments

    +
    ASCATobj
    +

    an ASCAT object (e.g. data structure from ascat.loadData)

    + +
    +
    +

    Value

    + + +

    Produces png files showing the logR and BAF values for tumour and germline samples

    +
    +
    +

    Author

    +

    Peter Van Loo

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/ascat.plotRawData.flags.html b/docs/reference/ascat.plotRawData.flags.html new file mode 100644 index 0000000..1d085e2 --- /dev/null +++ b/docs/reference/ascat.plotRawData.flags.html @@ -0,0 +1,136 @@ + +ascat.plotRawData — ascat.plotRawData.flags • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot BAF LogR

    +
    + +
    +
    ascat.plotRawData.flags(ASCATobj, pch, cex, lim_logR)
    +
    + +
    +

    Arguments

    +
    ASCATobj
    +

    an ASCAT object (e.g. data structure from ascat.loadData)

    + + +
    pch
    +

    type of data points in plot

    + + +
    cex
    +

    size of data points in plot

    + + +
    lim_logR
    +

    y-axis limits on logR plot

    + +
    +
    +

    Value

    + + +

    Produces png files showing the logR and BAF values for tumour and germline samples

    +
    +
    +

    Author

    +

    Peter Van Loo

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/asm_pipeline.html b/docs/reference/asm_pipeline.html new file mode 100644 index 0000000..3c0c339 --- /dev/null +++ b/docs/reference/asm_pipeline.html @@ -0,0 +1,130 @@ + +Run allele-specific methylation analysis pipeline — asm_pipeline • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Run allele-specific methylation analysis pipeline

    +
    + +
    +
    asm_pipeline(tumor, germline = NULL, infiltrates = NULL, origin = NULL, config)
    +
    + +
    +

    Arguments

    +
    tumor
    +

    CamSample object for tumor sample.

    + + +
    germline
    +

    CamSample object for germline sample. Used for CNA calling.

    + + +
    infiltrates
    +

    CamSample object for infiltrating normal sample. Used for deconvolution.

    + + +
    origin
    +

    CamSample object for cell of origin sample. Used for differential methylation.

    + + +
    config
    +

    CamConfig object.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/attach_output.html b/docs/reference/attach_output.html new file mode 100644 index 0000000..4f66171 --- /dev/null +++ b/docs/reference/attach_output.html @@ -0,0 +1,126 @@ + +Manually assign output file to CAMDAC sample — attach_output • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Manually assign output file to CAMDAC sample

    +
    + +
    +
    attach_output(sample, config, code, file)
    +
    + +
    +

    Arguments

    +
    sample
    +

    CamSample object

    + + +
    config
    +

    CamConfig object

    + + +
    code
    +

    Code for output file. See vignette("output") for descriptions.

    + + +
    file
    +

    Path to file to copy to expected location

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/bin_CpGs.html b/docs/reference/bin_CpGs.html new file mode 100644 index 0000000..a587541 --- /dev/null +++ b/docs/reference/bin_CpGs.html @@ -0,0 +1,141 @@ + +Cluster CpGs into annotated bins — bin_CpGs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    bin_CpGs returns the df with the annotation for each CpG

    +
    + +
    +
    bin_CpGs(path, patient_id, sample_id, dt, anno_list, n_cores)
    +
    + +
    +

    Arguments

    +
    path
    +

    Character string of the output directory

    + + +
    patient_id
    +

    Character string containing the patient ID

    + + +
    sample_id
    +

    Character string containing the sample ID.

    + + +
    dt
    +

    data.table where each CG is a row with DMP info.

    + + +
    anno_list
    +

    A data.table object containing annotated genomic bins including +genes, exons, introns, UTRs, CGI, CGI shores, CGI shelves, promoters or enhancers

    + + +
    n_cores
    +

    number of cores for parallel processing

    + +
    +
    +

    Value

    + + +

    A dataframe for each sample_id with the copy number calls added

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/calculate_m_t_hdi.html b/docs/reference/calculate_m_t_hdi.html new file mode 100644 index 0000000..032bea1 --- /dev/null +++ b/docs/reference/calculate_m_t_hdi.html @@ -0,0 +1,108 @@ + +Calculate HDI by simulation — calculate_m_t_hdi • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Calculate HDI by simulation

    +
    + +
    +
    calculate_m_t_hdi(meth_c, n_cores, itersplit = 100000)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/call_dmps.html b/docs/reference/call_dmps.html new file mode 100644 index 0000000..30e5eaf --- /dev/null +++ b/docs/reference/call_dmps.html @@ -0,0 +1,115 @@ + +Call differentially methylated positions — call_dmps • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Call differentially methylated positions

    +
    + +
    +
    call_dmps(
    +  pmeth,
    +  nmeth,
    +  effect_size = 0.2,
    +  prob = 0.99,
    +  itersplit = 500000,
    +  ncores = 5
    +)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/call_dmr_routine.html b/docs/reference/call_dmr_routine.html new file mode 100644 index 0000000..ac296ff --- /dev/null +++ b/docs/reference/call_dmr_routine.html @@ -0,0 +1,113 @@ + +Function to call DMRs on a camdac dmp dataset — call_dmr_routine • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Function to call DMRs on a camdac dmp dataset

    +
    + +
    +
    call_dmr_routine(
    +  tmeth_dmps,
    +  regions_annotations,
    +  min_DMP_counts,
    +  min_consec_DMP
    +)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/camdac_to_battenberg_prepare_wgbs.html b/docs/reference/camdac_to_battenberg_prepare_wgbs.html new file mode 100644 index 0000000..a11f642 --- /dev/null +++ b/docs/reference/camdac_to_battenberg_prepare_wgbs.html @@ -0,0 +1,137 @@ + +Generate alleleCounter file from CAMDAC — camdac_to_battenberg_prepare_wgbs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    camdac_to_battenberg_prepare_wgbs converts CAMDAC allele counter results to a format for processing.

    +
    + +
    +
    camdac_to_battenberg_prepare_wgbs(
    +  tumour_prefix,
    +  normal_prefix,
    +  camdac_tsnps,
    +  outdir
    +)
    +
    + +
    +

    Arguments

    +
    camdac_tumour_ac
    +

    CAMDAC tumour allele counts filepath. Expected *.gz

    + + +
    camdac_normal_ac
    +

    CAMDAC normal allele counts filepath. Expected *.gz

    + + +
    camdac_tnsps
    +

    CAMDAC tumour-normal-snps object. Expected *.gz

    + + +
    output_file
    +

    allelecounter formatted-file output directory.

    + +
    +
    +

    Value

    + + +

    File handle for allele counter file generated

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_bind_snps.html b/docs/reference/cmain_bind_snps.html new file mode 100644 index 0000000..ea3330d --- /dev/null +++ b/docs/reference/cmain_bind_snps.html @@ -0,0 +1,122 @@ + +Bind SNPs — cmain_bind_snps • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Combine tumour-normal SNP files for CNA analysis (ASCAT or BATTENBERG)

    +
    + +
    +
    cmain_bind_snps(tumour, normal, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    normal
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_call_cna.html b/docs/reference/cmain_call_cna.html new file mode 100644 index 0000000..f4ad912 --- /dev/null +++ b/docs/reference/cmain_call_cna.html @@ -0,0 +1,122 @@ + +Call CNA — cmain_call_cna • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Config determines whether ASCAT or Battenberg is used

    +
    + +
    +
    cmain_call_cna(tumour, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + + +
    normal
    +

    A camdac sample object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_call_dmps.html b/docs/reference/cmain_call_dmps.html new file mode 100644 index 0000000..388c61a --- /dev/null +++ b/docs/reference/cmain_call_dmps.html @@ -0,0 +1,122 @@ + +Call tumour-normal DMPs — cmain_call_dmps • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Single-sample DMP calling on CAMDAC-deconvolved data

    +
    + +
    +
    cmain_call_dmps(tumour, normal, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    normal
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_call_dmrs.html b/docs/reference/cmain_call_dmrs.html new file mode 100644 index 0000000..c2c7742 --- /dev/null +++ b/docs/reference/cmain_call_dmrs.html @@ -0,0 +1,122 @@ + +Call tumour-normal DMRs — cmain_call_dmrs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Single-sample DMR calling on CAMDAC DMP data

    +
    + +
    +
    cmain_call_dmrs(tumour, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + + +
    normal
    +

    A camdac sample object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_count_alleles.html b/docs/reference/cmain_count_alleles.html new file mode 100644 index 0000000..3c12114 --- /dev/null +++ b/docs/reference/cmain_count_alleles.html @@ -0,0 +1,118 @@ + +Count alleles — cmain_count_alleles • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Count alleles

    +
    + +
    +
    cmain_count_alleles(sample, config)
    +
    + +
    +

    Arguments

    +
    sample
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_deconvolve_methylation.html b/docs/reference/cmain_deconvolve_methylation.html new file mode 100644 index 0000000..1ca56ec --- /dev/null +++ b/docs/reference/cmain_deconvolve_methylation.html @@ -0,0 +1,122 @@ + +Deconvolve methylation — cmain_deconvolve_methylation • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Deconvolve methylation

    +
    + +
    +
    cmain_deconvolve_methylation(tumour, normal, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    normal
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_make_methylation_profile.html b/docs/reference/cmain_make_methylation_profile.html new file mode 100644 index 0000000..5f85d1d --- /dev/null +++ b/docs/reference/cmain_make_methylation_profile.html @@ -0,0 +1,118 @@ + +Make methylation — cmain_make_methylation_profile • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Pre-process methylation from allele counts for CAMDAC deconvolution

    +
    + +
    +
    cmain_make_methylation_profile(sample, config)
    +
    + +
    +

    Arguments

    +
    sample
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_make_snps.html b/docs/reference/cmain_make_snps.html new file mode 100644 index 0000000..e6de204 --- /dev/null +++ b/docs/reference/cmain_make_snps.html @@ -0,0 +1,118 @@ + +Make SNPs — cmain_make_snps • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Format and save SNP file for CNA analysis (ASCAT or BATTENBERG)

    +
    + +
    +
    cmain_make_snps(sample, config)
    +
    + +
    +

    Arguments

    +
    sample
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_run_ascat.html b/docs/reference/cmain_run_ascat.html new file mode 100644 index 0000000..701b415 --- /dev/null +++ b/docs/reference/cmain_run_ascat.html @@ -0,0 +1,122 @@ + +Run ASCAT.m — cmain_run_ascat • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Expects SNP profiles to have been created using cmain_make_snp_profiles

    +
    + +
    +
    cmain_run_ascat(tumour, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + + +
    normal
    +

    A camdac sample object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cmain_run_battenberg.html b/docs/reference/cmain_run_battenberg.html new file mode 100644 index 0000000..2e17525 --- /dev/null +++ b/docs/reference/cmain_run_battenberg.html @@ -0,0 +1,122 @@ + +Run battenberg — cmain_run_battenberg • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Expects SNP profiles to have been created using cmain_make_snp_profiles

    +
    + +
    +
    cmain_run_battenberg(tumour, config)
    +
    + +
    +

    Arguments

    +
    tumour
    +

    A camdac sample object

    + + +
    config
    +

    A camdac config object

    + + +
    normal
    +

    A camdac sample object

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/collapse_cpg_to_dmr.html b/docs/reference/collapse_cpg_to_dmr.html new file mode 100644 index 0000000..ac92a5f --- /dev/null +++ b/docs/reference/collapse_cpg_to_dmr.html @@ -0,0 +1,108 @@ + +Summarise CG stats per DMR — collapse_cpg_to_dmr • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Summarise CG stats per DMR

    +
    + +
    +
    collapse_cpg_to_dmr(dt)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/compute_tumour_methylome.html b/docs/reference/compute_tumour_methylome.html new file mode 100644 index 0000000..484d1ac --- /dev/null +++ b/docs/reference/compute_tumour_methylome.html @@ -0,0 +1,139 @@ + +Compute the tumour methylation rate — compute_tumour_methylome • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    compute_tumour_methylome returns the data.table dt annotated with +CAMDAC pure tumour methylation rates

    +
    + +
    +
    compute_tumour_methylome(dt, p, min_cov_t = 3, sex, build)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table object with each CpG and their coverage, counts methylated, +methylation rate and copy number and matched normal methylation info

    + + +
    p
    +

    Numerical - Sample purity estimates

    + + +
    min_cov_t
    +

    Numerical - Minimum tumour coverage

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for female or "XY" for male.

    + + +
    build
    +

    Character variable corresponding to the reference genome used for alignment.

    + +
    +
    +

    Value

    + + +

    A dataframe for each sample_id with the tumour methylome added

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/cwrap_asm_get_allele_counts.html b/docs/reference/cwrap_asm_get_allele_counts.html new file mode 100644 index 0000000..88dcf6b --- /dev/null +++ b/docs/reference/cwrap_asm_get_allele_counts.html @@ -0,0 +1,153 @@ + +Count alleles for reads phased to SNPs in a BAM file — cwrap_asm_get_allele_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Count alleles for reads phased to SNPs in a BAM file

    +
    + +
    +
    cwrap_asm_get_allele_counts(
    +  bam_file,
    +  snps_gr,
    +  loci_dt,
    +  paired_end,
    +  drop_ccgg,
    +  min_mapq = min_mapq,
    +  min_cov = min_cov
    +)
    +
    + +
    +

    Arguments

    +
    bam_file
    +

    Path to BAM file

    + + +
    snps_gr
    +

    GRanges object with heterozygous SNP loci for phasing

    + + +
    loci_dt
    +

    Data table with CAMDAC CpG loci from reference files

    + + +
    paired_end
    +

    Logical indicating if BAM is paired end

    + + +
    drop_ccgg
    +

    Logical indicating if CCGG should be dropped (i.e. rrbs mode)

    + + +
    min_mapq
    +

    Minimum mapping quality to consider a read

    + + +
    min_cov
    +

    Minimum coverage to consider a read

    + +
    +
    +

    Value

    + + +

    A list with three slots: stats, qnames and asm_cg. stats describes counts of reads phased, +qnames determines which SNPs each read was phased to and asm_cg is the data table with read counts

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/download_pipeline_files.html b/docs/reference/download_pipeline_files.html new file mode 100644 index 0000000..5bb4977 --- /dev/null +++ b/docs/reference/download_pipeline_files.html @@ -0,0 +1,130 @@ + +Download CAMDAC pipeline files — download_pipeline_files • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    CAMDAC pipeline files are required for analysis. This function downloads the files to +the output directory and unpacks them. By default, CAMDAC searches for the files in the +environment variable CAMDAC_PIPELINE_FILES. If this is missing, the current directory is used.

    +

    CAMDAC pipeline files are required for analysis. This function downloads the files to +the output directory and unpacks them. By default, CAMDAC searches for the files in the +environment variable CAMDAC_PIPELINE_FILES. If this is missing, the current directory is used.

    +
    + +
    +
    download_pipeline_files(bsseq, directory = NULL, quiet = TRUE)
    +
    +download_pipeline_files(bsseq, directory = NULL, quiet = TRUE)
    +
    + +
    +

    Arguments

    +
    directory
    +

    Optional. Directory to download files to.

    + + +
    assay
    +

    Sequencing assay. Either wgbs or rrbs.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/format_methylation_df.html b/docs/reference/format_methylation_df.html new file mode 100644 index 0000000..29e43e8 --- /dev/null +++ b/docs/reference/format_methylation_df.html @@ -0,0 +1,156 @@ + +Format methylation rates format_methylation_df — format_methylation_df • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Format methylation rates +format_methylation_df

    +
    + +
    +
    format_methylation_df(
    +  dt,
    +  sample_id,
    +  normal_ids,
    +  path_output,
    +  n_cores,
    +  suffix,
    +  trim = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table containing the methylation information for each CpG

    + + +
    sample_id
    +

    sample ID

    + + +
    normal_ids
    +

    sample ID of normal sample(s)

    + + +
    path_output
    +

    output directory

    + + +
    n_cores
    +

    number of threads for HDI calculation

    + + +
    suffix
    +

    string containing the column names suffix for normal samples +This is to distinguish between the proxy supplied for the normal infiltrates +for use in deconvolution and the normal cell of origin for use in DMP/DMR calling

    + + +
    trim
    +

    Logical value establishing whether regions with extremely high coverage should be trimmed or not

    + +
    +
    +

    Value

    + + +

    A GRanges object with all the CpG loci, their coverage, counts methylated and methylation rate

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/format_output.html b/docs/reference/format_output.html new file mode 100644 index 0000000..c0b8a60 --- /dev/null +++ b/docs/reference/format_output.html @@ -0,0 +1,159 @@ + +Format output nucleotide counts format_output — format_output • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Format output nucleotide counts +format_output

    +
    + +
    +
    format_output(
    +  patient_id,
    +  sample_id,
    +  sex,
    +  is_normal = FALSE,
    +  path,
    +  path_to_CAMDAC,
    +  build
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient id number

    + + +
    sample_id
    +

    Character variable with the sample ID

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for female or "XY" for male.

    + + +
    is_normal
    +

    Logical flag indicating whether the sample to be formatted is a normal (TRUE) or a tumour (FALSE, the default)

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored and should be constant for all CAMDAC functions. +Do not alter the output directory structure while running CAMDAC.

    + + +
    path_to_CAMDAC
    +

    Character variable containing the path to the CAMDAC directory +including dir name (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome used for alignment. +CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38". +is desired in addition to GRanges object in .RData file

    + +
    +
    +

    Value

    + + +

    Concatenated SNP and CpG information

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_DMPs.html b/docs/reference/get_DMPs.html new file mode 100644 index 0000000..9d47112 --- /dev/null +++ b/docs/reference/get_DMPs.html @@ -0,0 +1,143 @@ + +Get DMPs — get_DMPs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get_DMPs returns a df with annotated statistics for each CpG

    +
    + +
    +
    get_DMPs(path, patient_id, sample_id, df, prob = 0.99, n_cores)
    +
    + +
    +

    Arguments

    +
    path
    +

    Complete path to the CAMDAC methylation output directory +for this sample

    + + +
    patient_id
    +

    Character string containing the patient number

    + + +
    sample_id
    +

    Character variable with the tumour sample_id

    + + +
    df
    +

    A data.table with pure, bulk and normal methylation info

    + + +
    prob
    +

    Numerical value representing the threshold for statistically +significant DMP (default is p=0.99)

    + + +
    n_cores
    +

    Number of cores to do the statistical testing over

    + +
    +
    +

    Value

    + + +

    A data.table object with all the CpG loci, their coverage, counts +methylated and methylation rate

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_DMRs.html b/docs/reference/get_DMRs.html new file mode 100644 index 0000000..5ff26d6 --- /dev/null +++ b/docs/reference/get_DMRs.html @@ -0,0 +1,159 @@ + +Assign bins — get_DMRs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    annotate_DMRs returns the df with the annotation for each CpG

    +
    + +
    +
    get_DMRs(
    +  path,
    +  patient_id,
    +  sample_id,
    +  dt,
    +  anno_list,
    +  min_DMP_counts,
    +  min_consec_DMP,
    +  n_cores,
    +  bulk = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    path
    +

    Character string of the output directory

    + + +
    patient_id
    +

    Character string containing the patient ID

    + + +
    sample_id
    +

    Character string containing the sample ID.

    + + +
    dt
    +

    dataframe where each CG is a row with DMP info.

    + + +
    anno_list
    +

    A data.table object containing annotated genomic bins including +genes, exons, introns, UTRs, CGI, CGI shores, CGI shelves, promoters or enhancers

    + + +
    min_DMP_counts
    +

    Numerical - number of DMPs required in a DMR

    + + +
    min_consec_DMP
    +

    Numerical - number of consecutive DMPs required in a DMR

    + + +
    n_cores
    +

    number of cores for parallel processing

    + +
    +
    +

    Value

    + + +

    A dataframe for each sample_id with the copy number calls added

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_allele_counts.html b/docs/reference/get_allele_counts.html new file mode 100644 index 0000000..54b66b0 --- /dev/null +++ b/docs/reference/get_allele_counts.html @@ -0,0 +1,185 @@ + +Compile allele counts at SNPs and at CpGs for bisulfite sequencing data get_allele_counts — get_allele_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Compile allele counts at SNPs and at CpGs for bisulfite sequencing data +get_allele_counts

    +
    + +
    +
    get_allele_counts(
    +  i,
    +  patient_id,
    +  sample_id,
    +  sex,
    +  bam_file,
    +  mq = 0,
    +  path,
    +  path_to_CAMDAC,
    +  build = NULL,
    +  n_cores,
    +  test = FALSE,
    +  paired_end = TRUE,
    +  segments_bed = NULL
    +)
    +
    + +
    +

    Arguments

    +
    i
    +

    Integer loop index. The function must be run with all values from 1 to 25, each containing +1/25th of the RRBS covered genome.

    + + +
    patient_id
    +

    Character variable containing the patient id

    + + +
    sample_id
    +

    Character variable with the sample id

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for female or "XY" for male.

    + + +
    bam_file
    +

    Character variable with the full bam file name and path

    + + +
    mq
    +

    Character variable or numeric containing the mapping quality threshold to be used. +For RRBS, set mq=0. Read mapping validity is based on read start site and nucleotides rather than mq.

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored and should be constant for all CAMDAC functions. +Do not alter the output directory structure while running CAMDAC. +The output of this function will be a sub-directory of the path variable under +"./Allelecounts/sample_id/". Do not change the directory structure as subsequent functions will +look for files in this directory.

    + + +
    path_to_CAMDAC
    +

    Character variable containing the CAMDAC installation path (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome used for alignment. +CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38".

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores for parallel processing

    + + +
    test
    +

    Logical value indicating whether this is a quick test run with data subsampling

    + +
    +
    +

    Value

    + + +

    One .fst file including methylation info at CpGs and BAF and depth of coverage at +SNPs for the ith subset of RRBS loci

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_cluster_counts.html b/docs/reference/get_cluster_counts.html new file mode 100644 index 0000000..6d69b28 --- /dev/null +++ b/docs/reference/get_cluster_counts.html @@ -0,0 +1,108 @@ + +Count CpGs within DMP annotations — get_cluster_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Count CpGs within DMP annotations

    +
    + +
    +
    get_cluster_counts(dt)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_differential_methylation.html b/docs/reference/get_differential_methylation.html new file mode 100644 index 0000000..9d184ab --- /dev/null +++ b/docs/reference/get_differential_methylation.html @@ -0,0 +1,205 @@ + +Perform differential methylation analysis on deconvolute tumour methylation rates — get_differential_methylation • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get_differential_methylation

    +
    + +
    +
    get_differential_methylation(
    +  patient_id,
    +  sample_id,
    +  sex,
    +  normal_origin_proxy_id,
    +  path,
    +  path_to_CAMDAC,
    +  build,
    +  effect_size = 0.2,
    +  prob = 0.99,
    +  min_DMP_counts_in_DMR = 5,
    +  min_consec_DMP_in_DMR = 4,
    +  n_cores,
    +  reseg = FALSE,
    +  bulk = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient id number

    + + +
    sample_id
    +

    Character variable with the tumour sample_id

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for +female or "XY" for male.

    + + +
    normal_origin_proxy_id
    +

    Character variable with the sample ID +of the normal to be used as a proxy for the tumour cell of origin in +tumour-normal differential methylation analysis

    + + +
    path
    +

    Character path variable pointing to the desired working +directory. This is where the output will be stored.

    + + +
    path_to_CAMDAC
    +

    Character variable containing the path to the CAMDAC +directory including dir name (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome +used for alignment. CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38".

    + + +
    effect_size
    +

    Numerical containing the minimum tumour-normal methylation +difference (default is 0.2)

    + + +
    prob
    +

    Numerical value representing the threshold for statistically +significant DMP (default is p=0.99)

    + + +
    min_DMP_counts_in_DMR
    +

    Numerical value representing the number of +DMPs required in a DMR

    + + +
    min_consec_DMP_in_DMR
    +

    Numerical value representing the number of +consecutive DMPs required in a DMR

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores +for parallel processing

    + + +
    reseg
    +

    Logical value should be set to FALSE. Multi-sample re-segmentation of +the copy number profiles will be available in future versions of CAMDAC.

    + + +
    bulk
    +

    Default is FALSE unless you want bulk DMP/DMR calls in addition +to CAMDAC pure tumour differential methylation analysis

    +

    Note: +Annotations include: +CGI (including shores and shelves) +gene body (intragenic, 5UTR, 3UTR, intron, exon) +promoter (2kb upstream and 500bp downstream of any UCSC annotated gene) +enhancer (vista and FANTOM5 annotation)

    + +
    +
    +

    Value

    + + +

    Biologically significant DMPs, DMRs

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_msp1_fragments.html b/docs/reference/get_msp1_fragments.html new file mode 100644 index 0000000..b79408f --- /dev/null +++ b/docs/reference/get_msp1_fragments.html @@ -0,0 +1,131 @@ + +get_msp1_fragments — get_msp1_fragments • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get msp1 fragments

    +
    + +
    +
    get_msp1_fragments(dt, build, path_to_CAMDAC, outfile)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table object containing all covered CCGGs in the sample

    + + +
    build
    +

    Character, Either "hg19", "hg38", "GRCH37","GRCH38"

    + + +
    path_to_CAMDAC
    +

    Character string containing the path to the CAMDAC dir including +dir name e.g. "~/CAMDAC/"

    + + +
    outfile
    +

    character string with output filename

    + +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_pure_tumour_methylation.html b/docs/reference/get_pure_tumour_methylation.html new file mode 100644 index 0000000..75dc72e --- /dev/null +++ b/docs/reference/get_pure_tumour_methylation.html @@ -0,0 +1,174 @@ + +Deconvolve the pure tumour methylation rate from bisulfite sequencing data — get_pure_tumour_methylation • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    get_pure_tumour_methylation

    +
    + +
    +
    get_pure_tumour_methylation(
    +  patient_id,
    +  sample_id,
    +  sex,
    +  normal_infiltrates_proxy_id,
    +  path,
    +  path_to_CAMDAC,
    +  build,
    +  n_cores,
    +  reseg = FALSE
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient id number

    + + +
    sample_id
    +

    Character variable with the (control or tumour) sample_id

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for +female or "XY" for male.

    + + +
    normal_infiltrates_proxy_id
    +

    Sample ID of the matched normal control

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored and should be constant for all CAMDAC functions.

    + + +
    path_to_CAMDAC
    +

    Character variable containing the path to the CAMDAC +directory including dir name (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome +used for alignment. CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38".

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores +for parallel processing

    + + +
    reseg
    +

    Logical value should be set to FALSE. Multi-sample re-segmentation of +the copy number profiles will be available in future versions of CAMDAC.

    +

    Note: +Annotations include: +CGI (including shores and shelves) +gene body (intragenic, 5UTR, 3UTR, intron, exon) +promoter (2kb upstream and 500bp downstream of any UCSC annotated gene) +enhancer (vista and FANTOM5 annotation)

    + +
    +
    +

    Value

    + + +

    CAMDAC purified tumour methylation rates

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/get_reference_files.html b/docs/reference/get_reference_files.html new file mode 100644 index 0000000..0af3dd1 --- /dev/null +++ b/docs/reference/get_reference_files.html @@ -0,0 +1,108 @@ + +Get CAMDAC reference files from config — get_reference_files • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Get CAMDAC reference files from config

    +
    + +
    +
    get_reference_files(config, type_folder, glob = NULL)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/helper_camdac_pileup.html b/docs/reference/helper_camdac_pileup.html new file mode 100644 index 0000000..8adea9b --- /dev/null +++ b/docs/reference/helper_camdac_pileup.html @@ -0,0 +1,114 @@ + +Cache existing CAMDAC results into a sub-directory so that the current ones can be overwritten by the refitting pipeline Decided this is unnecessary as the initial results were so wrong. Exported only for development — helper_camdac_pileup • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Cache existing CAMDAC results into a sub-directory so that the current ones can be +overwritten by the refitting pipeline +Decided this is unnecessary as the initial results were so wrong. +Exported only for development

    +
    + +
    +
    helper_camdac_pileup(bam_file, seg, loci_dt)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/index.html b/docs/reference/index.html new file mode 100644 index 0000000..7f51673 --- /dev/null +++ b/docs/reference/index.html @@ -0,0 +1,190 @@ + +Function reference • CAMDAC + + +
    +
    + + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    All functions

    +

    +
    +

    CamConfig()

    +

    Set CAMDAC configuration

    +

    CamSample()

    +

    Build CAMDAC sample object

    +

    attach_output()

    +

    Manually assign output file to CAMDAC sample

    +

    cmain_bind_snps()

    +

    Bind SNPs

    +

    cmain_call_cna()

    +

    Call CNA

    +

    cmain_call_dmps()

    +

    Call tumour-normal DMPs

    +

    cmain_call_dmrs()

    +

    Call tumour-normal DMRs

    +

    cmain_count_alleles()

    +

    Count alleles

    +

    cmain_deconvolve_methylation()

    +

    Deconvolve methylation

    +

    cmain_make_methylation_profile()

    +

    Make methylation

    +

    cmain_make_snps()

    +

    Make SNPs

    +

    cmain_run_ascat()

    +

    Run ASCAT.m

    +

    cmain_run_battenberg()

    +

    Run battenberg

    +

    download_pipeline_files()

    +

    Download CAMDAC pipeline files

    +

    get_reference_files()

    +

    Get CAMDAC reference files from config

    +

    load_cna_data()

    +

    Parse ASCAT and Battenberg output directories to load CNA data

    +

    load_panel_ac_files()

    +

    Load allele count files

    +

    panel_asm_from_counts()

    +

    Panel ASM from counts Basic function to create an ASM methylation panel from allele count or ASM meth files WARNING: In active development.

    +

    panel_meth_from_beta()

    +

    Make CAMDAC methylation panel from a matrix of beta values

    +

    panel_meth_from_counts()

    +

    Make CAMDAC methylation panel from allele counts Methylation fractions are obtained by summing M and UM reads across samples

    +

    pipeline()

    +

    CAMDAC analysis pipeline

    +

    preprocess_asm()

    +

    Preprocess a list of CamSample objects for ASM analysis

    + + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/intervalWidth_r.html b/docs/reference/intervalWidth_r.html new file mode 100644 index 0000000..5f30b99 --- /dev/null +++ b/docs/reference/intervalWidth_r.html @@ -0,0 +1,133 @@ + +Calculate intervalWidth_r — intervalWidth_r • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Calculate intervalWidth_r

    +
    + +
    +
    intervalWidth_r(lowTailPr, ICDFname, credMass, ...)
    +
    + +
    +

    Arguments

    +
    ICDFname
    +

    is R's name for the inverse cumulative density function +of the distribution.

    + + +
    credMass
    +

    is the desired mass of the HDI region.

    + + +
    tol
    +

    is passed to R's optimize function, +the lower the tolerance, the longer the optimisation, but the higher the accuracy. +tol=1e-4 gives values of the same accuracy as our max resolution +Return value: +Highest density interval (HDI) limits in a vector. +Example of use: For determining HDI of a beta(30,12) distribution, type +HDIofICDF( qbeta , shape1 = 30+1 , shape2 = 12+1 ) +Notice that the parameters of the ICDFname must be explicitly named; +e.g., HDIofICDF( qbeta , 30+1 , 12+1 ) does not work. +Adapted and corrected from Greg Snow's TeachingDemos package. +Source fct outside of loop to speed up code

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/load_cna_data.html b/docs/reference/load_cna_data.html new file mode 100644 index 0000000..dfc854c --- /dev/null +++ b/docs/reference/load_cna_data.html @@ -0,0 +1,120 @@ + +Parse ASCAT and Battenberg output directories to load CNA data — load_cna_data • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    See "annotate_copy_number" func +A function required to load copy number for a tumour sample from camdac, either ascat or bb, +result should be: chrom, start, end, nA, nB, CN (total), seg_min and seg_max. +This should also include the purity and ploidy. As a separate list? +note that seg_min and seg_max are actually duplicates of the start and end columns, required to +keep track of the ascat segment positions after overlap +WARN: This drops sex chromosome but not implemented. Also should drop CN=0 (hom del) regions

    +
    + +
    +
    load_cna_data(tumour, config, data_type)
    +
    + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/load_panel_ac_files.html b/docs/reference/load_panel_ac_files.html new file mode 100644 index 0000000..1abd099 --- /dev/null +++ b/docs/reference/load_panel_ac_files.html @@ -0,0 +1,120 @@ + +Load allele count files — load_panel_ac_files • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Load allele count files

    +
    + +
    +
    load_panel_ac_files(ac_files, cores = 5)
    +
    + +
    +

    Arguments

    +
    ac_files
    +

    Allele count files from CAMDAC

    + +
    +
    +

    Value

    + + +

    List of data tables for each allele counts file

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/panel_asm_from_counts.html b/docs/reference/panel_asm_from_counts.html new file mode 100644 index 0000000..e0e3149 --- /dev/null +++ b/docs/reference/panel_asm_from_counts.html @@ -0,0 +1,122 @@ + +Panel ASM from counts Basic function to create an ASM methylation panel from allele count or ASM meth files WARNING: In active development. — panel_asm_from_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Panel ASM from counts +Basic function to create an ASM methylation panel from allele count or ASM meth files +WARNING: In active development.

    +
    + +
    +
    panel_asm_from_counts(c1, c2)
    +
    + +
    +

    Arguments

    +
    c1
    +

    First ASM allele counts file to merge

    + + +
    c2
    +

    Second ASM allele counts file to merge

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/panel_meth_from_beta.html b/docs/reference/panel_meth_from_beta.html new file mode 100644 index 0000000..90c3d1e --- /dev/null +++ b/docs/reference/panel_meth_from_beta.html @@ -0,0 +1,152 @@ + +Make CAMDAC methylation panel from a matrix of beta values — panel_meth_from_beta • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Make CAMDAC methylation panel from a matrix of beta values

    +
    + +
    +
    panel_meth_from_beta(
    +  mat,
    +  chrom,
    +  start,
    +  end,
    +  cov,
    +  props,
    +  cores,
    +  min_samples = 1,
    +  max_sd = 1
    +)
    +
    + +
    +

    Arguments

    +
    mat
    +

    Matrix of beta values. Rows are CpGs, columns are samples

    + + +
    chrom
    +

    Vector of chromosome names

    + + +
    start
    +

    Vector of CpG start positions

    + + +
    end
    +

    Vector of CpG end positions

    + + +
    cov
    +

    Vector of coverage values to give each CpG site. If a matrix is provided, coverage is calculated as the sum of reads for each site.

    + + +
    cores
    +

    Number of cores to use for calculating HDI

    + + +
    min_samples
    +

    Minimum number of samples that must have a non-NA value for a CpG site to be included in panel

    + + +
    max_sd
    +

    Maximum standard deviation of methylation for a site to be included in panel.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/panel_meth_from_counts.html b/docs/reference/panel_meth_from_counts.html new file mode 100644 index 0000000..a3dd576 --- /dev/null +++ b/docs/reference/panel_meth_from_counts.html @@ -0,0 +1,150 @@ + +Make CAMDAC methylation panel from allele counts Methylation fractions are obtained by summing M and UM reads across samples — panel_meth_from_counts • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Make CAMDAC methylation panel from allele counts +Methylation fractions are obtained by summing M and UM reads across samples

    +
    + +
    +
    panel_meth_from_counts(
    +  ac_files,
    +  ac_props = NULL,
    +  min_coverage = 3,
    +  min_samples = 1,
    +  max_sd = 1,
    +  drop_snps = FALSE,
    +  cores = 5
    +)
    +
    + +
    +

    Arguments

    +
    ac_files
    +

    Allele count files from CAMDAC

    + + +
    ac_props
    +

    Proportions of each sample to use in panel. If NULL, samples are weighted by their +total number of reads, which equals the sum of M and UM counts. If samples are NA, then +proportions are redistributed.

    + + +
    min_coverage
    +

    Minimum coverage for a sample's site to be included in panel

    + + +
    min_samples
    +

    Minimum number of samples with coverage for a site to be included in panel

    + + +
    max_sd
    +

    Maximum standard deviation of methylation for a site to be included in panel

    + + +
    drop_snps
    +

    Boolean. If TRUE, drop per-sample CG-SNPs (BAF < 0.1 or BAF > 0.9) from panel

    + + +
    cores
    +

    Number of cores to use for calculating HDI

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/pipeline.html b/docs/reference/pipeline.html new file mode 100644 index 0000000..54c96f7 --- /dev/null +++ b/docs/reference/pipeline.html @@ -0,0 +1,130 @@ + +CAMDAC analysis pipeline — pipeline • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    CAMDAC analysis pipeline

    +
    + +
    +
    pipeline(tumor, germline, infiltrates, origin, config)
    +
    + +
    +

    Arguments

    +
    tumor
    +

    Tumor CamSample() object for deconvolution.

    + + +
    germline
    +

    Patient-matched normal CamSample() object. May be NULL if tumor has CNA calls already.

    + + +
    infiltrates
    +

    Normal CamSample() as a proxy for infiltrating normal methylation.

    + + +
    origin
    +

    Normal CamSample() representing cell of origin for tumor-normal differential methylation.

    + + +
    config
    +

    Configuration built with CamConfig().

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/pipeline_rrbs.html b/docs/reference/pipeline_rrbs.html new file mode 100644 index 0000000..2136265 --- /dev/null +++ b/docs/reference/pipeline_rrbs.html @@ -0,0 +1,130 @@ + +Call CAMDAC for a tumor and patient-matched normal sample — pipeline_rrbs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Call CAMDAC for a tumor and patient-matched normal sample

    +
    + +
    +
    pipeline_rrbs(tumor, germline, infiltrates, origin, config)
    +
    + +
    +

    Arguments

    +
    tumor
    +

    Tumor CamSample object for deconvolution.

    + + +
    germline
    +

    Patient-matched normal CamSample object. May be NULL if tumor has CNA calls already.

    + + +
    infiltrates
    +

    Normal CamSample as a proxy for infiltrating normal methylation.

    + + +
    origin
    +

    Normal CamSample representing cell of origin for tumor-normal differential methylation.

    + + +
    config
    +

    Configuration built with CamConfig().

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/pipeline_wgbs.html b/docs/reference/pipeline_wgbs.html new file mode 100644 index 0000000..9635fb1 --- /dev/null +++ b/docs/reference/pipeline_wgbs.html @@ -0,0 +1,136 @@ + +Run CAMDAC WGBS analysis on a bulk tumor and patient-matched tissue-matched tumor-adjacent normal sample. — pipeline_wgbs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Run CAMDAC WGBS analysis on a bulk tumor and patient-matched tissue-matched tumor-adjacent normal sample.

    +
    + +
    +
    pipeline_wgbs(
    +  tumor,
    +  germline = NULL,
    +  infiltrates = NULL,
    +  origin = NULL,
    +  config
    +)
    +
    + +
    +

    Arguments

    +
    tumor
    +

    Tumor CamSample object for deconvolution.

    + + +
    germline
    +

    Patient-matched normal CamSample object. May be NULL if tumor has CNA calls already.

    + + +
    infiltrates
    +

    Normal CamSample as a proxy for infiltrating normal methylation.

    + + +
    origin
    +

    Normal CamSample representing cell of origin for tumor-normal differential methylation.

    + + +
    config
    +

    Configuration built with CamConfig().

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/plot_2d_density.html b/docs/reference/plot_2d_density.html new file mode 100644 index 0000000..7e8d75a --- /dev/null +++ b/docs/reference/plot_2d_density.html @@ -0,0 +1,119 @@ + +plot_2d_density — plot_2d_density • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    plot_2d_density

    +
    + +
    +
    plot_2d_density(dt, path)
    +
    + +
    +

    Arguments

    +
    dt
    +

    Data table with methylation information per CpG

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored and should be constant for all CAMDAC functions.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/plot_BAF_and_LogR.html b/docs/reference/plot_BAF_and_LogR.html new file mode 100644 index 0000000..9f1768b --- /dev/null +++ b/docs/reference/plot_BAF_and_LogR.html @@ -0,0 +1,124 @@ + +Plot BAF and logR profiles with ggplot — plot_BAF_and_LogR • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot BAF and logR profiles with ggplot

    +
    + +
    +
    plot_BAF_and_LogR(dt, outfile, downsample = 100000)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.frame with methylation info

    + + +
    outfile
    +

    character string with output pdf filename +Saves a pdf w/ methylation rate distribution, biases at polymorphic and +non-polymorphic CG/CCGG and coverage distribution

    + +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/plot_SNP_info.html b/docs/reference/plot_SNP_info.html new file mode 100644 index 0000000..72486dd --- /dev/null +++ b/docs/reference/plot_SNP_info.html @@ -0,0 +1,128 @@ + +Plot SNP data summary and QC — plot_SNP_info • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    plot_SNP_info plots SNP QC

    +
    + +
    +
    plot_SNP_info(dt, outfile, min)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table with SNP info

    + + +
    outfile
    +

    character string with output pdf filename

    + +
    +
    +

    Value

    + + +

    pdf

    +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/plot_methylation_info.html b/docs/reference/plot_methylation_info.html new file mode 100644 index 0000000..ab0d020 --- /dev/null +++ b/docs/reference/plot_methylation_info.html @@ -0,0 +1,140 @@ + +Plot Methylation — plot_methylation_info • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Creates table grob in format that is most common for my usage.

    +
    + +
    +
    plot_methylation_info(df_sample, outfile)
    +
    + +
    +

    Arguments

    +
    df_sample
    +

    data.frame with methylation info

    + + +
    outfile
    +

    character string with output pdf filename

    + + +
    dt
    +

    Data.table that the grob will be made out of

    + + +
    title_v
    +

    Title for display

    + + +
    fontsize_v
    +

    Fontsize for title. Default is 14 (goes well with my_theme)

    + +
    +
    +

    Value

    + + +

    pdf w/ methylation rate distribution, biases at polymorphic and non-polymorphic CG/CCGG and coverage distribution

    +
    +
    +

    Details

    +

    plot_methylation_info returns the df_sample with annotated q-value for each CpG

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/plot_methylation_info_with_anno.html b/docs/reference/plot_methylation_info_with_anno.html new file mode 100644 index 0000000..fbe4d55 --- /dev/null +++ b/docs/reference/plot_methylation_info_with_anno.html @@ -0,0 +1,122 @@ + +Plot methylation information — plot_methylation_info_with_anno • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot methylation information

    +
    + +
    +
    plot_methylation_info_with_anno(dt, path, bulk)
    +
    + +
    +

    Arguments

    +
    dt
    +

    Data table with methylation information per CpG

    + + +
    path
    +

    Character path variable pointing to the desired working directory.

    + + +
    bulk
    +

    Logical determining whether the bulk or purified tumour is to be plotted

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/plot_normal_SNP_info.html b/docs/reference/plot_normal_SNP_info.html new file mode 100644 index 0000000..b22d60e --- /dev/null +++ b/docs/reference/plot_normal_SNP_info.html @@ -0,0 +1,128 @@ + +Plot plots SNP QC — plot_normal_SNP_info • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Plot plots SNP QC

    +
    + +
    +
    plot_normal_SNP_info(dt, outfile, min)
    +
    + +
    +

    Arguments

    +
    dt
    +

    data.table with SNP info

    + + +
    outfile
    +

    character string with output pdf filename

    + +
    +
    +

    Value

    + + +

    pdf

    +
    +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/preprocess_asm.html b/docs/reference/preprocess_asm.html new file mode 100644 index 0000000..e791170 --- /dev/null +++ b/docs/reference/preprocess_asm.html @@ -0,0 +1,118 @@ + +Preprocess a list of CamSample objects for ASM analysis — preprocess_asm • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Preprocess a list of CamSample objects for ASM analysis

    +
    + +
    +
    preprocess_asm(sample_list, config)
    +
    + +
    +

    Arguments

    +
    sample_list
    +

    List of CamSample objects.

    + + +
    config
    +

    CamConfig object.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/preprocess_wgbs.html b/docs/reference/preprocess_wgbs.html new file mode 100644 index 0000000..987628b --- /dev/null +++ b/docs/reference/preprocess_wgbs.html @@ -0,0 +1,118 @@ + +Preprocess a list of CamSample objects for analysis — preprocess_wgbs • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Preprocess a list of CamSample objects for analysis

    +
    + +
    +
    preprocess_wgbs(sample_list, config)
    +
    + +
    +

    Arguments

    +
    sample_list
    +

    List of CamSample objects.

    + + +
    config
    +

    CamConfig object.

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/remove_low_cov_singletons.html b/docs/reference/remove_low_cov_singletons.html new file mode 100644 index 0000000..28e68cc --- /dev/null +++ b/docs/reference/remove_low_cov_singletons.html @@ -0,0 +1,112 @@ + +remove_low_cov_singletons — remove_low_cov_singletons • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Remove low-coverage singleton outliers

    +
    + +
    +
    remove_low_cov_singletons(dt_sample_SNPs, min)
    +
    + +
    +

    Author

    +

    Elizabeth Larose Cadieux

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/round2.html b/docs/reference/round2.html new file mode 100644 index 0000000..51b4cf8 --- /dev/null +++ b/docs/reference/round2.html @@ -0,0 +1,145 @@ + +Round2 — round2 • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Round numerical values to 'n' digits

    +

    Round numerical values to 'n' digits

    +

    Round numerical values to 'n' digits

    +

    Round numerical values to 'n' digits

    +
    + +
    +
    round2(x, digits)
    +
    +round2(x, digits)
    +
    +round2(x, digits)
    +
    +round2(x, digits)
    +
    + +
    +

    Arguments

    +
    x
    +

    Numerical vector containing the numbers to round

    + + +
    digits
    +

    Numerical value representing the number of decimal digits to retain

    + +
    +
    +

    Value

    + + +

    rounded numerical vector

    + + +

    rounded numerical vector

    + + +

    rounded numerical vector

    + + +

    rounded numerical vector

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/run_ASCAT.m.html b/docs/reference/run_ASCAT.m.html new file mode 100644 index 0000000..b374b69 --- /dev/null +++ b/docs/reference/run_ASCAT.m.html @@ -0,0 +1,183 @@ + +Obtain allele-specific copy number profiles, tumour purity and plot SNP data — run_ASCAT.m • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    run_ASCAT.m

    +
    + +
    +
    run_ASCAT.m(
    +  patient_id,
    +  sample_id,
    +  sex,
    +  patient_matched_normal_id = NULL,
    +  path,
    +  path_to_CAMDAC,
    +  build,
    +  min_normal = 10,
    +  min_tumour = 1,
    +  n_cores = 1,
    +  reference_panel_coverage = NULL
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient id number

    + + +
    sample_id
    +

    Character variable with the (control or tumour) sample_id

    + + +
    sex
    +

    Character variable with the patient sex expressed as "XX" for female +or "XY" for male. +This is important for copy number profiling. If sex is unknown, put "XY" for now, +then look at the allelic imbalance (BAF) on X in the germline outside pseudo- +autosomal regions. If there are few to no heterozygous SNPs, the sample is likely male.

    + + +
    patient_matched_normal_id
    +

    Character variable with the sample ID of the matched normal control

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored. +IMPORTANT: The function output directory will be in the path variable working +directory under "./Copy_number/sample_id/".

    + + +
    path_to_CAMDAC
    +

    Character variable containing the path to the CAMDAC dir +including dir name (e.g. "/path/to/CAMDAC/").

    + + +
    build
    +

    Character variable corresponding to the reference genome used for alignment. +CAMDAC is compatible with "hg19", "hg38", "GRCH37","GRCH38".

    + + +
    min_normal
    +

    Numerical value corresponding to the minimum counts for germline +SNPs to be included (default:10)

    + + +
    min_tumour
    +

    Numerical value corresponding to the minimum counts in the tumour +sample for germline SNPs to be included (default:1)

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores for parallel processing

    + + +
    reference_panel_coverage
    +

    Path to the reference panel for the coverage.

    + +
    +
    +

    Value

    + + +

    Three text files with all the CpG loci and their SNP and/or CpG methylation info

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/run_methylation_data_processing.html b/docs/reference/run_methylation_data_processing.html new file mode 100644 index 0000000..7145d5c --- /dev/null +++ b/docs/reference/run_methylation_data_processing.html @@ -0,0 +1,192 @@ + +Filter bulk tumour and normal methylation data, get methylation rate highest density interval (HDI) and plot raw methylation info run_methylation_data_processing — run_methylation_data_processing • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Filter bulk tumour and normal methylation data, get methylation rate highest density interval (HDI) +and plot raw methylation info +run_methylation_data_processing

    +
    + +
    +
    run_methylation_data_processing(
    +  patient_id,
    +  sample_id,
    +  normal_infiltrates_proxy_id,
    +  normal_origin_proxy_id,
    +  path,
    +  min_normal = 10,
    +  min_tumour = 3,
    +  n_cores,
    +  reference_panel_normal_infiltrates = NULL,
    +  reference_panel_normal_origin = NULL
    +)
    +
    + +
    +

    Arguments

    +
    patient_id
    +

    Character variable containing the patient ID

    + + +
    sample_id
    +

    Character variable with the (control or tumour) sample ID

    + + +
    normal_infiltrates_proxy_id
    +

    Character variable with the sample ID of +the tissue-matched normal acting as proxy for the tumour infiltrating +normal cells. Ideally, this is a patient and tissue-matched tumour adjacent normal sample.

    + + +
    normal_origin_proxy_id
    +

    Character variable with the sample ID +of the normal to be used as a proxy for the tumour cell of origin in +differential methylation analyses.

    + + +
    path
    +

    Character path variable pointing to the desired working directory. +This is where the output will be stored.

    + + +
    min_normal
    +

    Numerical value corresponding to the minimum counts threshold for +the normal CpGs to be included

    + + +
    min_tumour
    +

    Numerical value corresponding to the minimum counts threshold +for the tumour sample CpGs to be included

    + + +
    n_cores
    +

    Numerical value corresponding to the number of cores for parallel processing

    + + +
    reference_panel_normal_infiltrates
    +

    Default is NULL. Character string with the complete +path to a reference methylation profile for the tumour normal infiltrates as a .fst file.

    + + +
    reference_panel_normal_origin
    +

    Default is NULL. Character string with the complete +path to your reference methylation profile for the tumour cell of origin as a .fst file.

    +

    If a patient-matched proxy for the normal infiltrates and/or the normal cell of origin is not +available, a reference panel may be constructed from different individuals and used as a substitute.

    +

    The reference samples should be at the very least sex-matched.

    +

    The reference should be saved as a .fst file with the following columns: +CHR start end M_n UM_n m_n cov_n +

    +

    where each row is a CpG or CCpGG with coordinates CHR:start-end +The start and end columns correspond to the 5'-C and 3'-G coordinates, respectively. +M_n is the number of reads supporting the methylated allele +UM_n is the number of reads supporting the unmethylated allele +m_n is the normal methylation rate (M_n / (M_n+UM_n)) +cov_n is the total CpG methylation informative read counts (M_n+UM_n)

    + +
    +
    +

    Value

    + + +

    GRanges object in .RData file

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/sort_genomic_dt.html b/docs/reference/sort_genomic_dt.html new file mode 100644 index 0000000..e60d63a --- /dev/null +++ b/docs/reference/sort_genomic_dt.html @@ -0,0 +1,122 @@ + +sort_genomic_dt — sort_genomic_dt • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    sort_genomic_dt

    +

    Sort a data table with genomic coordinates

    +
    + +
    +
    sort_genomic_dt(dt, with_chr = F)
    +
    +sort_genomic_dt(dt, with_chr = F)
    +
    + +
    +

    Arguments

    +
    dt
    +

    An object that is a data.table

    + + +
    with_chr
    +

    A boolean to indicate whether the chrom field has UCSC (TRUE) or NCBI (FALSE) format

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/split_segments_gr.html b/docs/reference/split_segments_gr.html new file mode 100644 index 0000000..bdf3dd0 --- /dev/null +++ b/docs/reference/split_segments_gr.html @@ -0,0 +1,118 @@ + +Split genome into segments for allele counting — split_segments_gr • CAMDAC + + +
    +
    + + + +
    +
    + + +
    +

    Split genome into segments for allele counting

    +
    + +
    +
    split_segments_gr(segments_file, n_seg_split)
    +
    + +
    +

    Arguments

    +
    segments_file
    +

    An RDS file containing a GRanges object with each chromosome region to split

    + + +
    n_seg_split
    +

    An integer to split each chromosome segment

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.9.

    +
    + +
    + + + + + + + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 0000000..d9cd095 --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,489 @@ + + + + /404.html + + + /LICENSE-text.html + + + /NEMO_DEV.html + + + /articles/contributing.html + + + /articles/experimental.html + + + /articles/index.html + + + /articles/introduction.html + + + /articles/output.html + + + /articles/pipeline.html + + + /articles/questions.html + + + /articles/setup.html + + + /articles/technical.html + + + /authors.html + + + /html/404.html + + + /html/DEV.html + + + /html/LICENSE-text.html + + + /html/articles/contributing.html + + + /html/articles/experimental.html + + + /html/articles/index.html + + + /html/articles/introduction.html + + + /html/articles/output.html + + + /html/articles/pipeline.html + + + /html/articles/questions.html + + + /html/articles/setup.html + + + /html/articles/technical.html + + + /html/authors.html + + + /html/index.html + + + /html/news/index.html + + + /html/reference/CamConfig.html + + + /html/reference/CamSample.html + + + /html/reference/HDIofICDF.html + + + /html/reference/HDIofMCMC.html + + + /html/reference/HDIofMCMC_mt.html + + + /html/reference/LogR_correction.html + + + /html/reference/annotate_copy_number.html + + + /html/reference/ascat.m.plotRawData.html + + + /html/reference/ascat.m.plotSegmentedData.html + + + /html/reference/ascat.plotRawData.flags.html + + + /html/reference/asm_pipeline.html + + + /html/reference/attach_output.html + + + /html/reference/bin_CpGs.html + + + /html/reference/calculate_m_t_hdi.html + + + /html/reference/call_dmps.html + + + /html/reference/call_dmr_routine.html + + + /html/reference/camdac_to_battenberg_prepare_wgbs.html + + + /html/reference/cmain_bind_snps.html + + + /html/reference/cmain_call_cna.html + + + /html/reference/cmain_call_dmps.html + + + /html/reference/cmain_call_dmrs.html + + + /html/reference/cmain_count_alleles.html + + + 
/html/reference/cmain_deconvolve_methylation.html + + + /html/reference/cmain_make_methylation_profile.html + + + /html/reference/cmain_make_snps.html + + + /html/reference/cmain_run_ascat.html + + + /html/reference/cmain_run_battenberg.html + + + /html/reference/collapse_cpg_to_dmr.html + + + /html/reference/compute_tumour_methylome.html + + + /html/reference/cwrap_asm_get_allele_counts.html + + + /html/reference/download_pipeline_files.html + + + /html/reference/format_methylation_df.html + + + /html/reference/format_output.html + + + /html/reference/get_DMPs.html + + + /html/reference/get_DMRs.html + + + /html/reference/get_allele_counts.html + + + /html/reference/get_cluster_counts.html + + + /html/reference/get_differential_methylation.html + + + /html/reference/get_msp1_fragments.html + + + /html/reference/get_pure_tumour_methylation.html + + + /html/reference/get_reference_files.html + + + /html/reference/helper_camdac_pileup.html + + + /html/reference/index.html + + + /html/reference/intervalWidth_r.html + + + /html/reference/load_cna_data.html + + + /html/reference/load_panel_ac_files.html + + + /html/reference/panel_asm_from_counts.html + + + /html/reference/panel_meth_from_beta.html + + + /html/reference/panel_meth_from_counts.html + + + /html/reference/pipeline.html + + + /html/reference/pipeline_rrbs.html + + + /html/reference/pipeline_wgbs.html + + + /html/reference/plot_2d_density.html + + + /html/reference/plot_BAF_and_LogR.html + + + /html/reference/plot_SNP_info.html + + + /html/reference/plot_methylation_info.html + + + /html/reference/plot_methylation_info_with_anno.html + + + /html/reference/plot_normal_SNP_info.html + + + /html/reference/preprocess_asm.html + + + /html/reference/preprocess_wgbs.html + + + /html/reference/remove_low_cov_singletons.html + + + /html/reference/round2.html + + + /html/reference/run_ASCAT.m.html + + + /html/reference/run_methylation_data_processing.html + + + /html/reference/sort_genomic_dt.html + + + 
/html/reference/split_segments_gr.html + + + /index.html + + + /news/index.html + + + /reference/CamConfig.html + + + /reference/CamSample.html + + + /reference/HDIofICDF.html + + + /reference/HDIofMCMC.html + + + /reference/HDIofMCMC_mt.html + + + /reference/LogR_correction.html + + + /reference/annotate_copy_number.html + + + /reference/ascat.m.plotRawData.html + + + /reference/ascat.m.plotSegmentedData.html + + + /reference/ascat.plotRawData.flags.html + + + /reference/asm_pipeline.html + + + /reference/attach_output.html + + + /reference/bin_CpGs.html + + + /reference/calculate_m_t_hdi.html + + + /reference/call_dmps.html + + + /reference/call_dmr_routine.html + + + /reference/camdac_to_battenberg_prepare_wgbs.html + + + /reference/cmain_bind_snps.html + + + /reference/cmain_call_cna.html + + + /reference/cmain_call_dmps.html + + + /reference/cmain_call_dmrs.html + + + /reference/cmain_count_alleles.html + + + /reference/cmain_deconvolve_methylation.html + + + /reference/cmain_make_methylation_profile.html + + + /reference/cmain_make_snps.html + + + /reference/cmain_run_ascat.html + + + /reference/cmain_run_battenberg.html + + + /reference/collapse_cpg_to_dmr.html + + + /reference/compute_tumour_methylome.html + + + /reference/cwrap_asm_get_allele_counts.html + + + /reference/download_pipeline_files.html + + + /reference/format_methylation_df.html + + + /reference/format_output.html + + + /reference/get_DMPs.html + + + /reference/get_DMRs.html + + + /reference/get_allele_counts.html + + + /reference/get_cluster_counts.html + + + /reference/get_differential_methylation.html + + + /reference/get_msp1_fragments.html + + + /reference/get_pure_tumour_methylation.html + + + /reference/get_reference_files.html + + + /reference/helper_camdac_pileup.html + + + /reference/index.html + + + /reference/intervalWidth_r.html + + + /reference/load_cna_data.html + + + /reference/load_panel_ac_files.html + + + /reference/panel_asm_from_counts.html + + + 
/reference/panel_meth_from_beta.html + + + /reference/panel_meth_from_counts.html + + + /reference/pipeline.html + + + /reference/pipeline_rrbs.html + + + /reference/pipeline_wgbs.html + + + /reference/plot_2d_density.html + + + /reference/plot_BAF_and_LogR.html + + + /reference/plot_SNP_info.html + + + /reference/plot_methylation_info.html + + + /reference/plot_methylation_info_with_anno.html + + + /reference/plot_normal_SNP_info.html + + + /reference/preprocess_asm.html + + + /reference/preprocess_wgbs.html + + + /reference/remove_low_cov_singletons.html + + + /reference/round2.html + + + /reference/run_ASCAT.m.html + + + /reference/run_methylation_data_processing.html + + + /reference/sort_genomic_dt.html + + + /reference/split_segments_gr.html + + diff --git a/inst/extdata/pipeline_files_urls.txt b/inst/extdata/pipeline_files_urls.txt index a6a1e09..89a70c8 100644 --- a/inst/extdata/pipeline_files_urls.txt +++ b/inst/extdata/pipeline_files_urls.txt @@ -1,3 +1,2 @@ -rrbs https://www.dropbox.com/s/hu37jtxowasskmr/camdac_rrbs_pipeline_files.tar.gz?dl=0 -wgbs https://www.dropbox.com/s/r1jg1lwor4vt24c/camdac_wgbs_pipeline_files.tar.gz?dl=0 -test https://www.dropbox.com/s/8zojz2mwnuuiyh6/test_wgbs_pipeline_files.tar.gz?dl=0 +rrbs https://zenodo.org/records/10565423/files/camdac_rrbs_pipeline_files.tar.gz +wgbs https://zenodo.org/records/10565423/files/camdac_wgbs_pipeline_files.tar.gz diff --git a/inst/testdata/normal.bam b/inst/testdata/normal.bam new file mode 100644 index 0000000..ceebcb6 Binary files /dev/null and b/inst/testdata/normal.bam differ diff --git a/inst/testdata/normal.bam.bai b/inst/testdata/normal.bam.bai new file mode 100644 index 0000000..54f099c Binary files /dev/null and b/inst/testdata/normal.bam.bai differ diff --git a/inst/testdata/normal_beds_min.sorted.bam b/inst/testdata/normal_beds_min.sorted.bam new file mode 100644 index 0000000..cd2e258 Binary files /dev/null and b/inst/testdata/normal_beds_min.sorted.bam differ diff --git 
a/inst/testdata/normal_beds_min.sorted.bam.bai b/inst/testdata/normal_beds_min.sorted.bam.bai new file mode 100644 index 0000000..155bc99 Binary files /dev/null and b/inst/testdata/normal_beds_min.sorted.bam.bai differ diff --git a/inst/testdata/normal_hg19.bam b/inst/testdata/normal_hg19.bam new file mode 100644 index 0000000..d581f36 Binary files /dev/null and b/inst/testdata/normal_hg19.bam differ diff --git a/inst/testdata/normal_hg19.bam.bai b/inst/testdata/normal_hg19.bam.bai new file mode 100644 index 0000000..15622ca Binary files /dev/null and b/inst/testdata/normal_hg19.bam.bai differ diff --git a/inst/testdata/test.N.SNPs.csv.gz b/inst/testdata/test.N.SNPs.csv.gz new file mode 100644 index 0000000..f79c99d Binary files /dev/null and b/inst/testdata/test.N.SNPs.csv.gz differ diff --git a/inst/testdata/test.SNPs.CpGs.all.sorted.csv.gz b/inst/testdata/test.SNPs.CpGs.all.sorted.csv.gz new file mode 100644 index 0000000..21df95a Binary files /dev/null and b/inst/testdata/test.SNPs.CpGs.all.sorted.csv.gz differ diff --git a/inst/testdata/test.cna.txt b/inst/testdata/test.cna.txt new file mode 100644 index 0000000..2b1433f --- /dev/null +++ b/inst/testdata/test.cna.txt @@ -0,0 +1,11 @@ +chrom start end major_cn minor_cn purity ploidy fit pipeline +13 18173263 19986585 1 4 0.83 4.99604743083004 99.4967458472109 ascat +14 16097083 16137992 2 2 0.83 4.99604743083004 99.4967458472109 ascat +14 18958390 19976201 5 0 0.83 4.99604743083004 99.4967458472109 ascat +15 17037457 17058176 5 0 0.83 4.99604743083004 99.4967458472109 ascat +15 19805699 19961798 5 0 0.83 4.99604743083004 99.4967458472109 ascat +21 5221382 5313005 5 0 0.83 4.99604743083004 99.4967458472109 ascat +21 7259364 10807171 5 0 0.83 4.99604743083004 99.4967458472109 ascat +21 12966778 19980336 5 0 0.83 4.99604743083004 99.4967458472109 ascat +22 10710918 12425547 1 4 0.83 4.99604743083004 99.4967458472109 ascat +22 15166325 19967250 4 1 0.83 4.99604743083004 99.4967458472109 ascat diff --git 
a/inst/testdata/test.to.norm_pos.csv.gz b/inst/testdata/test.to.norm_pos.csv.gz new file mode 100644 index 0000000..9943a4e Binary files /dev/null and b/inst/testdata/test.to.norm_pos.csv.gz differ diff --git a/inst/testdata/test.to.norm_pos_count.csv.gz b/inst/testdata/test.to.norm_pos_count.csv.gz new file mode 100644 index 0000000..5c2e2b5 Binary files /dev/null and b/inst/testdata/test.to.norm_pos_count.csv.gz differ diff --git a/inst/testdata/test_beds_segments.bed b/inst/testdata/test_beds_segments.bed new file mode 100644 index 0000000..b144423 --- /dev/null +++ b/inst/testdata/test_beds_segments.bed @@ -0,0 +1,24 @@ +chr1 1000000 5000000 +chr2 1000000 5000000 +chr3 1000000 5000000 +chr4 1000000 5000000 +chr5 1000000 5000000 +chr6 160000000 170700000 +chr7 1000000 5000000 +chr8 1000000 5000000 +chr9 1000000 5000000 +chr10 1000000 5000000 +chr11 1000000 5000000 +chr12 1000000 5000000 +chr13 1000000 50000000 +chr14 1000000 50000000 +chr15 1000000 50000000 +chr16 1000000 5000000 +chr17 1000000 5000000 +chr18 1000000 5000000 +chr19 1000000 5000000 +chr20 1000000 5000000 +chr21 1000000 50000000 +chr22 1000000 50000000 +chrX 1000000 10000000 +chrY 1000000 10000000 diff --git a/inst/testdata/test_het_snps.tsv b/inst/testdata/test_het_snps.tsv new file mode 100644 index 0000000..33e504d --- /dev/null +++ b/inst/testdata/test_het_snps.tsv @@ -0,0 +1,169 @@ +chrom pos ref alt +1 1724489 G C +1 1729539 C A +1 1737504 A C +1 1771055 A T +1 2346801 C T +1 2364339 A C +1 2840944 A T +1 2840950 T A +1 2840962 A T +10 1181358 A C +10 1198080 T G +10 1220643 A T +10 1374277 G C +10 1385353 A G +10 1442527 C A +13 18178016 A G +14 19679893 A G +14 19733813 A C +14 19733881 T A +14 19774834 T A +14 19806811 G A +14 19821399 C T +14 19824618 A T +14 19833436 G T +14 19833453 C T +14 19855616 A T +14 19867960 A C +14 19874714 G T +14 19892060 A T +14 19921715 T G +16 1012447 C G +18 1072749 C G +18 1168346 T G +19 1383650 G T +2 1292894 T C +2 1316532 C A +2 1316533 G C +2 
1362797 T G +2 1402287 A G +2 1408510 A C +2 1415357 T C +20 1469919 A T +21 7259364 A C +21 7936369 T C +21 7948037 A G +21 8686412 C G +21 8989048 C T +21 8995278 C T +21 8998013 A G +21 9738783 A G +21 9785962 G A +21 9841402 C G +21 9887931 T G +21 9960798 T G +21 10270911 T G +21 10427641 A G +21 10753820 A G +21 10755478 A C +21 10807171 A G +21 13001675 T A +21 13001841 A G +21 13033467 A C +21 13247733 G T +21 13259327 A C +21 13270891 C A +21 13377146 G T +21 13381955 C G +21 13890762 C A +21 13900285 C T +21 14222318 G T +21 14245926 C G +21 14245979 A G +21 14525718 T C +21 14525831 T G +21 14548430 G C +21 14755469 A G +21 14756350 G T +21 14810713 A C +21 14813541 A C +21 14814448 T A +21 14819403 G T +21 14821441 G C +21 14976936 T A +21 15041337 G C +21 15047405 T G +21 15062105 C A +21 15227918 T G +21 15235469 A C +21 16297562 A C +21 16645845 T G +21 16675274 T C +21 17214086 A T +21 17602576 G C +21 18339467 T G +21 18695826 C A +21 18695871 G T +21 18702306 G T +21 18708616 T A +21 18713129 C T +21 18723169 A C +21 18727736 A T +21 18731034 T A +21 18731169 A T +21 18731785 C G +21 18861439 A G +21 18875613 C G +21 18999111 T G +21 19029469 G C +21 19078452 C G +21 19201084 C G +21 19222906 A C +21 19233989 T C +21 19443363 C G +21 19475516 C A +21 19499340 G T +21 19505980 G C +21 19506712 C T +21 19563075 G C +21 19565052 C A +21 19565375 A C +21 19637435 T C +21 19651517 C G +21 19745095 G T +21 19746958 G T +21 19747921 C A +21 19825291 C A +22 10772683 A G +22 10939406 A G +22 16056919 T G +22 16440993 G C +22 16444061 A G +22 16466117 T C +22 16729428 A T +22 16757508 T G +22 16768054 G A +22 16774406 A T +22 16850808 T G +22 16890808 C A +22 16943075 C A +22 16943472 C T +22 16964612 A C +22 17004233 G T +22 17004544 T A +22 17405801 C T +22 18159378 A G +22 18950000 T A +22 19796334 G C +22 19951909 G A +3 1037945 T C +3 1173329 A C +4 1051535 C G +5 1307868 T A +5 1310037 G A +5 1440165 T G +6 1282224 G C +6 1351468 G C +7 1187645 G C 
+7 1485927 T G +8 1378750 A C +9 1137878 C G +9 1176636 G T +9 1344641 T A +9 1350093 T G +X 1021944 C T +X 1021969 A C +X 1069058 G T +X 1214993 G A +X 1220532 T A diff --git a/inst/testdata/test_panel.m.csv.gz b/inst/testdata/test_panel.m.csv.gz new file mode 100644 index 0000000..49a1191 Binary files /dev/null and b/inst/testdata/test_panel.m.csv.gz differ diff --git a/inst/testdata/test_panel_from_beta.csv b/inst/testdata/test_panel_from_beta.csv new file mode 100644 index 0000000..f10a2d2 --- /dev/null +++ b/inst/testdata/test_panel_from_beta.csv @@ -0,0 +1,18 @@ +chrom,start,end,a,b,c +13,18173666,18173667,0.145521255908534,, +13,18174070,18174071,0.098293288378045,0.734082729090005,0.0743192497550395 +13,18174203,18174204,0.0889053461141884,0.65553721296601,0.000206400660049026 +13,18186169,18186170,0.585936168441549,0.402733342023566,0.131107373766366 +13,18213719,18213720,0.507491068914533,0.433371701044962,0.062174885505301 +13,18231099,18231100,0.89573905733414,0.0613525717053562,0.0473284388405029 +13,18231410,18231411,0.134111179271713,0.20273268641904,0.0538401282697869 +13,18231437,18231438,0.0657471548765898,0.281406426336616,0.0422733099558735 +13,18231447,18231448,0.933853001333773,0.773583886912093,0.0526508528865518 +13,18231466,18231467,0.166047691600397,0.786161274416372,0.0060758869761556 +13,18231584,18231585,0.0435942148324102,0.407993582310155,0.202396970951763 +13,18244976,18244977,0.406777470838279,0.266518469434232,0.152827903559774 +13,18255051,18255052,0.499927770113573,0.0216015297919512,0.029949741054729 +13,18276304,18276305,0.407744152704254,0.0405331093352288,0.0247275754469668 +13,18299353,18299354,0.53362335334532,0.641128221526742,0.150880568616513 +13,18299412,18299413,0.966818436281756,0.499422115273774,0.186131283951183 +13,18316131,18316132,0.76186578348279,0.751277812989429,0.102938770015714 diff --git a/inst/testdata/test_prop.SNPs.CpGs.all.sorted.csv.gz b/inst/testdata/test_prop.SNPs.CpGs.all.sorted.csv.gz new file mode 
100644 index 0000000..ed66d7e Binary files /dev/null and b/inst/testdata/test_prop.SNPs.CpGs.all.sorted.csv.gz differ diff --git a/inst/testdata/test_segments.bed b/inst/testdata/test_segments.bed new file mode 100644 index 0000000..6e6a6d9 --- /dev/null +++ b/inst/testdata/test_segments.bed @@ -0,0 +1,5 @@ +chr1 3000000 3100000 +chr1 3300001 3400000 +chr1 3400001 3500000 +chr1 3700001 3800000 +chr1 3950000 4000000 \ No newline at end of file diff --git a/inst/testdata/test_tnsnps.csv.gz b/inst/testdata/test_tnsnps.csv.gz new file mode 100644 index 0000000..f39719c Binary files /dev/null and b/inst/testdata/test_tnsnps.csv.gz differ diff --git a/inst/testdata/test_tsnps.csv.gz b/inst/testdata/test_tsnps.csv.gz new file mode 100644 index 0000000..cab968a Binary files /dev/null and b/inst/testdata/test_tsnps.csv.gz differ diff --git a/inst/testdata/test_wgbs_segments.bed b/inst/testdata/test_wgbs_segments.bed new file mode 100644 index 0000000..f67334f --- /dev/null +++ b/inst/testdata/test_wgbs_segments.bed @@ -0,0 +1,24 @@ +chr1 1000000 5000000 +chr2 1000000 5000000 +chr3 1000000 5000000 +chr4 1000000 5000000 +chr5 1000000 5000000 +chr6 1000000 5000000 +chr7 1000000 5000000 +chr8 1000000 5000000 +chr9 1000000 5000000 +chr10 1000000 5000000 +chr11 1000000 5000000 +chr12 1000000 5000000 +chr13 1000000 50000000 +chr14 1000000 50000000 +chr15 1000000 50000000 +chr16 1000000 5000000 +chr17 1000000 5000000 +chr18 1000000 5000000 +chr19 1000000 5000000 +chr20 1000000 5000000 +chr21 1000000 50000000 +chr22 1000000 50000000 +chrX 1000000 10000000 +chrY 1000000 10000000 diff --git a/inst/testdata/test_wgbs_small.bed b/inst/testdata/test_wgbs_small.bed new file mode 100644 index 0000000..e10c179 --- /dev/null +++ b/inst/testdata/test_wgbs_small.bed @@ -0,0 +1,24 @@ +chr1 4500000 5000000 +chr2 4500000 5000000 +chr3 4500000 5000000 +chr4 4500000 5000000 +chr5 4500000 5000000 +chr6 4500000 5000000 +chr7 4500000 5000000 +chr8 4500000 5000000 +chr9 4500000 5000000 +chr10 4500000 
5000000 +chr11 4500000 5000000 +chr12 4500000 5000000 +chr13 4500000 50000000 +chr14 4500000 50000000 +chr15 4500000 50000000 +chr16 4500000 5000000 +chr17 4500000 5000000 +chr18 4500000 5000000 +chr19 4500000 5000000 +chr20 4500000 5000000 +chr21 4500000 50000000 +chr22 4500000 50000000 +chrX 4500000 10000000 +chrY 4500000 10000000 diff --git a/inst/testdata/testdata_info.txt b/inst/testdata/testdata_info.txt new file mode 100644 index 0000000..c164367 --- /dev/null +++ b/inst/testdata/testdata_info.txt @@ -0,0 +1,2 @@ +test_tumor.bam - generated by downsampling NA18939 WGS +test_normal.bam - generated by downsampling NA20502 WGS \ No newline at end of file diff --git a/inst/testdata/tumor.bam b/inst/testdata/tumor.bam new file mode 100644 index 0000000..1539d61 Binary files /dev/null and b/inst/testdata/tumor.bam differ diff --git a/inst/testdata/tumor.bam.bai b/inst/testdata/tumor.bam.bai new file mode 100644 index 0000000..6b9fc1a Binary files /dev/null and b/inst/testdata/tumor.bam.bai differ diff --git a/inst/testdata/tumor.cna.txt b/inst/testdata/tumor.cna.txt new file mode 100644 index 0000000..2b1433f --- /dev/null +++ b/inst/testdata/tumor.cna.txt @@ -0,0 +1,11 @@ +chrom start end major_cn minor_cn purity ploidy fit pipeline +13 18173263 19986585 1 4 0.83 4.99604743083004 99.4967458472109 ascat +14 16097083 16137992 2 2 0.83 4.99604743083004 99.4967458472109 ascat +14 18958390 19976201 5 0 0.83 4.99604743083004 99.4967458472109 ascat +15 17037457 17058176 5 0 0.83 4.99604743083004 99.4967458472109 ascat +15 19805699 19961798 5 0 0.83 4.99604743083004 99.4967458472109 ascat +21 5221382 5313005 5 0 0.83 4.99604743083004 99.4967458472109 ascat +21 7259364 10807171 5 0 0.83 4.99604743083004 99.4967458472109 ascat +21 12966778 19980336 5 0 0.83 4.99604743083004 99.4967458472109 ascat +22 10710918 12425547 1 4 0.83 4.99604743083004 99.4967458472109 ascat +22 15166325 19967250 4 1 0.83 4.99604743083004 99.4967458472109 ascat diff --git 
a/inst/testdata/tumor_hg19.bam b/inst/testdata/tumor_hg19.bam new file mode 100644 index 0000000..1275374 Binary files /dev/null and b/inst/testdata/tumor_hg19.bam differ diff --git a/inst/testdata/tumor_hg19.bam.bai b/inst/testdata/tumor_hg19.bam.bai new file mode 100644 index 0000000..3565a80 Binary files /dev/null and b/inst/testdata/tumor_hg19.bam.bai differ diff --git a/inst/testdata/tumour_beds_min.sorted.bam b/inst/testdata/tumour_beds_min.sorted.bam new file mode 100644 index 0000000..c9df6be Binary files /dev/null and b/inst/testdata/tumour_beds_min.sorted.bam differ diff --git a/inst/testdata/tumour_beds_min.sorted.bam.bai b/inst/testdata/tumour_beds_min.sorted.bam.bai new file mode 100644 index 0000000..90a070c Binary files /dev/null and b/inst/testdata/tumour_beds_min.sorted.bam.bai differ diff --git a/tests/testthat.R b/tests/testthat.R old mode 100644 new mode 100755 index 78b1a88..818f76b --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,12 +1,4 @@ -# This file is part of the standard setup for testthat. -# It is recommended that you do not modify it. -# -# Where should you do additional test configuration? -# Learn more about the roles of various files in: -# * https://r-pkgs.org/tests.html -# * https://testthat.r-lib.org/reference/test_package.html#special-files - library(testthat) library(CAMDAC) -test_check("CAMDAC") +test_check("CAMDAC") \ No newline at end of file diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R new file mode 100644 index 0000000..4129804 --- /dev/null +++ b/tests/testthat/setup.R @@ -0,0 +1,22 @@ +# Global text fixtures that may be used by tests. 
+bam <- system.file("testdata", "tumour_beds_min.sorted.bam", package = "CAMDAC") +bam2 <- system.file("testdata", "normal_beds_min.sorted.bam", package = "CAMDAC") +regions <- system.file("testdata", "test_beds_segments.bed", package = "CAMDAC") + +config <- CamConfig( + outdir = "./result_test", + bsseq = "wgbs", + build = "hg38", + lib = "pe", + regions = regions, # Speed up tests + n_cores = 10, + min_cov = 1, # Required to capture sufficient SNPs from test + min_normal_cov = 1, + min_mapq = 1 +) + +tumor <- CamSample(id = "T", sex = "XY", bam = bam) +normal <- CamSample(id = "N", sex = "XY", bam = bam2) + +# Cleanup, as presented in https://testthat.r-lib.org/articles/test-fixtures.html +withr::defer(teardown_env()) diff --git a/tests/testthat/test-allele_count_full.R b/tests/testthat/test-allele_count_full.R new file mode 100644 index 0000000..3a0ead2 --- /dev/null +++ b/tests/testthat/test-allele_count_full.R @@ -0,0 +1,19 @@ +test_that("Tumor allele counting completes for test sample", { + bam <- system.file("testdata", "tumor.bam", package = "CAMDAC") + + # Create test config for segments only + config_t <- config + config_t$outdir <- "./result_test_ac_full" + withr::defer(fs::dir_delete(config_t$outdir)) + + # Run allele counting + tumor <- CamSample(id = "T", sex = "XY", bam = bam) + cmain_count_alleles(tumor, config_t) + ac_file <- get_fpath(tumor, config_t, "counts") + + testthat::expect_true( + fs::file_exists( + ac_file + ) + ) +}) diff --git a/tests/testthat/test-asm-counter.R b/tests/testthat/test-asm-counter.R new file mode 100644 index 0000000..5b069bc --- /dev/null +++ b/tests/testthat/test-asm-counter.R @@ -0,0 +1,34 @@ +testthat::skip("ASM not implemented.") + +test_that("ASM allele counter runs", { + # Load hets + # Attach SNPs to tumor and normal objects + hets_file <- system.file("testdata", "test_het_snps.tsv", package = "CAMDAC") + + asm_config <- CamConfig( + outdir = "./result_asm_counter", + bsseq = "wgbs", + build = "hg38", + lib = 
"pe", + n_cores = 10, + min_cov = 1 # Required to capture sufficient SNPs from test + ) + + attach_output(tumor, asm_config, "asm_snps", hets_file) + attach_output(normal, asm_config, "asm_snps", hets_file) + + # Run ASM allele counter + cmain_asm_allele_counts(tumor, asm_config) + cmain_asm_allele_counts(normal, asm_config) + + # Test for expected output files + exp_files <- c( + get_fpath(tumor, asm_config, "asm_counts"), + get_fpath(tumor, asm_config, "asm_hap_stats"), + get_fpath(tumor, asm_config, "asm_phase_map"), + get_fpath(normal, asm_config, "asm_counts"), + get_fpath(normal, asm_config, "asm_hap_stats"), + get_fpath(normal, asm_config, "asm_phase_map") + ) + testthat::expect_true(all(file.exists(exp_files))) +}) diff --git a/tests/testthat/test-asm-pipeline-after-main.R b/tests/testthat/test-asm-pipeline-after-main.R new file mode 100644 index 0000000..6715143 --- /dev/null +++ b/tests/testthat/test-asm-pipeline-after-main.R @@ -0,0 +1,31 @@ +testthat::skip("ASM not implemented.") + +test_that("ASM pipeline runs", { + # Run main + + main_config <- CamConfig( + outdir = "./result_test", + bsseq = "wgbs", + build = "hg38", + lib = "pe", + regions = regions, # Speed up tests + n_cores = 10, + min_cov = 1, # Required to capture sufficient SNPs from test + cna_caller = "ascat" # Battenberg always recommended. 
ASCAT used here to speed up test + ) + + # Run main (skips if outputs exist) + pipeline(tumor, germline = normal, infiltrates = NULL, origin = NULL, config) + + asm_pipeline( + tumor = tumor, + germline = normal, + infiltrates = normal, + origin = normal, + config = config + ) + + # Confirm that final output file is created + asm_out <- get_fpath(tumor, asm_config, "asm_dmp") + expect_true(file.exists(asm_out)) +}) diff --git a/tests/testthat/test-asm-pipeline.R b/tests/testthat/test-asm-pipeline.R new file mode 100644 index 0000000..d9bece0 --- /dev/null +++ b/tests/testthat/test-asm-pipeline.R @@ -0,0 +1,26 @@ +testthat::skip("ASM not implemented.") + +test_that("ASM pipeline runs", { + # Setup config + asm_config <- CamConfig( + outdir = "./result_asm", bsseq = "wgbs", lib = "pe", + build = "hg38", n_cores = 3, min_cov = 1, cna_caller = "ascat" + ) + + # Add ASM CNA caller + attach_output(tumor, asm_config, "asm_cna", system.file("testdata", "tumor.cna.txt", package = "CAMDAC")) + attach_output(tumor, asm_config, "asm_snps", system.file("testdata", "test_het_snps.tsv", package = "CAMDAC")) + attach_output(normal, asm_config, "asm_snps", system.file("testdata", "test_het_snps.tsv", package = "CAMDAC")) + + asm_pipeline( + tumor = tumor, + germline = normal, + infiltrates = normal, + origin = normal, + config = asm_config + ) + + # Confirm that final output file is created + asm_out <- get_fpath(tumor, asm_config, "asm_dmp") + expect_true(file.exists(asm_out)) +}) diff --git a/tests/testthat/test-attach-output.R b/tests/testthat/test-attach-output.R new file mode 100644 index 0000000..1324f95 --- /dev/null +++ b/tests/testthat/test-attach-output.R @@ -0,0 +1,26 @@ +test_that("attach output function writes files", { + outdir <- tempdir() + + config <- CamConfig( + outdir = outdir, + bsseq = "wgbs", + build = "hg38", + lib = "pe", + n_cores = 10 + ) + + tumor <- CamSample(id = "T", sex = "XY", bam = bam) + + # Get expected file and test that it doesn't exist + exp 
<- get_fpath(tumor, config, "counts") + if (file.exists(exp)) { + fs::file_delete(exp) + } + + # Run attach_output + counts_file <- system.file("testdata", "test.SNPs.CpGs.all.sorted.csv.gz", package = "CAMDAC") + attach_output(tumor, config, "counts", counts_file) + + # Test that file exists + testthat::expect_true(file.exists(exp)) +}) diff --git a/tests/testthat/test-cna.R b/tests/testthat/test-cna.R new file mode 100644 index 0000000..b2604bb --- /dev/null +++ b/tests/testthat/test-cna.R @@ -0,0 +1,33 @@ +testthat::skip("Invalid test data.") +test_that("ascat and battenberg runs on wgbs samples", { + # CNA caller test config + config_c <- CamConfig( + outdir = "./result_cna", + bsseq = "wgbs", + build = "hg38", + lib = "pe", + regions = regions, + n_cores = 30, + min_cov = 1 # Required to capture sufficient SNPs from test + ) + withr::defer(fs::dir_delete(config_c$outdir)) + + # Attach tsnps file to tumor + tsnps_file <- system.file("testdata", "test_tsnps.csv.gz", package = "CAMDAC") + attach_output(tumor, config_c, "tsnps", tsnps_file) + + # Reset + cna_file <- get_fpath(tumor, config_c, "cna") + if (fs::file_exists(cna_file)) { + fs::file_delete(cna_file) + } + + # Test CNA and expect file exists after ASCAT + config_c$cna_caller <- "ascat" + cmain_call_cna(tumor, config_c) + + tool <- fread(cna_file)$pipeline[[1]] + testthat::expect_equal(tool, "ascat") + fs::file_delete(cna_file) + +}) diff --git a/tests/testthat/test-external-cna.R b/tests/testthat/test-external-cna.R new file mode 100644 index 0000000..e8aa48f --- /dev/null +++ b/tests/testthat/test-external-cna.R @@ -0,0 +1,28 @@ +testthat::skip("Invalid test data.") +test_that("CAMDAC deconvolves using pre-computed CNA data", { + # Load external CNA object + cna_file <- system.file("testdata", "test.cna.txt", package = "CAMDAC") + + # Delete expected CNA file if it exists + exp <- get_fpath(tumor, config, "cna") + if (fs::file_exists(exp)) { + fs::file_delete(exp) + } + + # Objects for tumor, normal 
and config defined in `tests/testthat/setup.R` + # Add expected CNA file + attach_output(tumor, config, "cna", cna_file) + + # Test that pipeline runs without calling CNAs + stdout <- testthat::capture_output( + pipeline(tumor, germline = NULL, infiltrates = normal, origin = normal, config) + ) + + testthat::expect_true( + stringr::str_detect(stdout, "CNA Found.") + ) + + testthat::expect_true( + stringr::str_detect(stdout, "pipeline complete") + ) +}) diff --git a/tests/testthat/test-hg19-single-end.R b/tests/testthat/test-hg19-single-end.R new file mode 100644 index 0000000..3159cd9 --- /dev/null +++ b/tests/testthat/test-hg19-single-end.R @@ -0,0 +1,47 @@ +testthat::skip("Invalid test data.") +test_that("single-end allele counter runs on hg19 samples samples", { + # CNA caller test config + config_hg19 <- CamConfig( + outdir = "./result_hg19", + bsseq = "wgbs", + build = "hg19", + lib = "se", + n_cores = 10, + min_cov = 1 # Required to capture sufficient SNPs from test + ) + + # Test that beagle found for hg19 + testthat::expect_true(file.exists(config_hg19$beaglejar)) + # Test that caller is not battenberg (not implemented yet for hg19) + testthat::expect_true(config_hg19$cna_caller != "battenberg") + + # Run allele count on a small segment and confirm that BAM file with non `chr` mapping is processed + # bam <- system.file("testdata", "normal_hg19.bam", package = "CAMDAC") + # seg_regions <- system.file("testdata", "test_segments.bed", package = "CAMDAC") + # normal <- CamSample(id = "N", sex = "XY", bam = bam) + # ac_file <- get_fpath(normal, config_hg19, "counts") + # config_hg19$regions <- seg_regions + # cmain_count_alleles(normal, config_hg19) + # testthat::expect_true(fs::file_exists(ac_file)) + # fs::file_delete(ac_file) # Remove file to use new segments without overwriting. 
+ + # Test full pipeline on hg19 + ## Overwrite regions for full test + regions <- system.file("testdata", "test_wgbs_segments.bed", package = "CAMDAC") + config_hg19$regions = regions + + tumor <- CamSample(id = "T", sex = "XY", + bam = system.file("testdata", "tumor_hg19.bam", package = "CAMDAC")) + normal <- CamSample(id = "N", sex = "XY", + bam = system.file("testdata", "normal_hg19.bam", package = "CAMDAC")) + stdout <- testthat::capture_output( + pipeline(tumor, germline = normal, infiltrates = normal, origin = normal, config_hg19) + ) + + expr <- "pipeline complete" + + testthat::expect_true( + stringr::str_detect(stdout, expr) + ) + +}) diff --git a/tests/testthat/test-hg19.R b/tests/testthat/test-hg19.R new file mode 100644 index 0000000..e10483f --- /dev/null +++ b/tests/testthat/test-hg19.R @@ -0,0 +1,47 @@ +testthat::skip("Invalid test data.") + +test_that("CAMDAC runs with battenberg in hg19 mode", { + # CNA caller test config + config_hg19 <- CamConfig( + outdir = "./result_hg19_pe", + bsseq = "wgbs", + build = "hg19", + lib = "pe", + # hg19 Battenberg will fail as not enough SNPs on each hg19/hg38 chrom for haplotyping to work. + # Hence, can't use tumor-normal and must SNP-inject? + cna_caller = "battenberg", + n_cores = 10, + min_cov = 1 # Required to capture sufficient SNPs from test + ) + config_hg19$regions = system.file("testdata", "test_wgbs_segments.bed", package = "CAMDAC") + + # Mock het SNP selection. 
As we're not using true tumor-normal pairs, + # normal SNP selection on BAF 0.2 and 0.8 will fail to yield sufficient SNPs + local_mocked_bindings( + select_heterozygous_snps = function(tsnps, ...){ + return(tsnps) + } + ) + + local_mocked_bindings( + get.chrom.names = function(...){ + c("3", "9") + }, + .package="Battenberg" + ) + + # Try again, simply replacing tsnps file + tsnps_file <- system.file("testdata", "test_tsnps.csv.gz", package = "CAMDAC") + attach_output(tumor, config_hg19, "tsnps", tsnps_file) + + stdout <- testthat::capture_output( + pipeline(tumor, germline = normal, infiltrates = normal, origin = normal, config_hg19) + ) + + expr <- "pipeline complete" + + testthat::expect_true( + stringr::str_detect(stdout, expr) + ) + +}) diff --git a/tests/testthat/test-panel.R b/tests/testthat/test-panel.R new file mode 100644 index 0000000..0322b6f --- /dev/null +++ b/tests/testthat/test-panel.R @@ -0,0 +1,104 @@ +test_that("allele counts combine to form panels given sample proportions", { + # Test panel using two samples that should return 1 eligible CpG + # Confirm that the panel methylation is a mixture of the two samples: + # sample 1 has a methylation of 1 and will be present at 20% + # sample 2 has a methylation of 0.5 and will be present at 80% + # expected panel methylation at this site should be 0.6 + # expected panel coverage for this site should be 20% from sample 1 and 80% from sample 2 + ac_sample1 <- system.file("testdata", "test.SNPs.CpGs.all.sorted.csv.gz", package = "CAMDAC") + ac_sample2 <- system.file("testdata", "test_prop.SNPs.CpGs.all.sorted.csv.gz", package = "CAMDAC") + + panel <- panel_meth_from_counts( + ac_files = c(ac_sample1, ac_sample2), + ac_props = c(0.2, 0.8), + min_coverage = 3, + min_samples = 1, + max_sd = 0.8, + drop_snps = TRUE + ) + + # Test panel is a data table + expect_is(panel, "data.table") + # Test panel has expected rows + expect_true(nrow(panel) >= 1) + # Test panel has the expected columns + expected_fields <- 
c("chrom", "start", "end", "M", "UM", "m", "cov") + expect_true( + all(expected_fields %in% names(panel)) + ) + + # Test panel has the expected methylation + test_cg <- panel[chrom == "13" & start == 18231437 & end == 18231438, ] + expect_equal(test_cg$m, 0.6) + + # Test panel has the expected coverage + expect_equal(test_cg$cov, 12) + + # Test panel default behaviour is to return the sum of counts + panel_default <- panel_meth_from_counts( + ac_files = c(ac_sample1, ac_sample2), + min_coverage = 3, + min_samples = 1, + max_sd = 0.9, + drop_snps = TRUE + ) + test_cg2 <- panel_default[chrom == "13" & start == 18231437 & end == 18231438, ] + expect_equal(test_cg2$M, 8) + expect_equal(test_cg2$UM, 4) + expect_equal(test_cg2$m, (8 / (8 + 4))) + +# Test panel can be built from a matrix of beta values +data <- data.table::fread( + system.file("testdata", "test_panel_from_beta.csv", package = "CAMDAC") +) +mat = data[, 4:ncol(data)] + +panel_beta <- panel_meth_from_beta( + mat = mat, + chrom = data$chrom, + start = data$start, + end = data$end, + cov = 100, # Single value for coverage given to all CpGs. + props = c(0.1, 0.8, 0.1), + min_samples = 1, + max_sd = 1 +) + +test_cg3 <- panel_beta[chrom == "13" & start == 18231437 & end == 18231438, ] +expect_equal(round(test_cg3$m, 2), 0.24) # Expect linear combination of three betas +expect_equal(test_cg3$M + test_cg3$UM, 100) # Expect cg cov to meet input value +test_cg4 <- panel_beta[chrom == "13" & start == 18173666 & end == 18173667, ] +expect_equal(round(test_cg4$m, 2), 0.15) + +# Test panel can be created from a matrix of beta values and a vector +data <- data.table::fread( + system.file("testdata", "test_panel_from_beta.csv", package = "CAMDAC") +)[1:5,] +mat = data[, 4:ncol(data)] + +panel_cov <- panel_meth_from_beta( + mat = mat, + chrom = data$chrom, + start = data$start, + end = data$end, + cov = c(10, 1, 10, 1, 10), # Vector of coverage values, one per CpG row.
+ props = c(0.1, 0.8, 0.1), + min_samples = 1, + max_sd = 1 +) + +panel_cov2 <- panel_meth_from_beta( + mat = mat, + chrom = data$chrom, + start = data$start, + end = data$end, + cov = matrix(30, nrow=5, ncol=3), # Matrix of coverage values (5 CpG rows x 3 columns). + props = c(0.1, 0.8, 0.1), + min_samples = 1, + max_sd = 1 +) + +expect_equal(panel_cov$cov, c(10, 1, 10, 1, 10)) +expect_equal(panel_cov2$cov, c(30, 30, 30, 30, 30)) + +}) diff --git a/tests/testthat/test-pipeline-panel.R b/tests/testthat/test-pipeline-panel.R new file mode 100644 index 0000000..2648c0e --- /dev/null +++ b/tests/testthat/test-pipeline-panel.R @@ -0,0 +1,26 @@ +testthat::skip("Invalid test data.") +# FIX +test_that("tumor panel pipeline runs with panel of normals", { + # Create test config + config_p <- config + config_p$outdir <- "result_panel" + + # Get internal datasets + cna_file <- system.file("testdata", "test.cna.txt", package = "CAMDAC") + meth_file <- system.file("testdata", "test_panel.m.csv.gz", package = "CAMDAC") + + # Setup panel + panel <- CamSample(id = "PANEL", sex = "XY", bam = NULL) + attach_output(panel, config_p, "meth", meth_file) + attach_output(tumor, config_p, "cna", cna_file) + + # Test pipeline runs + stdout <- testthat::capture_output( + pipeline(tumor, germline = normal, infiltrates = panel, origin = panel, config_p) + ) + + expr <- "pipeline complete" + testthat::expect_true( + stringr::str_detect(stdout, expr) + ) +}) diff --git a/tests/testthat/test-pipeline.R b/tests/testthat/test-pipeline.R new file mode 100644 index 0000000..7c7581a --- /dev/null +++ b/tests/testthat/test-pipeline.R @@ -0,0 +1,13 @@ +testthat::skip("Invalid test data.") + +test_that("Pipeline tumour-normal completes", { + stdout <- testthat::capture_output( + pipeline(tumor, germline = normal, infiltrates = normal, origin = normal, config) + ) + + expr <- "pipeline complete" + + testthat::expect_true( + stringr::str_detect(stdout, expr) + ) +}) diff --git 
a/tests/testthat/test-rrbs_tumor_normal.R b/tests/testthat/test-rrbs_tumor_normal.R index ca3f19b..a91de67 100644 --- a/tests/testthat/test-rrbs_tumor_normal.R +++ b/tests/testthat/test-rrbs_tumor_normal.R @@ -1,6 +1,7 @@ +testthat::skip("Long-running function.") + test_that("rrbs tumor_normal_pipeline", { - # Log test run - logger::log_info("Running test: rrbs tumor_normal_pipeline. Estimated time: 25 minutes.") + # rrbs tumor_normal_pipeline testing. Estimated time: 25 minutes.") # Ensure pipeline files are downloaded pf_dir <- "pf_rrbs" diff --git a/tests/testthat/test-segments_bed.R b/tests/testthat/test-segments_bed.R new file mode 100644 index 0000000..c428f6d --- /dev/null +++ b/tests/testthat/test-segments_bed.R @@ -0,0 +1,39 @@ +testthat::skip("Invalid test data.") + +test_that("allele counting regions can be read from BED file", { + bam <- system.file("testdata", "tumour_beds_min.sorted.bam", package = "CAMDAC") + seg_regions <- system.file("testdata", "test_beds_segments.bed", package = "CAMDAC") + + # Create test config for segments only + config_t <- config + config_t$outdir <- "./result_test_segments_bed" + config_t$regions <- seg_regions + config_t$overwrite <- TRUE + withr::defer(fs::dir_delete(config_t$outdir)) + + # Run allele counting + tumor <- CamSample(id = "T", sex = "XY", bam = bam) + cmain_count_alleles(tumor, config_t) + ac_file <- get_fpath(tumor, config_t, "counts") + + testthat::expect_true( + fs::file_exists( + ac_file + ) + ) + + # Check that allele counts regions do not extend beyond the segments given + dt <- fread_chrom(ac_file) + regs <- data.table::fread(seg_regions) + names(regs) <- c("chrom", "start", "end") + + testthat::expect_equal( + unique(dt$CHR), + unique(regs$chrom) + ) + + all_within <- max(dt$start) <= max(regs$end) && min(dt$end) >= min(regs$start) + testthat::expect_true( + all_within + ) +}) diff --git a/tests/testthat/test-test_download_pipeline_files.R b/tests/testthat/test-test_download_pipeline_files.R index 
5c96cfb..29d77a7 100644 --- a/tests/testthat/test-test_download_pipeline_files.R +++ b/tests/testthat/test-test_download_pipeline_files.R @@ -1,3 +1,5 @@ +testthat::skip("Test download URL not implemented.") + test_that("pipeline file download works", { pf <- download_pipeline_files("test", directory="pfdw_test") expected <- fs::path(pf, "wgbs") diff --git a/tests/testthat/test-tumor-only.R b/tests/testthat/test-tumor-only.R new file mode 100644 index 0000000..4f95a4b --- /dev/null +++ b/tests/testthat/test-tumor-only.R @@ -0,0 +1,60 @@ +testthat::skip("Experimental.") + +test_that("CNA calls can be made without a germline sample", { + # CNA caller test config + config_c <- CamConfig( + outdir = "./result_to", + bsseq = "wgbs", + build = "hg38", + lib = "pe", + regions = regions, + n_cores = 30, + cna_caller = "ascat", + min_cov = 1 # Required to capture sufficient SNPs from test + ) + #withr::defer(fs::dir_delete(config_c$outdir)) + + # Setup germline versions: no data, SNP pos only, SNP pos with counts + germline <- normal + germline_to <- NULL + + germline_pos <- CamSample(id = "G", sex = "XY") + germline_pos_file <- system.file("testdata", "test.to.norm_pos.csv.gz", package = "CAMDAC") + attach_output(germline_pos, config_c, "snps", germline_pos_file) + + germline_count <- CamSample(id= "GP", sex= "XY") + germline_count_file <- system.file("testdata", "test.to.norm_pos_count.csv.gz", package = "CAMDAC") + attach_output(germline_count, config_c, "snps", germline_count_file) + + # Run allele counting + preprocess_wgbs(list( + tumor, germline, germline_to, germline_count, germline_pos + ), config_c) + + # Mock gc annotation + local_mocked_bindings( + select_heterozygous_snps = function(tsnps, ...){ + tsnps + } + ) + + # Set output names expectation + exp_names = c("chrom", "POS", "total_counts", "total_depth", "ref", "alt", "BAF", "BAFr", + "total_depth_n", "total_counts_n", "BAF_n", "BAFr_n", "LogR_n") + + # Tumor only + tsnps_to = cmain_bind_snps(tumor, 
germline_to, config_c) + fs::file_exists(tsnps_to) + fs::file_delete(c(tsnps_to)) + + # SNP only + tsnps_pos = cmain_bind_snps(tumor, germline_pos, config_c) + fs::file_exists(tsnps_pos) + fs::file_delete(c(tsnps_pos)) + + # SNP with counts + tsnps_count = cmain_bind_snps(tumor, germline_count, config_c) + fs::file_exists(tsnps_count) + fs::file_delete(c(tsnps_count)) + +}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/contributing.Rmd b/vignettes/contributing.Rmd new file mode 100644 index 0000000..f7a049f --- /dev/null +++ b/vignettes/contributing.Rmd @@ -0,0 +1,47 @@ +--- +title: "Contributing" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{contributing} + %\VignetteEngine{rmarkdown::render} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` + +To contribute to CAMDAC, fork [the repository](https://github.com/VanLoo-lab/CAMDAC) and install the development dependencies with `remotes::install_dev_deps('.')`. + +After making your changes, run the test and build commands listed below, then submit a [pull request](https://github.com/VanLoo-lab/CAMDAC/pulls) with the changes on your fork. + +## CAMDAC test and build commands + +```{r} +library(devtools) + +# Install dev dependencies +devtools::install_dev_deps("VanLoo-lab/CAMDAC") + +# Update docs +devtools::document() + +# Run tests +devtools::test() + +# Build readme +rmarkdown::render('README.Rmd', output_format='github_document', output_file='README.md') + +# Check package builds +devtools::check() + +# Build documentation +pkgdown::build_site(examples=FALSE, devel=TRUE, lazy=TRUE, preview=FALSE) +pkgdown::preview_site() # To view. 
Or: python3 -m http.server --directory docs 8000 + +# Commit changes on the docs/ folder before submitting +``` \ No newline at end of file diff --git a/vignettes/experimental.Rmd b/vignettes/experimental.Rmd new file mode 100644 index 0000000..2142e40 --- /dev/null +++ b/vignettes/experimental.Rmd @@ -0,0 +1,471 @@ +--- +title: "Experimental Features" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{experimental} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` + +This document describes experimental features of the CAMDAC package. These features are not yet fully tested and may change in future releases. The following features are currently under development for the **WGBS pipeline** only: + +* Deconvolution only +* Using external copy number solutions +* Copy number calling in tumor-only mode +* Allele-specific methylation analysis +* Normal DNA methylation panels +* DMR visualisation + +## Deconvolution only + +The CAMDAC equation can be used to infer pure tumour DNA methylation rates, provided the following information is available per CpG: + +* Bulk tumour methylation rate (CpG-wise) +* Tumour allele-specific copy number state (local region overlapping CpG) +* Tumour purity (single parameter per-sample) + +Here is an example for 5 CpGs from a single sample. Note: the normal copy number state is assumed diploid (2) in humans: +```{r} + +# Set parameters +bulk = c(0.3, 0.5, 0.2, 0.1, 0.9) +normal = c(0.3, 0.9, 0.1, 0.7, 0.5) +ploidy = c(2, 2, 1, 3, 4) +purity = 0.8 + +# Deconvolve methylation rates +pure_meth = CAMDAC:::calculate_mt(bulk, normal, purity, ploidy) + +# Set clean rates based on threshold +pure_meth_clean = dplyr::case_when( + pure_meth < 0 ~ 0, + pure_meth > 1 ~ 1, + TRUE ~ pure_meth +) + +``` + +After deconvolution, it may be useful to estimate the CpG coverage in the deconvolved tumour sample. 
Additionally, the highest density interval (HDI) of the methylation rate may be informative for quality control. These metrics can be calculated given additional information on bulk methylated and unmethylated read counts: + +```{r} + +# Optional: calculate effective coverage of the tumour +# # Requires coverage per CpG in the bulk sample +bulk_coverage = c(10, 20, 5, 15, 30) +pure_effective_coverage = CAMDAC:::calculate_mt_cov(bulk_coverage, purity, ploidy) + +# Optional: calculate the HDI of the pure tumour methylation rate +bulk_methylated_count = c(3, 10, 1, 2, 27) +bulk_unmethylated_count = c(7, 10, 4, 13, 3) +normal_methylated_count = c(3, 9, 1, 5, 2) +normal_unmethylated_count = c(7, 11, 3, 8, 3) + +# HDI function (fast) +CAMDAC:::hdi_norm_approx( + bulk_methylated_count, + bulk_unmethylated_count, + normal_methylated_count, + normal_unmethylated_count, + purity, + ploidy +) + +# HDI function (most accurate) +CAMDAC:::vec_HDIofMCMC_mt( + bulk_methylated_count, + bulk_unmethylated_count, + normal_methylated_count, + normal_unmethylated_count, + purity, + ploidy, + credMass=0.99 +) +``` + + +## Using external copy number solutions + +The germline sample is optional as, in the absence of patient-matched methylation data, you may already have an allele-specific CNA solution for your bulk tumor. For example, this could be derived from bulk WGS of the same sample. + +You can provide this data in a tab-delimited text file as shown below.
Importantly: + +- column names are optional +- purity and ploidy values are taken from the first data row alone +- chromosome names may be given with or without 'chr' prefix + +| chrom | start | end | major_cn | minor_cn | purity | ploidy | +| --------- | ---- | ---- | ---- | ---- | ---- | ---- | +| chr1 | 1 | 400 | 2 | 1 | 0.67 | 3.5 | +| chr1 | 401 | 1000 | 1 | 1 | 0.67 | 3.5 | + +To run CAMDAC with this CNA solution, attach the file to the tumor `CamSample()` object: + +```{r add_cna} +library(CAMDAC) + +# Load test data +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC") +b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC") +cna_file <- system.file("testdata", "test.cna.txt", package = "CAMDAC") + +# Set config +config <- CamConfig(outdir="./results", bsseq="wgbs", lib="pe", build="hg38", n_cores=10) + +# Create tumor object and attach CNA solution +tumor <- CamSample(id="T", sex="XY", bam=b_tumor) +attach_output(tumor, config, "cna", cna_file) + +# Define normal object(s) for deconvolution or differential methylation +normal <- CamSample(id="N", sex="XY", bam=b_normal) + +# Run pipeline with CNA solution +pipeline( + tumor=tumor, + germline=NULL, + infiltrates=normal, + origin=normal, + config=config +) +``` + +## Copy number calling in tumor-only mode + +If no SNP file is present for the germline, CAMDAC will infer the copy number calls from the tumor sample alone. Here, the BAF is calculated by a threshold on the tumor BAF, and the LogR is calculated by taking the coverage relative to the median. These results are not as accurate as using a germline normal sample. + +You may already know where heterozygous SNPs lie for your sample, obviating the need for a tumor BAF threshold. In addition, you may have a proxy of the normal coverage for your platform, which is an improvement over taking the tumor median. You can provide this information by attaching a SNPs file to the germline CamSample object.
The file should contain: + +| Field | Description | +| --- | --- | +| chrom | Chromosome name | +| POS | Position of SNP | +| BAF | (optional) B-allele frequency at this SNP | +| total_counts | (optional) Total number of reads at this SNP | + +POS and total_counts are used to derive the BAF and the LogR respectively. We strongly recommend that total_counts is derived from a normal sample sequenced with the same bisulfite-sequencing assay as the tumor, and unmatched patient samples are acceptable. + +CAMDAC may be run to the copy number calling stage using the external heterozygous SNP file: + +```{r} +library(CAMDAC) + +# Load test data +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC") +snps_file <- system.file("testdata", "test.to.norm_pos.csv.gz", package = "CAMDAC") + +# Set config +config <- CamConfig(outdir="./results", bsseq="wgbs", lib="pe", build="hg38", n_cores=10) + +# Create tumor object and attach CNA solution +tumor <- CamSample(id="T", sex="XY", bam=b_tumor) +attach_output(tumor, config, "cna", cna_file) + +# Define normal object(s) for deconvolution or differential methylation +germline <- CamSample(id="G", sex="XY") +attach_output(germline, config, "snps", snps_file) + +# Run pipeline with CNA solution +pipeline( + tumor=tumor, + germline=germline, + infiltrates=NULL, + origin=NULL, + config=config +) + +``` + +After this, we recommend inspecting the CNA results. If all is well, the pipeline() function can be repeated with the infiltrates and origin CamSamples to complete deconvolution and differential methylation respectively. + +## Allele-specific methylation (ASM) analysis + +CAMDAC can be used to detect allele-specific methylation (ASM) by phasing CpGs to heterozygous SNPs and deconvolving bulk methylation rates per allele. + +This tutorial steps through the ASM analysis pipeline (WGBS only): + +1. Count CpG methylation on tumor and normal at sites phased to SNP loci. +2. 
Deconvolve methylation on tumor **per haplotype** using the normal +3. Assign allele-specific copy number state **per CpG** using the bulk tumor solution +4. Call allele-specific differential methylation within samples +5. Call allele-specific differential methylation between samples + +Results from this pipeline are found in the results directory under 'PATIENT/AlleleSpecific' and 'PATIENT/Methylation'. See output file headings below for files and their content. + +### CAMDAC-ASM from BAM files + +The `asm_pipeline()` function runs CAMDAC-ASM analysis by generating the allele-specific copy number solution and heterozygous SNP loci, followed by deconvolution and differential ASM analysis: + +```{r} +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC") +b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC") +regions <- system.file("testdata", "test_wgbs_segments.bed", package = "CAMDAC") # speed up tests + +tumor <- CamSample(id = "T", sex = "XY", bam = b_tumor) +normal <- CamSample(id = "N", sex = "XY", bam = b_normal) +config <- CamConfig( + outdir = "./results", ref = "./pipeline_files", bsseq = "wgbs", lib = "pe", cores = 10, + min_cov = 1, # For test data + regions = regions +) + +asm_pipeline( + tumor = tumor, + germline = normal, + infiltrates = normal, + origin = normal, + config = config +) +``` + + +### CAMDAC-ASM from external inputs (in_development) + +To run the ASM pipeline without BAM files, CAMDAC requires: +- Each CamSample object has SNP loci +- The tumor CamSample object has an allele-specific CNA solution +- All CamSample objects have BAM files available for phasing + +CAMDAC-ASM requires a file of heterozygous SNP loci against which CpGs will be phased.
This is a tab-delimited file with a header containing four fields: + +| Field | Description | +| ---- | ------------ | +| chrom | Chromosome name | +| pos | SNP loci position | +| ref | The reference allele (A/C/T/G) | +| alt | The alternate SNP allele (A/C/T/G) | + +First, attach your SNP loci file to the tumor object with `attach_output()`, then run `asm_pipeline()`: +```{r} +# Setup CAMDAC samples +tumor <- CamSample(id = "tumor", sex = "XY", bam = b_tumor) +normal <- CamSample(id = "normal", sex = "XY", bam = b_normal) +config <- CamConfig( + outdir = "./results", ref = "./pipeline_files", bsseq = "wgbs", lib = "pe", cores = 10, + min_cov = 1, # For test data + regions = regions +) # For rapid testing + +# Add SNPs +asm_snps_file <- system.file("testdata", "test_het_snps.tsv", package = "CAMDAC") +attach_output(tumor, config, "asm_snps", asm_snps_file) +attach_output(normal, config, "asm_snps", asm_snps_file) +``` + +Next, CAMDAC requires the allele-specific copy number solution from the tumor, attached as follows: +```{r} +cna_file <- system.file("testdata", "test_cna.tsv", package = "CAMDAC") +attach_output(tumor, config, "cna", cna_file) +``` + +Finally, run the allele-specific methylation pipeline: +```{r} +asm_pipeline( + tumor = tumor, + infiltrates = normal, + origin = normal, + config = config +) +``` + +### CAMDAC-ASM using SNP calls from previous CAMDAC runs + +If you have already run the CAMDAC pipeline in tumor-normal mode, then the germline object's SNP files will be used by default.
The simplest run from BAM to ASM is shown below using matched normals for infiltrates and DMPs: + +```{r} +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC") +b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC") +regions <- system.file("testdata", "test_wgbs_segments.bed", package = "CAMDAC") # speed up tests + +tumor <- CamSample(id = "T", sex = "XY", bam = b_tumor) +normal <- CamSample(id = "N", sex = "XY", bam = b_normal) +config <- CamConfig( + outdir = "./test_results", bsseq = "wgbs", lib = "pe", + build = "hg38", n_cores = 10, + regions = regions, + min_cov = 1, # For test data + cna_caller = "ascat" # Battenberg always recommended, however ASCAT used here for rapid testing. +) + +# Run main CAMDAC generate SNP files for ASM +# Deconvolution skipped here for simplicity. +pipeline(tumor, germline = normal, infiltrates = NULL, origin = NULL, config) + +# Run ASM pipeline +asm_pipeline( + tumor = tumor, + germline = normal, + infiltrates = normal, + origin = normal, + config = config +) +``` + +### ASM output file headings + +** Allele-specific/ ** + +- *asm_counts.csv.gz - The number of reads supporting each allele at each CpG +- *asm_hap_stats.csv.gz - Summary statistics for each phased SNP +- *asm_phase_map.csv.gz - A mapping of CpG-SNP phased pairs per read +- *snps.txt - The heterozygous SNP loci input for ASM analysis +- *cna.csv - For the tumour, the allele-specific copy number profile. See format in `vignettes("pipeline")`. 
+ +**Methylation/** + +- *asm_meth.csv.gz - Allele-specific methylation rates for bulk samples +- *asm_ss_dmp.csv.gz - Single sample differential allele-specific methylation +- *asm_meth_cna.csv.gz - For the tumour, ASM rates with annotated copy number states +- *asm_meth_pure.csv.gz - For the tumour, pure methylation rates for each allele +- *asm_dmp.csv.gz - Differential allele-specific methylation between tumor and origin sample + +## Normal DNA methylation panels + +This feature is currently described for CAMDAC-WGBS only. + +### Create a methylation panel from multiple normal BAM files + +CAMDAC supports the use of multiple DNA methylation BAM files as a source of the normal infiltrates or normal cell of origin. + +To create a panel, process your BAM files with the CAMDAC allele counter: + +``` +library(CAMDAC) + +# Get BAM files +b_normal1 = system.file("inst/testdata/normal.bam") +b_normal2 = system.file("inst/testdata/normal.bam") +b_normal3 = system.file("inst/testdata/normal.bam") + +# Run allele counter +for(file in c(b_normal1, b_normal2, b_normal3)){ + prefix = fs::path_ext_remove(file) + outfile = paste0(prefix, ".all.SNPs.CG.csv.gz") + data = cmain_count_alleles(bam_file) + data.table::fwrite(data, outfile) +} +``` + +The allele counts files can then be merged into a single file for the panel containing methylation data for deconvolution: + +```{r} +panel_counts <- fs::dir_ls(".", glob="*.SNPs.CG.csv.gz") +panel <- panel_meth_from_counts(panel_counts) +data.table::fwrite(panel, "panel.m.csv.gz") +``` + +By default, panel counts are merged by summing the methylation read counts for each CpG site. You can customise the proportion of each sample that is used in the panel by specifying the `ac_props` argument in `panel_meth_from_counts()`. To get the mean across each CpG site, simply pass equal proportions for each sample. + +To run CAMDAC with your newly created panel, attach your panel to a CamSample object using the `meth` argument. 
+ +```{r} +# Load test data +b_tumor <- system.file("testdata", "tumor.bam", package = "CAMDAC") +b_normal <- system.file("testdata", "normal.bam", package = "CAMDAC") + +# Setup CAMDAC samples +tumor <- CamSample(id="tumor", sex="XY", bam=b_tumor) +normal <- CamSample(id="normal", sex="XY", bam=b_normal) +config <- CamConfig(outdir="./results", ref="./pipeline_files", bsseq="wgbs", lib="pe", cores=10) + +# Setup panel sample +panel <- CamSample(id="panel", sex="XY") +panel_file <- system.file("testdata", "test_panel.m.csv.gz", package = "CAMDAC") +attach_output(panel, config, "meth", panel_file) + +# Run CAMDAC with panel +pipeline( + tumor=tumor, + germline=normal, + infiltrates=panel, + origin=panel, + config=config +) +``` + +### Create a methylation panel from a matrix of beta values + +If you have not started from BAM files, you can create a panel using a matrix of beta values: + +| sample1 | sample2 | sample3 | +|---------|---------|---------| +| 0.5 | 0.6 | 0.7 | +| 0.4 | 0.5 | 0.6 | + +Additionally, a data frame specifying the positions of each CpG site in the beta value matrix is required. Here, start and end refer to the C and G of the CpG site respectively: + +| chrom | start | end | +|-------|-------|-----| +| chr1 | 100 | 101 | +| chr1 | 200 | 201 | + +The matrix and CpG locations can be passed directly to the `panel_meth_from_beta()` function, along with settings. 
+ +```{r} +# Load beta values and chromosome positions +ex <- system.file("testdata", "test_panel_from_beta.csv", package = "CAMDAC") +data <- data.table::fread(ex) +mat = data[, 4:ncol(data)] # Beta value matrix with 3 samples + +# Create panel from beta values +panel_beta <- panel_meth_from_beta( + mat = mat, + chrom = data$chrom, + start = data$start, + end = data$end, + cov = 100, + props = c(0.1, 0.8, 0.1), # Proportions of each sample in panel + min_samples = 1, + max_sd = 1 +) +``` + +As CAMDAC requires coverage at each CpG site to estimate uncertainty, the `cov` value is given to all CpG sites when building a panel from beta values. Additionally, if any beta values are missing from a sample, proportions are recalculated among the remaining samples as this is the only information available to build the panel for that site. + +There are two experimental arguments that can be set to filter CpG sites from the panel: + +* min_samples: The minimum number of samples that have to have a beta value for a CpG to be included in the panel. The idea here is if you have sparse data, you can skip sites where you aren’t confident in the panel. Set this to 1 to use any sample. + +* max_sd: Maximum standard deviation of beta values across samples a CpG must have to be included in the panel. The idea here is that when combining many bulk methylomes from the same tissue, sites with high variability reflect sample-specific differences and their averages are less reliable for use in a methylation panel. + +## DMR visualisation + +CAMDAC produces several output files that visualise the copy number state. DNA methylation rates can be passed to external packages for visualisation. 
For a quick view of DMRs in R: + +```{r} +library(data.table) +library(ggplot2) +library(CAMDAC) + +# Show DMPs around a region +dmr <- data.table(dmr) # Object from CAMDAC output *annotated_DMRs.fst +dmp <- data.table(dmp) # Object from CAMDAC *results_per_CpG.fst +chrome <- dmr[1, ]$chrom +starte <- dmr[1, ]$start +ende <- dmr[1, ]$end +offset <- 1000 # Offset 1kB either side of region +dmp <- data.table(dmp) +dm_regions <- dmp[chrom == as.character(chrome) & start >= (starte - offset) & end <= (ende + offset), ] + +# Using ggplot, generate a geom where the m_t values are +tplt <- ggplot(dm_regions, aes(x = start)) + + geom_point(aes(y = m_t), color = "skyblue") + + geom_point(aes(y = m_n), color = "grey") + + geom_vline(aes(xintercept = start, color = DMP_t)) + + theme_classic() + + scale_color_manual(values = c("skyblue", "blue")) + + scale_y_continuous(limits = c(0, 1)) + + geom_vline(xintercept = c(start, end), color = "red", linetype = "dashed") + + labs(x = dm_regions$chrom[[1]]) +tplt +``` + +![CAMDAC DMR Visualization](images/camdac_dmr_vis.png) + +Here, light blue dots are the pure tumour, while light-grey are the normal. The red dash is the DMR region and the vertical lines are hypomethylated DMPs (blue) and hypermethylated DMPs (light blue). 
\ No newline at end of file diff --git a/vignettes/images/CAMDAC_manual_DMR_summary_plots.png b/vignettes/images/CAMDAC_manual_DMR_summary_plots.png new file mode 100644 index 0000000..b359f72 Binary files /dev/null and b/vignettes/images/CAMDAC_manual_DMR_summary_plots.png differ diff --git a/vignettes/images/CAMDAC_manual_SNP_data.png b/vignettes/images/CAMDAC_manual_SNP_data.png new file mode 100644 index 0000000..22c7423 Binary files /dev/null and b/vignettes/images/CAMDAC_manual_SNP_data.png differ diff --git a/vignettes/images/CAMDAC_manual_SNP_output.png b/vignettes/images/CAMDAC_manual_SNP_output.png new file mode 100644 index 0000000..938f966 Binary files /dev/null and b/vignettes/images/CAMDAC_manual_SNP_output.png differ diff --git a/vignettes/images/CAMDAC_manual_fig1.png b/vignettes/images/CAMDAC_manual_fig1.png new file mode 100644 index 0000000..1f3e4e8 Binary files /dev/null and b/vignettes/images/CAMDAC_manual_fig1.png differ diff --git a/vignettes/images/CAMDAC_manual_fig2.png b/vignettes/images/CAMDAC_manual_fig2.png new file mode 100644 index 0000000..872b72f Binary files /dev/null and b/vignettes/images/CAMDAC_manual_fig2.png differ diff --git a/vignettes/images/CAMDAC_manual_formatted_allele_counts_output.png b/vignettes/images/CAMDAC_manual_formatted_allele_counts_output.png new file mode 100644 index 0000000..20104bd Binary files /dev/null and b/vignettes/images/CAMDAC_manual_formatted_allele_counts_output.png differ diff --git a/vignettes/images/CAMDAC_manual_fragment_length_histogram.png b/vignettes/images/CAMDAC_manual_fragment_length_histogram.png new file mode 100644 index 0000000..431c52b Binary files /dev/null and b/vignettes/images/CAMDAC_manual_fragment_length_histogram.png differ diff --git a/vignettes/images/CAMDAC_manual_normal_SNP_data.png b/vignettes/images/CAMDAC_manual_normal_SNP_data.png new file mode 100644 index 0000000..48926b4 Binary files /dev/null and b/vignettes/images/CAMDAC_manual_normal_SNP_data.png differ diff 
--git a/vignettes/images/CAMDAC_manual_normal_methylation_output.png b/vignettes/images/CAMDAC_manual_normal_methylation_output.png new file mode 100644 index 0000000..113023d Binary files /dev/null and b/vignettes/images/CAMDAC_manual_normal_methylation_output.png differ diff --git a/vignettes/images/CAMDAC_manual_normal_methylation_rate_summary.png b/vignettes/images/CAMDAC_manual_normal_methylation_rate_summary.png new file mode 100644 index 0000000..77c21e8 Binary files /dev/null and b/vignettes/images/CAMDAC_manual_normal_methylation_rate_summary.png differ diff --git a/vignettes/images/CAMDAC_manual_tumour_methylation_rate_summary_panelA.png b/vignettes/images/CAMDAC_manual_tumour_methylation_rate_summary_panelA.png new file mode 100644 index 0000000..51fe1c4 Binary files /dev/null and b/vignettes/images/CAMDAC_manual_tumour_methylation_rate_summary_panelA.png differ diff --git a/vignettes/images/CAMDAC_manual_tumour_versus_normal_methylomes.png b/vignettes/images/CAMDAC_manual_tumour_versus_normal_methylomes.png new file mode 100644 index 0000000..71ed4d9 Binary files /dev/null and b/vignettes/images/CAMDAC_manual_tumour_versus_normal_methylomes.png differ diff --git a/vignettes/images/camdac_dmr_vis.png b/vignettes/images/camdac_dmr_vis.png new file mode 100644 index 0000000..b64ad76 Binary files /dev/null and b/vignettes/images/camdac_dmr_vis.png differ diff --git a/vignettes/introduction.Rmd b/vignettes/introduction.Rmd new file mode 100644 index 0000000..2e16307 --- /dev/null +++ b/vignettes/introduction.Rmd @@ -0,0 +1,34 @@ +--- +title: "Introduction" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{introduction} + %\VignetteEngine{rmarkdown::render} + %\VignetteEncoding{UTF-8} +--- + +### Introduction + +Solid tumours typically contain both cancer and admixed normal contaminating cells, which confounds the analysis of bulk cancer methylomes from bisulfite sequencing. 
To address these issues we present CAMDAC, a tool for **C**opy-number **A**ware **M**ethylation **D**econvolution **A**nalysis of **C**ancer. + +In brief, we show that the bulk tumour methylation rate ($m_b$) can be expressed as a weighted sum of the methylation rates of the tumour cells and normal contaminants, accounting for tumour purity and copy number (**Figure 1**). We derive purity and copy number estimates directly from bulk tumour RRBS data, leveraging somatic copy number aberration calls from ASCAT or Battenberg. We use bulk tissue- and sex-matched normal samples as proxy for the normal tumour-infiltrating cells ($m_{n,i}$), and obtain $m_b$ from the bulk tumour data itself. This provides all the necessary information to extract the pure tumour methylation rate ($m_t$). + +
    + +![**Figure 1.** CAMDAC principles and key variables. Adapted from Larose Cadieux *et al.*, 2020.](images/CAMDAC_manual_fig1.png){width=85%} + +
    + +
    In [Larose Cadieux *et al.*, 2020](https://doi.org/10.1101/2020.11.03.366252), we obtained bulk tumour RRBS data from surgically resected lung cancers and patient-matched tumour-adjacent normal lung samples. Normal samples may be used for copy number profiling, as a proxy for the normal tumour-infiltrating cells ($m_{n,i}$), and as a proxy for the tumour cell of origin ($m_{n,o}$). Here, $m_{n,i}$ is needed for bulk tumour methylation rate deconvolution and $m_{n,o}$ is required for differential methylation analyses (**Figure 2**). In non-small cell lung cancer, we demonstrate that patient-matched tumour-adjacent normal is a suitable proxy for all normals, i.e. $m_{n,i} \approx m_{n,o}$ [(Larose Cadieux *et al.*, 2020)](https://doi.org/10.1101/2020.11.03.366252). + +
    + +![**Figure 2.** Key input and output data for CAMDAC](images/CAMDAC_manual_fig2.png){width=85%} + +
    + +
    If the patient-matched tumour-adjacent normal tissue is not available, a tissue- and sex-matched normal may provide a substitute for the tumour-infiltrating normal cells (**Figure 2**). If the tissue-matched normal is a poor representative of the cell of origin, a different proxy may be used for differential methylation analysis. + +The purified tumour methylation rates allow for accurate differential methylation analysis, both between tumour and normal cells and, in the case of multi-region sequencing, between different tumour samples. The deconvoluted methylation profiles accurately inform inter- and intra-tumour sample relationships and could enable the timing of copy number gains and (epi)mutations in tumour evolution. This is explained in more detail in [Larose Cadieux *et al.*, 2020](https://doi.org/10.1101/2020.11.03.366252). + +At the time of writing, CAMDAC is compatible with human *MspI*-digested single-end directional reduced representation bisulfite sequencing (RRBS) data and whole genome bisulfite sequencing (WGBS) data. The input must be in binary alignment map (BAM) format. Bases should be quality and adapter trimmed and PCR duplicates should be removed. BAM files may be aligned to the hg19, hg38, GRCH37 or GRCH38 reference human genome builds.

    \ No newline at end of file diff --git a/vignettes/output.Rmd b/vignettes/output.Rmd new file mode 100644 index 0000000..4f0ee5a --- /dev/null +++ b/vignettes/output.Rmd @@ -0,0 +1,130 @@ +--- +title: "Results" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{output} + %\VignetteEngine{rmarkdown::render} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval=FALSE +) +``` + +The CAMDAC pipeline returns a structured directory at the `outdir` from the `CamConfig()` object. The pipeline returns files unique to the RRBS and WGBS modules with the general structure: + +``` +└── + ├── Allelecounts + │ ├── + ├── Copynumber + │ ├── + └── Methylation + └── +``` + +The sections below describe each results file in more detail. Next, see `vignette("questions")` for frequently asked questions or `vignette("experimental")` for details on experimental CAMDAC features. + +## RRBS pipeline output + +``` +results/ +└── P + ├── Allelecounts + │ ├── N + │ │ └── P.N.SNPs.CpGs.all.sorted.RData + │ └── T + │ └── P.T.SNPs.CpGs.all.sorted.RData + ├── Copy_number + │ ├── N + │ │ ├── fragment_length_histogram.pdf + │ │ ├── msp1_fragments_RRBS.RData + │ │ ├── P_N_normal_SNP_data.pdf + │ │ ├── P.N.SNPs.RData + │ │ └── Rplots.pdf + │ └── T + │ ├── fragment_length_histogram.pdf + │ ├── msp1_fragments_RRBS.RData + │ ├── P_T_SNP_data.pdf + │ ├── P.T.ACF.and.ploidy.txt + │ ├── P.T.ascat.bc.RData + │ ├── P.T.ascat.frag.RData + │ ├── P.T.ascat.output.RData + │ ├── P.T.ASCATprofile.png + │ ├── P.T.ASPCF.png + │ ├── P.T.BAF.PCFed.txt + │ ├── P.T.germline.png + │ ├── P.T.LogR.PCFed.txt + │ ├── P.T.rawprofile.png + │ ├── P.T.SNPs.RData + │ ├── P.T.sunrise.png + │ ├── P.T.tumour.png + │ └── Rplots.pdf + └── Methylation + ├── N + │ ├── dt_normal_m.RData + │ └── P_N_methylation_rate_summary.pdf + └── T + ├── CAMDAC_DMPs.bed + ├── CAMDAC_purified_tumour.bed + ├── CAMDAC_results_per_CpG.RData + ├── 
P_T_DMP_stats.txt + ├── P_T_methylation_rate_summary.pdf + ├── purified_tumour.RData + └── tumour_versus_normal_methylomes.pdf +``` + +| File | Description | +|------|-------------| +| `P.T.SNPs.CpGs.all.sorted.RData` | Allele counts for a sample. Generated by processing BAM file | +| `P.T.ascat.output.RData` | ASCAT copy number results | +| `P.T.ASCATprofile.png` | ASCAT copy number profile | +| `dt_normal_m.RData` | Bulk normal DNA methylation data | +| `purified_tumour.RData` | CAMDAC-purified DNA methylation rates | +| `CAMDAC_results_per_CpG.fst` | CAMDAC deconvolution and differential methylation results | + +## WGBS pipeline output + +CAMDAC outputs are written in the directory given by `config$outdir` in the format `PATIENT/DATASET/SAMPLE/`: + +``` +└── P + ├── Allelecounts + │ ├── N + │ │ └── P.N.SNPs.CpGs.all.sorted.csv.gz + │ └── T + │ └── P.T.SNPs.CpGs.all.sorted.csv.gz + ├── Copynumber + │ ├── N + │ │ └── P.N.SNPs.csv.gz + │ └── T + │ ├── ascat + │ ├── battenberg + │ ├── P.T.cna.txt + │ ├── P.T.SNPs.csv.gz + │ └── P.T.tnSNP.csv.gz + └── Methylation + ├── N + │ └── P.N.m.csv.gz + └── T + ├── P.T.CAMDAC_annotated_DMRs.fst + ├── P.T.CAMDAC_results_per_CpG.fst + ├── P.T.m.csv.gz + └── P.T.pure.csv.gz +``` + +| File | Description | +|------|-------------| +| `P.T.SNPs.CpGs.all.sorted.csv.gz` | Allele counts for a sample. Generated by processing BAM file | +| `P.T.SNPs.csv.gz` | SNP counts for a sample. | +| `P.T.cna.txt` | CAMDAC CNA result | +| `P.T.m.csv.gz` | Bulk methylation data | +| `P.T.m.pure.csv.gz` | CAMDAC-deconvolved methylation data | +| `P1.T.CAMDAC_results_per_CpG.fst` | CAMDAC differentially methylated cytosines | +| `P1.T.CAMDAC_annotated_DMRs.fst` | CAMDAC differentially methylated regions | + +It is possible to manually override outputs for runs. See `vignette("questions")` for more details. 
diff --git a/vignettes/pipeline.Rmd b/vignettes/pipeline.Rmd new file mode 100644 index 0000000..f66b387 --- /dev/null +++ b/vignettes/pipeline.Rmd @@ -0,0 +1,50 @@ +--- +title: "CAMDAC pipeline" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{pipeline} + %\VignetteEngine{rmarkdown::render} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` + +The entry-point to CAMDAC is the `pipeline()` function which expects a `CamConfig()` object and four `CamSample()` objects representing: + +* **tumor** : The bulk tumor sample to deconvolve +* **germline** : The germline normal data for copy number calling +* **infiltrates** : A proxy for the normal infiltrating cells +* **origin** : A proxy for the normal cell from which the tumour originated + +The same normal sample may be passed repeatedly for the germline, infiltrates or origin, depending on your experimental design. See `?pipeline` for more details. + +```{r pipeline} +library(CAMDAC) + +# Path to BAM files +tumor_bam <- system.file("testdata", "tumor.bam", package = "CAMDAC") +normal_bam <- system.file("testdata", "normal.bam", package = "CAMDAC") + +# Select samples for basic tumor-normal analysis +tumor <- CamSample(id = "T", sex = "XY", bam = tumor_bam) +normal <- CamSample(id = "N", sex = "XY", bam = normal_bam) + +# Configure pipeline +config <- CamConfig( + outdir = "./results", bsseq = "rrbs", lib = "pe", + build = "hg38", refs = "./refs", n_cores = 1, cna_caller = 'ascat' +) + +# Run CAMDAC +CAMDAC::pipeline( + tumor, germline = normal, infiltrates = normal, origin = normal, config +) +``` + +Next, see `vignette("output")` for a detailed summary of CAMDAC results files. 
diff --git a/vignettes/questions.Rmd b/vignettes/questions.Rmd new file mode 100644 index 0000000..3416e1d --- /dev/null +++ b/vignettes/questions.Rmd @@ -0,0 +1,74 @@ +--- +title: "FAQs" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{questions} + %\VignetteEngine{rmarkdown::render} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval=FALSE +) +``` + +## General FAQ + +### What if I don't have a CNA profile or matched germline sample? (WGBS) + +Ideally, CAMDAC is run with a matched normal sample from which to derive heterozygous germline SNPs for copy number estimation. In the absence of matched normals, a panel of sex- and tissue-matched normal samples may be used by averaging DNA methylation rates from multiple patients. See `vignette("experimental")` for more information. + +### I want to run CAMDAC on something other than hg19 or hg38 (WGBS) + +Please raise an issue on GitHub to request files for a new reference genome. + +### Can I skip steps of the analysis? (WGBS) + +When calling `pipeline`, if you do not give a normal infiltrate or cell of origin, the pipeline skips deconvolution and differential methylation respectively. This may be useful to run a quick first-pass to find and refit copy number solutions. When CAMDAC has found a solution and is rerun with the same tumor, config, and normal, supplying the `infiltrates` and `origin` arguments will continue the pipeline where it left off. The entire pipeline can be re-run by deleting the output directory or setting `overwrite=TRUE` in the `CamConfig`. + +### How do I run individual steps of the CAMDAC pipeline? (WGBS) + +The simplest way is to call `pipeline` with `overwrite=FALSE` in your config, giving the right normal sample for your step. Additionally, your `CamConfig` must use the same output directory. 
+ +If, for any reason, you have changed the output directory structure from a previous run, you can initiate CAMDAC by manually passing outputs to `CamSample` objects. See the vignette `vignette("output")` for more information. + +Finally, you can run the `cmain_*` functions used by `pipeline()` directly. For example, to run the deconvolution step, you can call `cmain_deconvolve_methylation()`. + +### My CNA solution wasn't right. How can I refit with different purity and ploidy values? (WGBS) + +If you want to use an external purity and ploidy solution, simply pass a CNA file that has only the purity and ploidy fields. Additionally, set `refit=TRUE` in the CamConfig and CAMDAC will use this to refit the sample. + +### Can I limit my analysis to specific regions of interest? + +To analyse specific genomic regions, you may pass a BED file to CAMDAC config: + +```{r} +CamConfig(outdir=".", ref="./pipeline_files", regions="regions.bed") +``` + +CAMDAC will merge any overlapping regions prior to analysis. + +### How can I manually replace pipeline outputs? (WGBS) + +If you have outputs from a previous run, you can manually assign them to a CAMDAC object. This overwrites the expected path for that output type, allowing the pipeline to run with this data instead of computing it. 
Use the `attach_output` function, passing one of the following output types: + +* `counts`: CAMDAC allele counts `*.SNPs.CpGs.all.sorted.csv.gz` file +* `snps`: CAMDAC sample SNP counts `*.SNPs.csv.gz` file +* `meth`: CAMDAC bulk methylation `*.m.csv.gz` file +* `cna`: CAMDAC CNA `*.cna.txt` file +* `pure`: CAMDAC deconvolved methylation `*.m.pure.csv.gz` file + +For example, to attach a previous counts file to a CAMDAC object: + +```{r} +library(CAMDAC) +tumor <- CamSample(id = "T", sex = "XY", bam = NULL) +config <- CamConfig(outdir = tempdir(), build="hg38", bsseq="wgbs", lib="pe") +counts_file <- system.file("testdata", "test.SNPs.CpGs.all.sorted.csv.gz", package = "CAMDAC") +tumor <- attach_output(tumor, config, "counts", counts_file) +``` + +The CAMDAC pipeline can now access the file in the expected location at `config$outdir`. diff --git a/vignettes/setup.Rmd b/vignettes/setup.Rmd new file mode 100644 index 0000000..4ae8ad8 --- /dev/null +++ b/vignettes/setup.Rmd @@ -0,0 +1,60 @@ +--- +title: "Installation" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{setup} + %\VignetteEngine{rmarkdown::render} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` + +## Install CAMDAC + +From the R console, install CAMDAC from GitHub: + +```{r} +install.packages("remotes") +remotes::install_github("VanLoo-lab/CAMDAC") +``` + +## Download pipeline reference files + +CAMDAC requires custom annotation files for RRBS and WGBS analysis, available [at the Zenodo repository: (10565423)](https://zenodo.org/records/10565423). An R convenience function is provided to download these files: + +```{r} +CAMDAC::download_pipeline_files(bsseq = "rrbs", directory = "./refs") +CAMDAC::download_pipeline_files(bsseq = "wgbs", directory = "./refs") +``` + +Now, you're ready to run CAMDAC! Next, see `vignette("pipeline")`. 
+ +### Reference file search priority + +CAMDAC searches for pipeline files in the following order: + +1. A directory passed when creating the config object (see `CamConfig()`) +1. The location defined by the environment variable CAMDAC_PIPELINE_FILES. +1. The current working directory + +We recommend that you set the environment variable `CAMDAC_PIPELINE_FILES` to the directory where you downloaded the files. This will allow CAMDAC to find the files automatically whenever you load R. + +From a Unix terminal: + +> echo "CAMDAC_PIPELINE_FILES=$(realpath R)" >> ~/.Renviron + +## External dependencies + +**CAMDAC-RRBS** + +* None + +**CAMDAC WGBS** + +* `java`: To run CAMDAC on WGBS data, we leverage Battenberg which requires the `java` command-line utility. Download Java from [https://openjdk.org/](https://openjdk.org/). \ No newline at end of file diff --git a/vignettes/technical.Rmd b/vignettes/technical.Rmd new file mode 100644 index 0000000..447c6b6 --- /dev/null +++ b/vignettes/technical.Rmd @@ -0,0 +1,199 @@ +--- +title: "Technical Note" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{technical} + %\VignetteEngine{rmarkdown::render} + %\VignetteEncoding{UTF-8} +--- + +
    In this section, we provide a high-level summary of the CAMDAC pipeline, which covers five key steps: + +1. **Allele Counting**: Obtain allele counts at SNP and CpG loci. +2. **Copy-number calling**: Obtain allele-specific copy number profiles, tumour purity and SNP plot data. +3. **Methylation Processing**: Filter, format and plot methylation data. +4. **Deconvolution**: Deconvolve the pure tumour methylation rates from bulk tumour RRBS data. +5. **Differential methylation**: Perform differential tumour-normal methylation analysis. + +For a full outline and validation of CAMDAC, please see Larose Cadieux _et al_. (2020) _bioRxiv_. + +## Allele counting + +Take a hypothetical female patient with primary tumour sample ID "**T1**" and normal-adjacent sample ID "**N1**". First, CAMDAC takes the sequencing alignment files from each sample using the `CamSample()` function. Users should provide the full path and file name of the RRBS or WGBS binary alignment map (.bam) file for each input sample, and use the `CamConfig()` object to indicate whether they are aligned to hg19, hg38, GRCH37 or GRCH38. Bases should be quality and adapter trimmed and PCR duplicates should be removed. Please ensure that the bam file is sorted and indexed. + +CAMDAC employs an allele counter module to count SNP and CpG (methylation) alleles for downstream analysis. SNP counts are performed at 1000 genome SNP positions, and CpG alleles are counted using dinucleotides. To speed up the computation, we leverage reference RRBS and WGBS genome files listing all genomic regions supported by the respective platforms. + +By default, the read mapping quality filter is set to **mq**>=0 in `CamConfig()`. Mapping quality scores from bisulfite sequencing aligners may be biased against the alternate allele for reads with polymorphisms. Please review the mapping quality distribution of your data to determine if it is appropriate to increase this setting. 
+ + +If the function is successful, a single file is output with the suffix "SNPs.CpGs". This file carries compiled SNP and methylation information with the following columns: + +
    + +![**Figure.** Formatted SNP and methylation information](images/CAMDAC_manual_formatted_allele_counts_output.png) + +
    + +Each row is either a CG locus (and CCGG for RRBS) and/or a 1000g SNP position. These can be distinguished by the *width* column. While polymorphic CG/CCGG have the same width as their non-polymorphic counterpart, they are easily identified by looking at the *POS*, *ref*, *alt* and other SNP-informative columns. + +For each SNP locus, 1000 Genomes genomic coordinate and reference and alternate alleles are listed under *POS*, *ref* and *alt* columns. The *total_counts* is the sum of *alt_counts* and *ref_counts*, which includes all informative strand-specific allele counts. For example, at $C>T$ SNPs, only the reverse strand allows to distinguish between the (un)methylated reference and the alternate allele and thus all forward read counts would be excluded from the *total_counts* column, but included in the *total_depth*. The SNP *type* column is only added to the patient-matched normal, which is used to assign SNP genotypes as either Homozygous or Heterozygous based on internal B-allele frequency (BAF) cut-offs. + +*M*, *UM*, *total_counts_m*, and *m* are the counts methylated, counts unmethylated, the total counts (un)methylated and the methylation rate, respectively. Methylation rates are calculated per CG allele, meaning that at polymorphic CpGs, only the CG-forming allele counts are considered. CAMDAC methylation rates are therefore polymorphism-independent. + +For CCGG loci found in RRBS, the *CCGG* column indicates the number of fragments with a 5' end at this CCGG locus. This number may be 0 at polymorphic CCGG loci homozygous for the CCGG-destroying allele. Furthermore, for RRBS, *MspI* fragment boundaries are determined from the aligned reads and the *MspI* fragment size distribution is visualised for quality assessment in the file *fragment_length_histogram.pdf*. You should observe 3 distinct peaks in the fragment length distribution. 
This is characteristic of human RRBS libraries and originates from *MspI* containing micro-satellite repeats of distinct lengths. The *MspI* fragment boundaries and their GC content are saved as an .RData object and used downstream in RRBS copy number profiling. + +
    + +![**Figure.** *MspI* fragment size distribution](images/CAMDAC_manual_fragment_length_histogram.png){width=50%} + +
    +
    + +## Copy number calling + +B-allele frequencies at heterozygous SNPs are leveraged to calculate pure tumour copy number aberrations using either ASCAT.m for RRBS or Battenberg.m for WGBS. These tools are inspired by ASCAT ([Van Loo *et al.*, 2010](https://doi.org/10.1073/pnas.1009843107)) and Battenberg ([Nik-Zainal *et al.*, 2012](https://www.cell.com/cell/fulltext/S0092-8674(12)00527-2)). If successful, CAMDAC writes copy number output to the "Copy_number" directory. + +A SNPs file lists the heterozygous SNPs selected for copy number analysis, resulting in a table where each row is a 1000g SNP position in the `germline` sample that meets the minimum coverage set by the **min_normal** argument. The *total_counts* column is the total informative read counts. For example, at C$>$T SNPs, only the reverse strand allows to distinguish between the unmethylated reference and the alternate allele and thus, forward read counts would not contribute to the *total_counts* and the *BAF* (B-allele frequency calculation). *rBAF* is randomly assigned *BAF* or 1-*BAF* to remove biases against the alternate allele in downstream tumour copy number profiling. All read counts however contribute to the *total_depth* which is used for LogR calculation, a measure of total coverage. Genotyping is performed and assignments stored under *type*. + +For the RRBS pipeline, we provide an experimental feature to visualise the magnitude of biases against alternate (B) alleles. The number of homozygous to heterozygous SNPs is depicted and any biases in coverage against the latter can be evaluated. Because RRBS is biased towards CpG-rich genomic regions, a typical RRBS sample should show a high ratio of C$>$T SNPs. We note that C$>$T and A$>$G germline heterozygous SNPs will have roughly half the coverage of the other 4 types of SNPs. + +
    + +![**Figure** Normal SNP data QC](images/CAMDAC_manual_normal_SNP_data.png){width=65%} + +
    +
    + +In addition to the above-mentioned columns, we also adjust for biases in the tumour LogR. The LogR is a normalised measure of tumour coverage used by ASCAT.m and Battenberg.m for copy number profiling together with the BAF. The covariates used for LogR correction are: + +- *GC_content*: The GC content of fragments leads to sequencing biases, namely at the PCR amplification step. +- *replic*: The local genomic replication timing affects the number of copies present at a given locus in cells undergoing S phase. +- *msp1_length*: RRBS only. The *MspI* fragment length is highly variable and we observe sequencing biases against fragments at the extremes of the fragment size distribution. + +Next, the standard ASCAT or Battenberg outputs are generated. All files have the dot-separated patient and sample IDs as prefix. In addition, we plot the BAF and LogR. In the BAF profiles, heterozygous SNPs are highlighted in red. The BAF and LogR tracks are then segmented by the respective tools. The segmentation is then analysed to determine the optimal tumour purity and ploidy solution via a grid search (see sunrise plot). Raw and rounded allele-specific copy number segments are provided as output png images. + +Finally, the purity, ploidy, number of heterozygous and homozygous 1000g SNP positions and median tumour and normal SNP depth are saved for each tumour sample. For RRBS, summary SNP data is plotted and saved as a pdf with filename "*_SNP_data.pdf*" and may help you troubleshoot your data. + +
    + +![**Figure.** Tumour SNP data summary](images/CAMDAC_manual_SNP_data.png){width=90%} + +
    +
    + +## Methylation processing + +As part of the allele counting step, CAMDAC calculates bulk DNA methylation rates for each input sample. For the patient- and tissue-matched normal sample "N1", the methylation data columns have the suffix $x = n$, since $m_{n,i} \sim m_{n,o}$. Where $m_{n,i} \neq m_{n,o}$, the suffix is set to $x = n\_i$ for the normal infiltrates and $x = n\_o$ for the normal cell of origin proxy sample. The uncertainty on $m_{x}$ is computed as the lower and upper boundaries of the 99% Highest Density Interval (HDI), which are stored under columns $m_{x,low}$ and $m_{x,high}$. + +- *CHR*: Chromosome name with 'chr' prefix +- *start*: First base of CG/CCGG +- *end*: Last base of CG/CCGG +- *M_x*: Counts methylated +- *UM_x*: Counts unmethylated +- *m_x*: Methylation rate +- *m_x_low*: Lower boundary of the 99% HDI for $m_{x}$ +- *m_x_high*: Upper boundary of the 99% HDI for $m_{x}$ + +
    + +![**Figure.** Normal methylation output.](images/CAMDAC_manual_normal_methylation_output.png){width=50%} + +
    + +In the normal sample methylation output directory, you will find a pdf with methylation data summary and QC (RRBS only). We expect DNA methylation rates to sit near 0 and 1. CAMDAC calculates DNA methylation rates in a polymorphism-independent manner, meaning that the CG-destroying allele at a heterozygous CpG does not contribute to its methylation rate. The minimum coverage threshold applied to CpG sites is based on the CpG allele read depth, so any heterozygous SNPs present at the CG location may be removed due to insufficient coverage. + +
    + +![**Figure.** Normal methylation rate QC.](images/CAMDAC_manual_normal_methylation_rate_summary.png){width=90%} + +
    +
    + +## Deconvolution + +At this stage, CAMDAC has obtained methylation rates for both the normal infiltrates and bulk tumour, as well as tumour copy number and purity estimates. The DNA methylation profile of the normal-adjacent samples may be used as a proxy for the methylation rate of tumour-infiltrating normal cells ($m_{n,i}$). We have all the necessary information to obtain CAMDAC pure tumour methylation rates, $m_t$. + +In the Methylation/ output directory, CpG copy number and purified tumour methylation data are written to output CSV files. Header fields include: + +- *nA*: Major allele copy number +- *nB*: Minor allele copy number +- *CN*: Total allele copy number +- *seg_start*: Copy number segment start point +- *seg_end*: Copy number segment end point +- *CG_CN*: CpG allele total copy number +(this differs from *CN* at polymorphic CpGs) +- *m_t_raw*: Raw CAMDAC purified tumour methylation rate +- *m_t_corr*: Corrected CAMDAC purified tumour methylation rate +- *cov_t*: CAMDAC purified tumour effective read coverage +- *m_t_low*: CAMDAC purified tumour 99% HDI lower boundary +- *m_t_high*: CAMDAC purified tumour 99% HDI upper boundary + +CAMDAC-deconvoluted methylation rates can take any value between 0 and 1 while the range of bulk tumour methylation rates is driven by tumour DNA content. In the bulk tumour profiles, bi-allelic tumour-normal differentially methylated positions appear at intermediate methylation values while after purification, they form a peak near 0 or 1 for hypo- and hypermethylated positions, respectively.

    + +
    + +![**Figure.** Tumour *versus* normal methylation rates before and after CAMDAC.](images/CAMDAC_manual_tumour_versus_normal_methylomes.png){width=60%} + +
    +
    + + +## Differential methylation + +For tumour-normal differential methylation analysis, CAMDAC expects a DNA methylation profile representing the tumour cell of origin ($m_{n,o}$). In this hypothetical example, we set the normal sample **N1** as the cell of origin. Leveraging CAMDAC purified methylomes, we then obtain differentially methylated positions and regions. + +Differential DNA methylation is detected with a minimum tumour-normal methylation rate difference (effect size, where $\delta\beta$ >= 0.2) and a probability threshold, representing the probability that the tumour and normal beta distributions do not overlap. Both variables are used for calling differentially methylated positions (DMPs). + +Next, CAMDAC builds on DMP calls to call DMRs. To identify differentially methylated regions (DMRs), we group CpGs into bins and look for clusters with at least 5 DMPs (**min_DMP_counts_in_DMR**=5), 4 of which must be consecutive (**min_consec_DMP_in_DMR**=4). After completion, this function generates a pure tumour methylation file (`*CAMDAC_results_per_CpG.RData` for RRBS or `*pure.csv.gz` for WGBS) in the CAMDAC methylation output directory. This R object is a combination of all CAMDAC results per CpG with DMP information included: + +- *cluster_id*: RRBS CpG cluster +- *chrom*: Chromosome name (i.e. 
1, 2, ..., X) +- *start*: First base of CG/CCGG +- *end*: Last base of CG/CCGG +- *m_n*: normal methylation rate +- *m_n_low*: normal methylation rate HDI99 lower boundary +- *m_n_high*: normal methylation rate HDI99 upper boundary +- *m_t*: CAMDAC pure tumour methylation rate +- *m_t_low*: CAMDAC pure tumour methylation rate HDI99 lower boundary +- *m_t_high*: CAMDAC pure tumour methylation rate HDI99 upper boundary +- *prob*: Tumour-normal DMP probability +- *CG_CN*: CpG allele total copy number +- *nA*: Major allele copy number +- *nB*: Minor allele copy number +- *segment*: Copy number segment endpoints +- *DMR_type*: "hyper", "hypo" or "mixed" +- *CpG_counts*: Number of CpGs in a given bin +- *DMP_counts*: Number of DMPs +- *consec_DMPs*: Number of consecutive DMPs +- *DMR*: "DMR" if differentially methylated, NA otherwise. +- *m_diff_tn*: CAMDAC-purified tumour $-$ normal methylation rates +- *prob_DMP*: DMP probability +- *DMP_t*: DMP calls based on CAMDAC-purified tumour *versus* normal methylation rates + + +The ratio of hyper- to hypomethylated DMRs varies across genomic regions and is reflected by the tumour-normal methylation rate difference. + +
    + +![**Figure.** DMR summary data.](images/CAMDAC_manual_DMR_summary_plots.png){width=80%} + +
    +
    + +### Leveraging CAMDAC outputs + +CAMDAC outputs will be stored at the user-defined project `outdir` variable given to the configuration (`CamConfig()`). A patient folder is created at this path with directory name set to **patient_id**. This will contain 3 subdirectories: Allelecounts, Copy_number and Methylation, with further sub-directories created for each of a given patient's samples. + +With CAMDAC differential methylation calls in hand, users may choose to look for recurrently aberrated loci across their cohort. Note that tumour-tumour DMPs can be easily identified by looking for overlap between the 99% HDIs for CAMDAC pure tumour methylation rates between samples (99% HDI $\subseteq$ [**m_t_low**,**m_t_high**]). + +Clustering analyses can also easily be performed by the user using well-established R packages such as 'pvclust' for hierarchical clustering with bootstrap and 'umap' (uniform manifold approximation and projection) for non-linear dimensionality reduction. Clustering of pure tumour methylation rates at promoter DMRs across large cohorts by 'umap' may reveal histology and/or sex-driven clusters as described in non-small cell lung cancer [Larose Cadieux *et al.*, 2020](https://doi.org/10.1101/2020.11.03.366252)^1^. + +For multi-region data, sample tree reconstruction by neighbour joining leveraging CAMDAC pure tumour methylation rates at hypermethylated DMPs in at least one sample, subset to loci confidently unmethylated in the normal cell of origin (**m_n_high**<0.2), can reveal inter-sample relationships, as demonstrated in non-small cell lung cancer [Larose Cadieux *et al.*, 2020](https://doi.org/10.1101/2020.11.03.366252)^1^. + +When running gene-set enrichment analysis (*GSEA*) on CAMDAC DMR calls, gene sets should be limited to those genes with promoters covered by RRBS. It may be desirable to subset DMR calls to hypermethylated promoter-associated CpG Islands given that methylation at these loci is most correlated with expression. 
+ +Users may leverage normal, deconvoluted tumour methylation rates and tumour-normal DMP calls to separate clonal mono- and bi-allelic from subclonal bi-allelic methylation changes to shed light on tumour evolutionary histories [Larose Cadieux *et al.*, 2020](https://doi.org/10.1101/2020.11.03.366252)^1^. The allele-specific CAMDAC module will be made available in future releases. + +### References + +Larose Cadieux *et al.* (2020). Copy number-aware deconvolution of tumor-normal DNA methylation profiles, bioRxiv 2020.11.03.366252 \ No newline at end of file