Skip to content

Commit a75980f

Browse files
authored
Merge pull request #298 from broadinstitute/development
Release 1.25.0
2 parents 9db8dfb + 74a1d97 commit a75980f

18 files changed

+1997
-131
lines changed

.circleci/config.yml

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ orbs:
1010
jobs:
1111
build:
1212
docker:
13-
- image: circleci/python:3.7.5-stretch
13+
- image: cimg/python:3.10.9
1414
resource_class: large
1515

1616
working_directory: ~/scp-ingest-pipeline
@@ -21,34 +21,29 @@ jobs:
2121
# Download and cache dependencies
2222
- restore_cache:
2323
keys:
24-
- v3-dependencies-{{ checksum "requirements.txt" }}
24+
- v6-dependencies-{{ checksum "requirements.txt" }}
2525
# fallback to using the latest cache if no exact match is found
26-
- v3-dependencies-
27-
28-
- run:
29-
name: Install system dependencies for Genomes Pipeline
30-
command: |
31-
sudo apt-get install -y tabix
26+
- v6-dependencies-
3227

3328
- run:
3429
name: Install Python dependencies
3530
command: |
36-
python3 -m venv venv
31+
python -m venv venv
3732
. venv/bin/activate
3833
pip install --upgrade pip
3934
pip install -r requirements.txt
4035
4136
- save_cache:
4237
paths:
4338
- ./venv
44-
key: v3-dependencies-{{ checksum "requirements.txt" }}
39+
key: v6-dependencies-{{ checksum "requirements.txt" }}
4540

4641
- run:
4742
name: Run tests
4843
command: |
4944
. venv/bin/activate
5045
cd tests
51-
pytest -k 'not test_genomes and not test_make_toy and not test_delocalize_file' --cov-report=xml --cov=../ingest/
46+
pytest -k 'not test_genomes and not test_make_toy and not test_delocalize_file' -p no:warnings --cov-report=xml --cov=../ingest/
5247
5348
- codecov/upload:
5449
file: tests/coverage.xml

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
tests/issues.json
22

3+
# Ignore secrets
4+
config/
5+
36
# Ignore Python runtime assets
47
env/
58
__pycache__/

Dockerfile

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,24 @@
1010
# Use a managed base image from Google. It is continually updated for
1111
# security fixes (thus the "latest" tag).
1212
# https://github.com/GoogleContainerTools/base-images-docker/tree/master/ubuntu
13-
FROM marketplace.gcr.io/google/ubuntu1804:latest
13+
FROM marketplace.gcr.io/google/ubuntu2004:latest
1414

15-
# RUN echo "Uncomment to clear cached layers below this statement (2020-01-07-0947)"
15+
# RUN echo "Uncomment to clear cached layers below this statement (2022-03-14-1441)"
1616

17-
# Install Python 3.7
17+
# Install Python 3.10
1818
RUN apt-get -y update && \
1919
apt-get -y install software-properties-common && \
2020
add-apt-repository ppa:deadsnakes/ppa && \
2121
apt-get -y install python3-pip && \
22-
apt-get -y install python3.7 && \
23-
apt-get -y install python3.7-dev
22+
apt-get -y install python3.10 && \
23+
apt-get -y install python3.10-dev && \
24+
apt-get -y install python3.10-distutils
2425

25-
RUN python3.7 -m pip install --upgrade pip
26+
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
2627

27-
# Set cleaner defaults (`alias` fails)
28-
RUN ln -s /usr/bin/python3.7 /usr/bin/python && \
29-
ln -s /usr/bin/pip3 /usr/bin/pip
28+
# # Set cleaner defaults (`alias` fails)
29+
# RUN ln -s /usr/bin/python3.10 /usr/bin/python && \
30+
# ln -s /usr/bin/pip3 /usr/bin/pip
3031

3132
# Copy contents of this repo into the Docker image
3233
# (See .Dockerignore for omitted files)
@@ -35,7 +36,7 @@ COPY . scp-ingest-pipeline
3536
WORKDIR /scp-ingest-pipeline
3637

3738
# Install Python dependencies
38-
RUN python3.7 -m pip install -r requirements.txt
39+
RUN python3.10 -m pip install -r requirements.txt
3940

4041
WORKDIR /scp-ingest-pipeline/ingest
4142
CMD ["python", "ingest_pipeline.py", "--help"]

README.md

Lines changed: 59 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,16 @@ The SCP Ingest Pipeline is an ETL pipeline for single-cell RNA-seq data.
99

1010
# Prerequisites
1111

12-
- Python 3.7+
12+
- Python 3.10
1313
- Google Cloud Platform project
1414
- Suitable service account (SA) and MongoDB VM in GCP. SA needs roles "Editor", "Genomics Pipelines Runner", and "Storage Object Admin". Broad Institute engineers: see instructions [here](https://github.com/broadinstitute/single_cell_portal_configs/tree/master/terraform-mongodb).
1515
- SAMtools, if using `ingest/make_toy_data.py`
1616
- Tabix, if using `ingest/genomes/genomes_pipeline.py`
1717

1818
# Install
1919

20+
### Native
21+
2022
Fetch the code, boot your virtualenv, install dependencies:
2123

2224
```
@@ -25,68 +27,41 @@ cd scp-ingest-pipeline
2527
python3 -m venv env --copies
2628
source env/bin/activate
2729
pip install -r requirements.txt
30+
scripts/setup-mongo-dev.sh <PATH_TO_YOUR_VAULT_TOKEN> # E.g. ~/.github-token
2831
```
2932

30-
To use `ingest/make_toy_data.py`:
33+
### Docker
3134

32-
```
33-
brew install samtools
34-
```
35-
36-
To use `ingest/genomes/genomes_pipeline.py`:
35+
With Docker running and Vault active on your local machine, run:
3736

3837
```
39-
brew install tabix
38+
scripts/docker-compose-setup.sh -t <PATH_TO_YOUR_VAULT_TOKEN> # E.g. ~/.github-token
4039
```
4140

42-
Now get secrets from Vault to set environment variables needed to write to the database:
43-
(see also scripts/setup_mongo_dev.sh)
41+
If on Apple silicon Mac (e.g. M1), and performance seems poor, consider generating a docker image using the arm64 base. Example test image: gcr.io/broad-singlecellportal-staging/single-cell-portal:development-2.2.0-arm64, usage:
4442

4543
```
46-
export BROAD_USER="<username in your email address>"
47-
48-
export DATABASE_NAME="single_cell_portal_development"
49-
50-
vault login -method=github token=`~/bin/git-vault-token`
51-
52-
# Get username and password
53-
vault read secret/kdux/scp/development/$BROAD_USER/mongo/user
54-
55-
export MONGODB_USERNAME="<username from Vault>"
56-
export MONGODB_PASSWORD="<password from Vault>"
57-
58-
# Get external IP address for host
59-
vault read secret/kdux/scp/development/$BROAD_USER/mongo/hostname
60-
61-
export DATABASE_HOST="<ip from Vault (omit brackets)>"
44+
scripts/docker-compose-setup.sh -i development-2.2.0-arm64 -t <PATH_TO_YOUR_VAULT_TOKEN>
6245
```
6346

64-
For testing/development using ingest_pipeline.py on the command line, annotation file input validation and MongoDB writes can be bypassed if you set:
65-
(Note: this bypass does not currently apply to expression matrix ingest).
47+
To update dependencies when in Docker, you can pip install from within the Docker Bash shell after adjusting your requirements.txt.
48+
If you close your shell after that, your newly installed dependencies will be lost. Dependencies only persist after merging your
49+
new requirements.txt into `development`. TODO (SCP-4941): Add entry-point script to run `pip install`.
6650

67-
```
68-
export BYPASS_MONGO_WRITES='yes'
69-
```
51+
### Optional
7052

71-
If you are developing updates for Mixpanel logging, set the Bard host URL:
53+
To use `ingest/make_toy_data.py`:
7254

7355
```
74-
75-
export BARD_HOST_URL="https://terra-bard-dev.appspot.com"
56+
brew install samtools
7657
```
7758

78-
Be sure to `unset BARD_HOST_URL` when your updates are done, so development ingest events are not always sent to Mixpanel.
79-
80-
If you are developing updates for Sentry logging, then set the DSN:
59+
To use `ingest/genomes/genomes_pipeline.py`:
8160

8261
```
83-
vault read secret/kdux/scp/production/scp_config.json | grep SENTRY
84-
85-
export SENTRY_DSN="<Sentry DSN value from Vault>"
62+
brew install tabix
8663
```
8764

88-
Be sure to `unset SENTRY_DSN` when your updates are done, so development logs are not always sent to Sentry.
89-
9065
## Git hooks
9166

9267
After installing Ingest Pipeline, add Git hooks to help ensure code quality:
@@ -106,7 +81,7 @@ In rare cases, you might need to skip Git hooks, like so:
10681

10782
# Test
10883

109-
After [installing](#Install):
84+
After [installing](#install):
11085

11186
```
11287
source env/bin/activate
@@ -128,49 +103,83 @@ pytest test_ingest.py
128103
# Run all tests, show code coverage metrics
129104
pytest --cov=../ingest/
130105
```
106+
131107
For more, see <https://docs.pytest.org/en/stable/usage.html>.
132108

133-
## Testing in Docker
134-
If you have difficulties installing and configuring `scp-ingest-pipeline` due to hardware issues (e.g. Mac M1 chips),
135-
you can alternatively test locally by building the Docker image and then running any commands inside the container.
109+
## Testing in Docker locally
110+
<!--
111+
Step 1 is also useful for troubleshooting when Dockerfile updates fail to build
112+
-->
113+
If you have difficulties installing and configuring `scp-ingest-pipeline` due to hardware issues (e.g. Mac M1 chips),
114+
you can alternatively test locally by building the Docker image and then running any commands inside the container.
136115
There are some extra steps required, but this sidesteps the need to install packages locally.
137116

138117
### 1. Build the image
139-
Run the following command to build the testing Docker image locally (make sure Docker is running first):
118+
119+
Run the following command to build the testing Docker image locally (make sure Docker is running first). This build command will incorporate any changes in the local instance of your repo, committed or not:
120+
140121
```
141122
docker build -t gcr.io/broad-singlecellportal-staging/ingest-pipeline:test-candidate .
142123
```
124+
125+
Note - if this is your first time doing `docker build` you may need to configure Docker to use the Google Cloud CLI to authenticate requests to Container Registry:
126+
127+
```
128+
gcloud auth configure-docker
129+
```
130+
131+
Pro-Tip: For local builds, you can try adding docker build options `--progress=plain` (for more verbose build info) and/or `--no-cache` (when you want to ensure a build with NO cached layers)
132+
143133
### 2. Set up environment variables
134+
144135
Run the following to pull database-specific secrets out of vault (passing in the path to your vault token):
136+
145137
```
146-
source scripts/setup_mongo_dev.sh ~/.your-vault-token
138+
source scripts/setup-mongo-dev.sh ~/.your-vault-token
147139
```
140+
148141
Now run `env` to make sure you've set the following values:
142+
149143
```
150144
MONGODB_USERNAME=single_cell
151145
DATABASE_NAME=single_cell_portal_development
152146
MONGODB_PASSWORD=<password>
153147
DATABASE_HOST=<ip address>
154148
```
149+
155150
### 3. Print out your service account keyfile
151+
156152
Run the following to export out your default service account JSON keyfile:
153+
157154
```
158155
vault read -format=json secret/kdux/scp/development/$(whoami)/scp_service_account.json | jq .data > /tmp/keyfile.json
159156
```
157+
160158
### 4. Start the Docker container
159+
161160
Run the container, passing in the proper environment variables:
161+
162162
```
163163
docker run --name scp-ingest-test -e MONGODB_USERNAME="$MONGODB_USERNAME" -e DATABASE_NAME="$DATABASE_NAME" \
164164
-e MONGODB_PASSWORD="$MONGODB_PASSWORD" -e DATABASE_HOST="$DATABASE_HOST" \
165165
-e GOOGLE_APPLICATION_CREDENTIALS=/tmp/keyfile.json --rm -it \
166166
gcr.io/broad-singlecellportal-staging/ingest-pipeline:test-candidate bash
167167
```
168+
169+
Note: on an M1 machine, you may see this message:
170+
171+
> WARNING: The requested image's platform (linux/amd64) does not match the detected host platform (linux/arm64/v8) and no specific platform was requested
172+
168173
### 5. Copy keyfile to running container
174+
169175
In a separate terminal window, copy the JSON keyfile from above to the expected location:
176+
170177
```
171178
docker cp /tmp/keyfile.json scp-ingest-test:/tmp
172179
```
180+
173181
You can now run any `ingest_pipeline.py` command you wish inside the container.
182+
174183
# Use
175184

176185
Run this every time you start a new terminal to work on this project:
@@ -184,10 +193,12 @@ See [`ingest_pipeline.py`](https://github.com/broadinstitute/scp-ingest-pipeline
184193
## Troubleshooting during set up
185194

186195
If you run into an error like: "... [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed ... " try:
196+
187197
- Open terminal
188198
- `cd` to where python is installed
189199
- Run the certificates command with `/Applications/Python\ < Your Version of Python Here >/Install\ Certificates.command`
190200

191201
If you run into an error like "ModuleNotFoundError: No module named 'google'" try:
202+
192203
- Open terminal
193204
- Run `pip install --upgrade google-api-python-client`

docker-compose-dev.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
services:
2+
ingest:
3+
container_name: scp-ingest-pipeline
4+
image: "${GCR_IMAGE}"
5+
env_file:
6+
- config/.ingest_env
7+
expose:
8+
- 27017 # Allow outgoing traffic to MongoDB
9+
volumes:
10+
- .:/scp-ingest-pipeline # Needed to mount
11+

images/metadata.png

593 KB
Loading

ingest/a_vs_b_de.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#using https://nbviewer.org/github/theislab/diffxpy_tutorials/blob/master/diffxpy_tutorials/test/multiple_tests_per_gene.ipynb
2+
import anndata
3+
import matplotlib.pyplot as plt
4+
import seaborn as sns
5+
import logging
6+
import numpy as np
7+
import pandas as pd
8+
import scipy.stats
9+
import scanpy as sc
10+
import os
11+
12+
13+
import batchglm.api as glm
14+
15+
import diffxpy.api as de
16+
17+
# TODO (SCP-5041): Extract these to CLI arguments
18+
19+
#Pairwise tests between groups
20+
#answers whether a given pair of groups shows differential expression for each gene
21+
22+
#arguments:
23+
#data: Anndata object, data matrix with cells x genes
24+
25+
#grouping: str, column in data, the column that contains cell type labels. alternatively vector containing group labels
26+
27+
#test: type of test, default is z-test. other options: ’wald’, ’lrt’, ’t-test’, ’rank’
28+
29+
#lazy: bool, only possible if test is ztest, if true only evaluated once the user requests pval/coefficients for a specific pair of models
30+
31+
#noise_model: default is nb, specify NONE for wald and t test
32+
33+
h5ad_file = "tests/data/anndata/trimmed_compliant_pbmc3K.h5ad"
34+
35+
adata_file = sc.read_h5ad(h5ad_file)
36+
37+
adata_file.var_names_make_unique()
38+
39+
print(adata_file.obs["louvain"])
40+
41+
42+
test = de.test.pairwise(
43+
data=adata_file,
44+
grouping="louvain",
45+
test="rank",
46+
lazy=False,
47+
noise_model=None
48+
)
49+
50+
# Accessing results
51+
52+
#outputs shape of p values
53+
np.set_printoptions(precision=3)
54+
print("shape of p-values: %s" % str(test.pval.shape))
55+
56+
57+
#brings up pvalue
58+
print(test.pval[:,:,0])
59+
60+
#brings up plot
61+
test.plot_volcano()
62+

0 commit comments

Comments (0)