
Commit fa49099

Merge branch 'pytorch:master' into generator
2 parents: 6d5d2d2 + d291621

330 files changed (+5914 / -31044 lines)


.bazelrc

Lines changed: 3 additions & 3 deletions
@@ -91,7 +91,7 @@ build:short_logs --output_filter=DONT_MATCH_ANYTHING
 #build:tpu --@xla//xla/python:enable_tpu=true
 build:tpu --define=with_tpu_support=true

-# Run tests serially with TPU and GPU (only 1 device is available).
+# Run tests serially with TPU (only 1 device is available).
 test:tpu --local_test_jobs=1

 #########################################################################
@@ -100,11 +100,11 @@ test:tpu --local_test_jobs=1
 common --experimental_repo_remote_exec

 # Inherit environmental variables that are used in testing.
-test --test_env=TPU_NUM_DEVICES --test_env=GPU_NUM_DEVICES --test_env=CPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER
+test --test_env=TPU_NUM_DEVICES --test_env=CPU_NUM_DEVICES --test_env=XRT_LOCAL_WORKER
 test --test_env=XRT_TPU_CONFIG --test_env=XRT_DEVICE_MAP --test_env=XRT_WORKERS --test_env=XRT_MESH_SERVICE_ADDRESS
 test --test_env=XRT_SHARD_WORLD_SIZE --test_env=XRT_MULTI_PROCESSING_DEVICE --test_env=XRT_HOST_ORDINAL --test_env=XRT_SHARD_ORDINAL
 test --test_env=XRT_START_LOCAL_SERVER --test_env=TPUVM_MODE --test_env=PJRT_DEVICE --test_env=PJRT_TPU_MAX_INFLIGHT_COMPUTATIONS
-test --test_env=PJRT_CPU_ASYNC_CLIENT --test_env=PJRT_GPU_ASYNC_CLIENT --test_env=TPU_LIBRARY_PATH --test_env=PJRT_DIST_SERVICE_ADDR
+test --test_env=PJRT_CPU_ASYNC_CLIENT --test_env=TPU_LIBRARY_PATH --test_env=PJRT_DIST_SERVICE_ADDR
 test --test_env=PJRT_LOCAL_PROCESS_RANK

 # This environmental variable is important for properly integrating with XLA.
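
Since the --test_env flags above only forward variables that are already set in the invoking shell, a local run mirroring this configuration might look like the following sketch (the target pattern //test/cpp/... and the device count are assumptions, not taken from this diff):

# Hypothetical local invocation; adjust the target pattern for your checkout.
export TPU_NUM_DEVICES=1
export PJRT_DEVICE=TPU
bazel test --config=tpu //test/cpp/...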

.circleci/common.sh

Lines changed: 3 additions & 22 deletions
@@ -158,26 +158,12 @@ function run_torch_xla_cpp_tests() {
   fi

   if [ "$USE_COVERAGE" != "0" ]; then
-    if [ -x "$(command -v nvidia-smi)" ]; then
-      PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
-      cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov1.dat
-      PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
-      cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/cov2.dat
-      lcov --add-tracefile /tmp/cov1.dat -a /tmp/cov2.dat -o /tmp/merged.dat
-    else
-      PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
-      cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
-    fi
+    PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
+    cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
     genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info
     mv /tmp/merged.dat ~/htmlcov/cpp_lcov.info
   else
-    # Shard GPU testing
-    if [ -x "$(command -v nvidia-smi)" ]; then
-      PJRT_DEVICE=CUDA test/cpp/run_tests.sh $EXTRA_ARGS -L""
-      PJRT_DEVICE=CUDA test/cpp/run_tests.sh -X early_sync -F AtenXlaTensorTest.TestEarlySyncLiveTensors -L"" $EXTRA_ARGS
-    else
-      PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
-    fi
+    PJRT_DEVICE=CPU test/cpp/run_tests.sh $EXTRA_ARGS -L""
   fi
   popd
 }
@@ -196,11 +182,6 @@ function run_torch_xla_tests() {
   RUN_CPP="${RUN_CPP_TESTS:0}"
   RUN_PYTHON="${RUN_PYTHON_TESTS:0}"

-  if [ -x "$(command -v nvidia-smi)" ]; then
-    num_devices=$(nvidia-smi --list-gpus | wc -l)
-    echo "Found $num_devices GPU devices..."
-    export GPU_NUM_DEVICES=$num_devices
-  fi
   export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
   export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
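
A minimal sketch of the simplified CPU-only coverage path above, as it might be run outside CI (the checkout location in XLA_DIR is an assumption):

# Assumes a coverage-enabled torch_xla build in $XLA_DIR.
export XLA_DIR=$HOME/pytorch/xla
cd $XLA_DIR
PJRT_DEVICE=CPU test/cpp/run_tests.sh -L""
cp $XLA_DIR/bazel-out/_coverage/_coverage_report.dat /tmp/merged.dat
genhtml /tmp/merged.dat -o ~/htmlcov/cpp/cpp_lcov.info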

.devcontainer/gpu-internal/devcontainer.json

Lines changed: 0 additions & 30 deletions
This file was deleted.

.github/ISSUE_TEMPLATE.md

Lines changed: 1 addition & 1 deletion
@@ -13,5 +13,5 @@ Error messages and stack traces are also helpful.

 ## System Info

-- reproducible on XLA backend [CPU/TPU/CUDA]:
+- reproducible on XLA backend [CPU/TPU]:
 - torch_xla version:

.github/ISSUE_TEMPLATE/bug-report.md

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ Steps to reproduce the behavior:

 ## Environment

-- Reproducible on XLA backend [CPU/TPU/CUDA]:
+- Reproducible on XLA backend [CPU/TPU]:
 - torch_xla version:

.github/ci.md

Lines changed: 40 additions & 47 deletions
@@ -3,22 +3,22 @@
 PyTorch and PyTorch/XLA use CI to lint, build, and test each PR that is
 submitted. All CI tests should succeed before the PR is merged into master.
 PyTorch CI pins PyTorch/XLA to a specific commit. On the other hand, PyTorch/XLA
-CI pulls PyTorch from master unless a pin is manually provided. This README will
-go through the reasons of these pins, how to pin a PyTorch/XLA PR to an upstream
-PyTorch PR, and how to coordinate a merge for breaking PyTorch changes.
+CI pulls PyTorch from `.torch_commit` unless a pin is manually provided. This
+README will go through the reasons of these pins, how to pin a PyTorch/XLA PR
+to an upstream PyTorch PR, and how to coordinate a merge for breaking PyTorch
+changes.

 ## Usage

-### Pinning PyTorch PR in PyTorch/XLA PR
+### Temporarily Pinning PyTorch PR in PyTorch/XLA PR

 Sometimes a PyTorch/XLA PR needs to be pinned to a specific PyTorch PR to test
-new features, fix breaking changes, etc. Since PyTorch/XLA CI pulls from PyTorch
-master by default, we need to manually provide a PyTorch pin. In a PyTorch/XLA
-PR, PyTorch can be manually pinned by creating a `.torch_pin` file at the root
-of the repository. The `.torch_pin` should have the corresponding PyTorch PR
-number prefixed by "#". Take a look at [example
-here](https://github.com/pytorch/xla/pull/7313). Before the PyTorch/XLA PR gets
-merged, the `.torch_pin` must be deleted.
+new features, fix breaking changes, etc. In a PyTorch/XLA PR, PyTorch can be
+manually pinned by creating a `.torch_pin` file at the root of the repository.
+The `.torch_pin` should have the corresponding PyTorch PR number prefixed by
+"#". Take a look at [example here](https://github.com/pytorch/xla/pull/7313).
+Before the PyTorch/XLA PR gets merged, the `.torch_pin` must be deleted and
+`.torch_commit` updated.

 ### Coordinating merges for breaking PyTorch PRs

@@ -35,29 +35,42 @@ fail. Steps for fixing and merging such breaking PyTorch change is as following:
    PyTorch PR to pin the PyTorch/XLA to the commit hash created in step 1 by
    updating `pytorch/.github/ci_commit_pins/xla.txt`.
 1. Once CI tests are green on both ends, merge PyTorch PR.
-1. Remove the `.torch_pin` in PyTorch/XLA PR and merge. To be noted, `git commit
-   --amend` should be avoided in this step as PyTorch CI will keep using the
-   commit hash created in step 1 until other PRs update that manually or the
-   nightly buildbot updates that automatically.
+1. Remove the `.torch_pin` in PyTorch/XLA PR and update the `.torch_commit` to
+   the hash of the merged PyTorch PR. To be noted, `git commit --amend` should
+   be avoided in this step as PyTorch CI will keep using the commit hash
+   created in step 1 until other PRs update that manually or the nightly
+   buildbot updates that automatically.
 1. Finally, don't delete your branch until 2 days later. See step 4 for
    explanations.

 ### Running TPU tests on PRs

-The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU and
-GPU. The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`.
+The `build_and_test.yml` workflow runs tests on the TPU in addition to CPU.
+The set of tests run on the TPU is defined in `test/tpu/run_tests.sh`.
+
+## Update the PyTorch Commit Pin
+
+In order to reduce development burden of PyTorch/XLA, starting from #9654, we
+started pinning PyTorch using the `.torch_commit` file. This should reduce the
+number of times a PyTorch PR breaks our most recent commits. However, this also
+requires maintenance, i.e. someone has to keep updating the PyTorch commit so
+as to make sure it's always supporting (almost) the latest PyTorch versions.
+
+Updating the PyTorch commit pin is, theoretically, simple. You just have to run
+`scripts/update_deps.py --pytorch` file, and open a PR. In practice, you may
+encounter a few compilation errors, or even segmentation faults.

 ## CI Environment

 Before the CI in this repository runs, we build a base dev image. These are the
 same images we recommend in our VSCode `.devcontainer` setup and nightly build
-to ensure consistency between environments. We produce variants with and without
-CUDA, configured in `infra/ansible` (build config) and
-`infra/tpu-pytorch-releases/dev_images.tf` (build triggers).
+to ensure consistency between environments. We produce variants configured in
+`infra/ansible` (build config) and `infra/tpu-pytorch-releases/dev_images.tf`
+(build triggers).

 The CI runs in two environments:

-1. Organization self-hosted runners for CPU and GPU: used for almost every step
+1. Organization self-hosted runners for CPU: used for almost every step
    of the CI. These runners are managed by PyTorch and have access to the shared
    ECR repository.
 1. TPU self-hosted runners: these are managed by us and are only available in
@@ -68,48 +81,35 @@ The CI runs in two environments:

 We have two build paths for each CI run:

-- `torch_xla`: we build the main package to support both TPU and GPU[^1], along
+- `torch_xla`: we build the main package to support TPU, along
   with a CPU build of `torch` from HEAD. This build step exports the
   `torch-xla-wheels` artifact for downstream use in tests.
   - Some CI tests also require `torchvision`. To reduce flakiness, we compile
     `torchvision` from [`torch`'s CI pin][pytorch-vision-pin].
   - C++ tests are piggybacked onto the same build and uploaded in the
     `cpp-test-bin` artifact.
-- `torch_xla_cuda_plugin`: the XLA CUDA runtime can be built independently of
-  either `torch` or `torch_xla` -- it depends only on our pinned OpenXLA. Thus,
-  this build should be almost entirely cached, unless your PR changes the XLA
-  pin or adds a patch.

-Both the main package build and plugin build are configured with ansible at
-`infra/ansible`, although they run in separate stages (`stage=build_srcs` vs
-`stage=build_plugin`). This is the same configuration we use for our nightly and
-release builds.
+The main package build is configured with ansible at `infra/ansible`. This is
+the same configuration we use for our nightly and release builds.

-The CPU and GPU test configs are defined in the same file, `_test.yml`. Since
+The CPU test config is defined in the file `_test.yml`. Since
 some of the tests come from the upstream PyTorch repository, we check out
 PyTorch at the same git rev as the `build` step (taken from
 `torch_xla.version.__torch_gitrev__`). The tests are split up into multiple
 groups that run in parallel; the `matrix` section of `_test.yml` corresponds to
 in `.github/scripts/run_tests.sh`.

 CPU tests run immediately after the `torch_xla` build completes. This will
-likely be the first test feedback on your commit. GPU tests will launch when
-both the `torch_xla` and `torch_xla_cuda_plugin` complete. GPU compilation is
-much slower due to the number of possible optimizations, and the GPU chips
-themselves are quite outdated, so these tests will take longer to run than the
-CPU tests.
+likely be the first test feedback on your commit.

 ![CPU tests launch when `torch_xla` is
 complete](../docs/assets/ci_test_dependency.png)

-![GPU tests also depend on CUDA
-plugin](../docs/assets/ci_test_dependency_gpu.png)
-
 For the C++ test groups in either case, the test binaries are pre-built during
 the build phase and packaged in `cpp-test-bin`. This will only be downloaded if
 necessary.

-[^1]: Note: both GPU and TPU support require their respective plugins to be
+[^1]: Note: TPU support require its respective plugins to be
     installed. This package will _not_ work on either out of the box.

 ### TPU CI
@@ -165,13 +165,6 @@ good" commit to prevent accidental changes from PyTorch/XLA to break PyTorch CI
 without warning. PyTorch has hundreds of commits each week, and this pin ensures
 that PyTorch/XLA as a downstream package does not cause failures in PyTorch CI.

-#### Why does PyTorch/XLA CI pull from PyTorch master?
-
-[PyTorch/XLA CI pulls PyTorch from master][pull-pytorch-master] unless a PyTorch
-pin is manually provided. PyTorch/XLA is a downstream package to PyTorch, and
-pulling from master ensures that PyTorch/XLA will stay up-to-date and works with
-the latest PyTorch changes.
-
 #### TPU CI is broken

 If the TPU CI won't run, try to debug using the following steps:
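
To make the `.torch_pin`/`.torch_commit` workflow described in ci.md above concrete, a rough sketch of the two steps might look like this ("#12345" is a placeholder PR number, not one referenced by this commit):

# While iterating: pin the PyTorch/XLA PR to an upstream PyTorch PR.
echo '#12345' > .torch_pin
git add .torch_pin && git commit -m "Pin to upstream PyTorch PR"

# Before merging: drop the pin and refresh the commit pin instead.
git rm .torch_pin
python scripts/update_deps.py --pytorch   # updates .torch_commit, per the doc above
git commit -am "Update .torch_commit"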

.github/scripts/run_tests.sh

Lines changed: 1 addition & 13 deletions
@@ -30,14 +30,7 @@ function run_torch_xla_cpp_tests() {

   TORCH_DIR=$(python -c "import pkgutil; import os; print(os.path.dirname(pkgutil.get_loader('torch').get_filename()))")
   export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${TORCH_DIR}/lib
-  if [ -x "$(command -v nvidia-smi)" ]; then
-    CUDA_PLUGIN_DIR=$(python -c "import pkgutil; import os; print(os.path.dirname(pkgutil.get_loader('torch_xla_cuda_plugin').get_filename()))")
-    export PJRT_LIBRARY_PATH=$CUDA_PLUGIN_DIR/lib/pjrt_c_api_gpu_plugin.so
-    export PJRT_DEVICE=LIBRARY
-    export PJRT_DYNAMIC_PLUGINS=1
-  else
-    export PJRT_DEVICE=CPU
-  fi
+  export PJRT_DEVICE=CPU
   export XLA_EXPERIMENTAL="nonzero:masked_select:nms"

   test_names=("test_aten_xla_tensor_1"
@@ -84,11 +77,6 @@ PYTORCH_DIR=$1
 XLA_DIR=$2
 USE_COVERAGE="${3:-0}"

-if [ -x "$(command -v nvidia-smi)" ]; then
-  num_devices=$(nvidia-smi --list-gpus | wc -l)
-  echo "Found $num_devices GPU devices..."
-  export GPU_NUM_DEVICES=$num_devices
-fi
 export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla"
 export CXX_ABI=$(python -c "import torch;print(int(torch._C._GLIBCXX_USE_CXX11_ABI))")
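
Based on the positional parameters shown above (PYTORCH_DIR=$1, XLA_DIR=$2, USE_COVERAGE="${3:-0}"), a local invocation might look like this sketch (the checkout paths are assumptions):

# Run the CPU-only test suite against local checkouts; coverage stays disabled (third arg defaults to 0).
PYTORCH_DIR=$HOME/pytorch
XLA_DIR=$HOME/pytorch/xla
.github/scripts/run_tests.sh "$PYTORCH_DIR" "$XLA_DIR" 0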

.github/upstream/install_conda.sh

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,10 @@ function install_and_setup_conda() {
   fi
   export CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"

+  # Accept Conda channels' ToS automatically.
+  # Ref: https://github.com/pytorch/pytorch/issues/158438#issuecomment-3084935777
+  export CONDA_PLUGINS_AUTO_ACCEPT_TOS="yes"
+
   conda update -y -n base conda
   conda install -y python=$PYTHON_VERSION
