Skip to content

model training failed #40

@yurasong

Description

@yurasong

Hi!

I am currently trying out your package by following your vignette, and I ran into an error when launching mod.train().

I am running this on my institute's Linux-based HPC cluster, which has 40 NVIDIA RTX GPUs.

The message I got is:

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.


RuntimeError Traceback (most recent call last)
Input In [22], in <cell line: 1>()
----> 1 mod.train()

File /opt/anaconda/lib/python3.9/site-packages/cell2fate/_cell2fate_DynamicalModel.py:164, in Cell2fate_DynamicalModel.train(self, max_epochs, batch_size, train_size, lr, **kwargs)
161 kwargs["train_size"] = train_size
162 kwargs["lr"] = lr
--> 164 super().train(**kwargs)

File /opt/anaconda/lib/python3.9/site-packages/scvi/model/base/_pyromixin.py:146, in PyroSviTrainMixin.train(self, max_epochs, use_gpu, train_size, validation_size, batch_size, early_stopping, lr, training_plan, plan_kwargs, **trainer_kwargs)
136 trainer_kwargs["callbacks"].append(PyroJitGuideWarmup())
138 runner = TrainRunner(
139 self,
140 training_plan=training_plan,
(...)
144 **trainer_kwargs,
145 )
--> 146 return runner()

File /opt/anaconda/lib/python3.9/site-packages/scvi/train/_trainrunner.py:74, in TrainRunner.call(self)
71 if hasattr(self.data_splitter, "n_val"):
72 self.training_plan.n_obs_validation = self.data_splitter.n_val
---> 74 self.trainer.fit(self.training_plan, self.data_splitter)
75 self._update_history()
77 # data splitter only gets these attrs after fit

File /opt/anaconda/lib/python3.9/site-packages/scvi/train/_trainer.py:186, in Trainer.fit(self, *args, **kwargs)
180 if isinstance(args[0], PyroTrainingPlan):
181 warnings.filterwarnings(
182 action="ignore",
183 category=UserWarning,
184 message="LightningModule.configure_optimizers returned None",
185 )
--> 186 super().fit(*args, **kwargs)

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:740, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, train_dataloader, ckpt_path)
735 rank_zero_deprecation(
736 "trainer.fit(train_dataloader) is deprecated in v1.4 and will be removed in v1.6."
737 " Use trainer.fit(train_dataloaders) instead. HINT: added 's'"
738 )
739 train_dataloaders = train_dataloader
--> 740 self._call_and_handle_interrupt(
741 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
742 )

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:685, in Trainer._call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
675 r"""
676 Error handling, intended to be used only for main trainer function entry points (fit, validate, test, predict)
677 as all errors should funnel through them
(...)
682 **kwargs: keyword arguments to be passed to trainer_fn
683 """
684 try:
--> 685 return trainer_fn(*args, **kwargs)
686 # TODO: treat KeyboardInterrupt as BaseException (delete the code below) in v1.7
687 except KeyboardInterrupt as exception:

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:777, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
775 # TODO: ckpt_path only in v1.7
776 ckpt_path = ckpt_path or self.resume_from_checkpoint
--> 777 self._run(model, ckpt_path=ckpt_path)
779 assert self.state.stopped
780 self.training = False

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1199, in Trainer._run(self, model, ckpt_path)
1196 self.checkpoint_connector.resume_end()
1198 # dispatch start_training or start_evaluating or start_predicting
-> 1199 self._dispatch()
1201 # plugin will finalized fitting (e.g. ddp_spawn will load trained model)
1202 self._post_dispatch()

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1279, in Trainer._dispatch(self)
1277 self.training_type_plugin.start_predicting(self)
1278 else:
-> 1279 self.training_type_plugin.start_training(self)

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py:202, in TrainingTypePlugin.start_training(self, trainer)
200 def start_training(self, trainer: "pl.Trainer") -> None:
201 # double dispatch to initiate the training loop
--> 202 self._results = trainer.run_stage()

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1289, in Trainer.run_stage(self)
1287 if self.predicting:
1288 return self._run_predict()
-> 1289 return self._run_train()

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1319, in Trainer._run_train(self)
1317 self.fit_loop.trainer = self
1318 with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1319 self.fit_loop.run()

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/loops/base.py:145, in Loop.run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
--> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py:234, in FitLoop.advance(self)
231 data_fetcher = self.trainer._data_connector.get_profiled_dataloader(dataloader)
233 with self.trainer.profiler.profile("run_training_epoch"):
--> 234 self.epoch_loop.run(data_fetcher)
236 # the global step is manually decreased here due to backwards compatibility with existing loggers
237 # as they expect that the same step is used when logging epoch end metrics even when the batch loop has
238 # finished. this means the attribute does not exactly track the number of optimizer steps applied.
239 # TODO(@carmocca): deprecate and rename so users don't get confused
240 self.global_step -= 1

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/loops/base.py:145, in Loop.run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
--> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py:193, in TrainingEpochLoop.advance(self, *args, **kwargs)
190 self.batch_progress.increment_started()
192 with self.trainer.profiler.profile("run_training_batch"):
--> 193 batch_output = self.batch_loop.run(batch, batch_idx)
195 self.batch_progress.increment_processed()
197 # update non-plateau LR schedulers
198 # update epoch-interval ones only when we are at the end of training epoch

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/loops/base.py:145, in Loop.run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
--> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py:90, in TrainingBatchLoop.advance(self, batch, batch_idx)
88 outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
89 else:
---> 90 outputs = self.manual_loop.run(split_batch, batch_idx)
91 if outputs:
92 # automatic: can be empty if all optimizers skip their batches
93 # manual: #9052 added support for raising StopIteration in the training_step. If that happens,
94 # then advance doesn't finish and an empty dict is returned
95 self._outputs.append(outputs)

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/loops/base.py:145, in Loop.run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
--> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/manual_loop.py:111, in ManualOptimization.advance(self, batch, batch_idx)
109 lightning_module._current_fx_name = "training_step"
110 with self.trainer.profiler.profile("training_step"):
--> 111 training_step_output = self.trainer.accelerator.training_step(step_kwargs)
112 self.trainer.training_type_plugin.post_training_step()
114 del step_kwargs

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py:219, in Accelerator.training_step(self, step_kwargs)
214 """The actual training step.
215
216 See :meth:~pytorch_lightning.core.lightning.LightningModule.training_step for more details
217 """
218 with self.precision_plugin.train_step_context():
--> 219 return self.training_type_plugin.training_step(*step_kwargs.values())

File /opt/anaconda/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py:213, in TrainingTypePlugin.training_step(self, *args, **kwargs)
212 def training_step(self, *args, **kwargs):
--> 213 return self.model.training_step(*args, **kwargs)

File /opt/anaconda/lib/python3.9/site-packages/scvi/train/_trainingplans.py:741, in PyroTrainingPlan.training_step(self, batch, batch_idx)
739 kwargs.update({"kl_weight": self.kl_weight})
740 # pytorch lightning requires a Tensor object for loss
--> 741 loss = torch.Tensor([self.svi.step(*args, **kwargs)])
743 return {"loss": loss}

File /opt/anaconda/lib/python3.9/site-packages/pyro/infer/svi.py:145, in SVI.step(self, *args, **kwargs)
143 # get loss and compute gradients
144 with poutine.trace(param_only=True) as param_capture:
--> 145 loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs)
147 params = set(
148 site["value"].unconstrained() for site in param_capture.trace.nodes.values()
149 )
151 # actually perform gradient steps
152 # torch.optim objects gets instantiated for any params that haven't been seen yet

File /opt/anaconda/lib/python3.9/site-packages/pyro/infer/trace_elbo.py:140, in Trace_ELBO.loss_and_grads(self, model, guide, *args, **kwargs)
138 loss = 0.0
139 # grab a trace from the generator
--> 140 for model_trace, guide_trace in self._get_traces(model, guide, args, kwargs):
141 loss_particle, surrogate_loss_particle = self._differentiable_loss_particle(
142 model_trace, guide_trace
143 )
144 loss += loss_particle / self.num_particles

File /opt/anaconda/lib/python3.9/site-packages/pyro/infer/elbo.py:237, in ELBO._get_traces(self, model, guide, args, kwargs)
235 else:
236 for i in range(self.num_particles):
--> 237 yield self._get_trace(model, guide, args, kwargs)

File /opt/anaconda/lib/python3.9/site-packages/pyro/infer/trace_elbo.py:57, in Trace_ELBO._get_trace(self, model, guide, args, kwargs)
52 def _get_trace(self, model, guide, args, kwargs):
53 """
54 Returns a single trace from the guide, and the model that is run
55 against it.
56 """
---> 57 model_trace, guide_trace = get_importance_trace(
58 "flat", self.max_plate_nesting, model, guide, args, kwargs
59 )
60 if is_validation_enabled():
61 check_if_enumerated(guide_trace)

File /opt/anaconda/lib/python3.9/site-packages/pyro/infer/enum.py:75, in get_importance_trace(graph_type, max_plate_nesting, model, guide, args, kwargs, detach)
72 guide_trace = prune_subsample_sites(guide_trace)
73 model_trace = prune_subsample_sites(model_trace)
---> 75 model_trace.compute_log_prob()
76 guide_trace.compute_score_parts()
77 if is_validation_enabled():

File /opt/anaconda/lib/python3.9/site-packages/pyro/poutine/trace_struct.py:230, in Trace.compute_log_prob(self, site_filter)
228 if "log_prob" not in site:
229 try:
--> 230 log_p = site["fn"].log_prob(
231 site["value"], *site["args"], **site["kwargs"]
232 )
233 except ValueError as e:
234 _, exc_value, traceback = sys.exc_info()

File /opt/anaconda/lib/python3.9/site-packages/torch/distributions/gamma.py:71, in Gamma.log_prob(self, value)
67 if self._validate_args:
68 self._validate_sample(value)
69 return (self.concentration * torch.log(self.rate) +
70 (self.concentration - 1) * torch.log(value) -
---> 71 self.rate * value - torch.lgamma(self.concentration))

RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch)

// Float special-value bit patterns (NVRTC has no <cmath> macros available).
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)

// Fixed-width integer typedefs; NVRTC compiles without <cstdint>, so the
// generator declares them by hand and sanity-checks the sizes below.
typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t; // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
// Launch geometry constants baked into the generated kernel:
// each block of num_threads threads processes block_work_size elements,
// thread_work_size per thread.
constexpr int num_threads = 128;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);

namespace std {

using ::signbit;
using ::isfinite;
using ::isinf;
using ::isnan;

using ::abs;

using ::acos;
using ::acosf;
using ::asin;
using ::asinf;
using ::atan;
using ::atanf;
using ::atan2;
using ::atan2f;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::cosh;
using ::coshf;

using ::exp;
using ::expf;

using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;

using ::fmod;
using ::fmodf;

using ::frexp;
using ::frexpf;
using ::ldexp;
using ::ldexpf;

using ::log;
using ::logf;

using ::log10;
using ::log10f;
using ::modf;
using ::modff;

using ::pow;
using ::powf;

using ::sin;
using ::sinf;
using ::sinh;
using ::sinhf;

using ::sqrt;
using ::sqrtf;
using ::tan;
using ::tanf;

using ::tanh;
using ::tanhf;

using ::acosh;
using ::acoshf;
using ::asinh;
using ::asinhf;
using ::atanh;
using ::atanhf;
using ::cbrt;
using ::cbrtf;

using ::copysign;
using ::copysignf;

using ::erf;
using ::erff;
using ::erfc;
using ::erfcf;
using ::exp2;
using ::exp2f;
using ::expm1;
using ::expm1f;
using ::fdim;
using ::fdimf;
using ::fmaf;
using ::fma;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::hypot;
using ::hypotf;
using ::ilogb;
using ::ilogbf;
using ::lgamma;
using ::lgammaf;
using ::llrint;
using ::llrintf;
using ::llround;
using ::llroundf;
using ::log1p;
using ::log1pf;
using ::log2;
using ::log2f;
using ::logb;
using ::logbf;
using ::lrint;
using ::lrintf;
using ::lround;
using ::lroundf;

using ::nan;
using ::nanf;

using ::nearbyint;
using ::nearbyintf;
using ::nextafter;
using ::nextafterf;
using ::remainder;
using ::remainderf;
using ::remquo;
using ::remquof;
using ::rint;
using ::rintf;
using ::round;
using ::roundf;
using ::scalbln;
using ::scalblnf;
using ::scalbn;
using ::scalbnf;
using ::tgamma;
using ::tgammaf;
using ::trunc;
using ::truncf;

} // namespace std

// NB: Order matters for this macro; it is relied upon in
// promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
// X-macro: invokes the callback `_` once per scalar type, in enum order.
// The /* n */ comments record each type's fixed ScalarType ordinal.
// NOTE(review): the pasted source lost its backslash line continuations,
// comment delimiters, and std::complex<...> template arguments to markdown
// rendering; restored here so the macro is valid C++ again.
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_)   \
  _(uint8_t, Byte)                       /* 0 */  \
  _(int8_t, Char)                        /* 1 */  \
  _(int16_t, Short)                      /* 2 */  \
  _(int, Int)                            /* 3 */  \
  _(int64_t, Long)                       /* 4 */  \
  _(at::Half, Half)                      /* 5 */  \
  _(float, Float)                        /* 6 */  \
  _(double, Double)                      /* 7 */  \
  _(std::complex<at::Half>, ComplexHalf) /* 8 */  \
  _(std::complex<float>, ComplexFloat)   /* 9 */  \
  _(std::complex<double>, ComplexDouble) /* 10 */ \
  _(bool, Bool)                          /* 11 */ \
  _(void, QInt8)                         /* 12 */ \
  _(void, QUInt8)                        /* 13 */ \
  _(void, QInt32)                        /* 14 */ \
  _(at::BFloat16, BFloat16)              /* 15 */

// Same X-macro as above, minus ComplexHalf and the quantized void entries.
// NOTE(review): restored the backslash continuations and the
// std::complex<float>/std::complex<double> template arguments that markdown
// rendering stripped from the pasted source.
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(_) \
  _(uint8_t, Byte)                \
  _(int8_t, Char)                 \
  _(int16_t, Short)               \
  _(int, Int)                     \
  _(int64_t, Long)                \
  _(at::Half, Half)               \
  _(float, Float)                 \
  _(double, Double)               \
  _(std::complex<float>, ComplexFloat)   \
  _(std::complex<double>, ComplexDouble) \
  _(bool, Bool)                   \
  _(at::BFloat16, BFloat16)

// Runtime dtype tag. The enumerators are generated from the X-macro above,
// so their integer values are pinned by that macro's ordering.
enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
Undefined,
NumOptions
};

// Fixed-size POD array usable from device code (std::array is not available
// under NVRTC). Used here to pass the input/output buffer pointers by value.
// NOTE(review): restored the `__device__` qualifiers that markdown rendering
// stripped (double underscores were eaten as bold markers).
template <typename T, int size>
struct Array {
  T data[size];

  __device__ T operator[](int i) const {
    return data[i];
  }
  __device__ T& operator[](int i) {
    return data[i];
  }
  Array() = default;
  Array(const Array&) = default;
  Array& operator=(const Array&) = default;
};

// Read one scalar_t element from an untyped byte buffer at element offset
// `offset` (pointer arithmetic is in units of scalar_t after the cast).
// NOTE(review): restored the `template <typename scalar_t>` header,
// `__device__`, and the `*` tokens markdown rendering swallowed — the
// original paste read `(reinterpret_cast<scalar_t>(base_ptr) + offset)`,
// which drops both the pointer cast and the dereference.
template <typename scalar_t>
__device__ inline scalar_t load(char* base_ptr, uint32_t offset) {
  return *(reinterpret_cast<scalar_t*>(base_ptr) + offset);
}

// Write one scalar_t element into an untyped byte buffer at element offset
// `offset`; mirror of load<scalar_t>.
// NOTE(review): restored the `template <typename scalar_t>` header and
// `__device__` qualifier lost to markdown rendering.
template <typename scalar_t>
__device__ inline void store(scalar_t value, char *base_ptr, uint32_t offset) {
  *(reinterpret_cast<scalar_t *>(base_ptr) + offset) = value;
}

// aligned vector generates vectorized load/store on CUDA
// (alignas(sizeof(scalar_t) * vec_size) lets the compiler emit one wide
// memory transaction, e.g. a 16-byte access for <float, 4>).
template<typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
scalar_t val[vec_size];
};

template T lgamma_kernel(T a) { return lgamma(a); }

// TODO: setup grid-stride loop

// NVRTC-jitted elementwise kernel: out[i] = lgammaf(in[i]) for i in [0, N).
// data[0] is the output buffer, data[1] the input buffer (both as raw bytes).
// Full blocks take the vectorized path (4-wide aligned loads/stores, no
// bounds checks); the final partial block falls back to scalar accesses
// guarded by `remaining`. scalar_val is part of the generated-kernel ABI
// and is unused for this unary op.
// NOTE(review): restored the `__global__` qualifier markdown rendering
// stripped; logic is otherwise unchanged from the generated source.
extern "C" __global__
void lgamma_kernel_vectorized4_kernel(
    const int N,
    Array<char*, 1+1> data,
    float scalar_val) //[1+1],
{
  constexpr int vec_size = 4;
  // Elements left for this block; < block_work_size only for the tail block.
  int remaining = N - block_work_size * blockIdx.x;
  auto thread_idx = threadIdx.x;
  int idx = blockIdx.x;
  float arg0[4];
  float out[4];

  if (remaining < block_work_size) {
    // --- Tail block: scalar, bounds-checked path ---
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++){
      if (thread_idx >= remaining) {
        break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      arg0[j] = load<float>(data[1], linear_idx);
      thread_idx += num_threads;
    }
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if ((threadIdx.x  + j*num_threads) < remaining) {
        out[j] = lgamma_kernel<float>(arg0[j] );
      }
    }
    thread_idx = threadIdx.x;
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if (thread_idx >= remaining) {
          break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      store<float>(out[j], data[0], linear_idx);
      thread_idx += num_threads;
    }
  } else {
    // --- Full block: vectorized path, no bounds checks needed ---
    static constexpr int loop_size = thread_work_size / vec_size;

    // Vectorized loads from the input buffer (data[0+1] == data[1]).
    using vec_t_input = aligned_vector<float, vec_size>;
    vec_t_input * vec0 = reinterpret_cast<vec_t_input *>(data[0+1]) + block_work_size / vec_size * idx;

    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_input v;
      v = vec0[thread_idx];
      #pragma unroll
      for (int j=0; j < vec_size; j++){
        arg0[vec_size * i + j] = v.val[j];
      }
      thread_idx += num_threads;
    }

    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      out[j] = lgamma_kernel<float>(arg0[j]);
    }

    // Vectorized stores to the output buffer; use a fresh index rather than
    // shadowing the outer thread_idx as the generated source did.
    using vec_t_output = aligned_vector<float, vec_size>;
    vec_t_output * to_ = reinterpret_cast<vec_t_output *>(data[0]) + block_work_size / vec_size * idx;
    int store_thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_output v;
      #pragma unroll
      for (int j=0; j<vec_size; j++){
        v.val[j] = out[vec_size * i + j];
      }
      to_[store_thread_idx] = v;
      store_thread_idx += num_threads;
    }
  }

}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions