diff --git a/.gitignore b/.gitignore
index e6e38132f5..e3f7613fee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,5 @@
 # for QtCreator:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cfb2516631..74433f3332 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ endif()
diff --git a/cmake/HandlePython.cmake b/cmake/HandlePython.cmake
index 0c24824bcd..16215986b6 100644
--- a/cmake/HandlePython.cmake
+++ b/cmake/HandlePython.cmake
       set(Python_VERSION_MAJOR ${Python3_VERSION_MAJOR})
       set(Python_VERSION_MINOR ${Python3_VERSION_MINOR})
+      set(Python_VERSION_PATCH ${Python3_VERSION_PATCH})
-        "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}"
+        "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}.${Python_VERSION_PATCH}"
         CACHE STRING "The version of Python to build the wrappers against."
diff --git a/gtsam/base/OptionalJacobian.h b/gtsam/base/OptionalJacobian.h
index 07801df7ad..c9a960a893 100644
--- a/gtsam/base/OptionalJacobian.h
+++ b/gtsam/base/OptionalJacobian.h
@@ -20,6 +20,8 @@
 #pragma once
 #include <gtsam/config.h>      // Configuration from CMake
 #include <Eigen/Dense>
+#include <stdexcept>
+#include <string>
 #include <boost/optional.hpp>
@@ -96,6 +98,24 @@ class OptionalJacobian {
+  /**
+   * @brief Constructor from an Eigen::Ref *value*. Will not usurp if dimension is wrong
+   * @note This is important so we don't overwrite someone else's memory!
+   */
+  template<class MATRIX>
+  OptionalJacobian(Eigen::Ref<MATRIX> dynamic_ref) :
+      map_(nullptr) {
+    if (dynamic_ref.rows() == Rows && dynamic_ref.cols() == Cols && !dynamic_ref.IsRowMajor) {
+      usurp(dynamic_ref.data());
+    } else {
+      throw std::invalid_argument(
+          std::string("OptionalJacobian called with wrong dimensions or "
+                      "storage order.\n"
+                      "Expected: ") +
+          "(" + std::to_string(Rows) + ", " + std::to_string(Cols) + ")");
+    }
+  }
   /// Constructor with boost::none just makes empty
diff --git a/gtsam/discrete/AlgebraicDecisionTree.h b/gtsam/discrete/AlgebraicDecisionTree.h
index a2ceac834f..9769715a17 100644
--- a/gtsam/discrete/AlgebraicDecisionTree.h
+++ b/gtsam/discrete/AlgebraicDecisionTree.h
@@ -160,7 +160,7 @@ namespace gtsam {
               const typename Base::LabelFormatter& labelFormatter =
                   &DefaultFormatter) const {
       auto valueFormatter = [](const double& v) {
-        return (boost::format("%4.4g") % v).str();
+        return (boost::format("%4.8g") % v).str();
       Base::print(s, labelFormatter, valueFormatter);
diff --git a/gtsam/discrete/DecisionTree-inl.h b/gtsam/discrete/DecisionTree-inl.h
index b6e5482978..99f29b8e5f 100644
--- a/gtsam/discrete/DecisionTree-inl.h
+++ b/gtsam/discrete/DecisionTree-inl.h
@@ -59,33 +59,41 @@ namespace gtsam {
     /** constant stored in this leaf */
     Y constant_;
-    /** Constructor from constant */
-    Leaf(const Y& constant) :
-      constant_(constant) {}
+    /** The number of assignments contained within this leaf.
+     * Particularly useful when leaves have been pruned.
+     */
+    size_t nrAssignments_;
+    /// Constructor from constant
+    Leaf(const Y& constant, size_t nrAssignments = 1)
+        : constant_(constant), nrAssignments_(nrAssignments) {}
-    /** return the constant */
+    /// Return the constant
     const Y& constant() const {
       return constant_;
+    /// Return the number of assignments contained within this leaf.
+    size_t nrAssignments() const { return nrAssignments_; }
     /// Leaf-Leaf equality
     bool sameLeaf(const Leaf& q) const override {
       return constant_ == q.constant_;
-    /// polymorphic equality: is q is a leaf, could be
+    /// polymorphic equality: is q a leaf and is it the same as this leaf?
     bool sameLeaf(const Node& q) const override {
       return (q.isLeaf() && q.sameLeaf(*this));
-    /** equality up to tolerance */
+    /// equality up to tolerance
     bool equals(const Node& q, const CompareFunc& compare) const override {
       const Leaf* other = dynamic_cast<const Leaf*>(&q);
       if (!other) return false;
       return compare(this->constant_, other->constant_);
-    /** print */
+    /// print
     void print(const std::string& s, const LabelFormatter& labelFormatter,
                const ValueFormatter& valueFormatter) const override {
       std::cout << s << " Leaf " << valueFormatter(constant_) << std::endl;
@@ -108,14 +116,14 @@ namespace gtsam {
     /** apply unary operator */
     NodePtr apply(const Unary& op) const override {
-      NodePtr f(new Leaf(op(constant_)));
+      NodePtr f(new Leaf(op(constant_), nrAssignments_));
       return f;
     /// Apply unary operator with assignment
     NodePtr apply(const UnaryAssignment& op,
-                  const Assignment<L>& choices) const override {
-      NodePtr f(new Leaf(op(choices, constant_)));
+                  const Assignment<L>& assignment) const override {
+      NodePtr f(new Leaf(op(assignment, constant_), nrAssignments_));
       return f;
@@ -130,7 +138,8 @@ namespace gtsam {
     // Applying binary operator to two leaves results in a leaf
     NodePtr apply_g_op_fL(const Leaf& fL, const Binary& op) const override {
-      NodePtr h(new Leaf(op(fL.constant_, constant_)));  // fL op gL
+      // fL op gL
+      NodePtr h(new Leaf(op(fL.constant_, constant_), nrAssignments_));
       return h;
@@ -141,7 +150,7 @@ namespace gtsam {
     /** choose a branch, create new memory ! */
     NodePtr choose(const L& label, size_t index) const override {
-      return NodePtr(new Leaf(constant()));
+      return NodePtr(new Leaf(constant(), nrAssignments()));
     bool isLeaf() const override { return true; }
@@ -159,7 +168,10 @@ namespace gtsam {
     std::vector<NodePtr> branches_;
-    /** incremental allSame */
+    /**
+     * Incremental allSame.
+     * Records if all the branches are the same leaf.
+     */
     size_t allSame_;
     using ChoicePtr = boost::shared_ptr<const Choice>;
@@ -172,15 +184,22 @@ namespace gtsam {
-    /** If all branches of a choice node f are the same, just return a branch */
+    /// If all branches of a choice node f are the same, just return a branch.
     static NodePtr Unique(const ChoicePtr& f) {
-#ifndef DT_NO_PRUNING
       if (f->allSame_) {
         assert(f->branches().size() > 0);
         NodePtr f0 = f->branches_[0];
-        assert(f0->isLeaf());
+        size_t nrAssignments = 0;
+        for(auto branch: f->branches()) {
+          assert(branch->isLeaf());
+          nrAssignments +=
+              boost::dynamic_pointer_cast<const Leaf>(branch)->nrAssignments();
+        }
         NodePtr newLeaf(
-            new Leaf(boost::dynamic_pointer_cast<const Leaf>(f0)->constant()));
+            new Leaf(boost::dynamic_pointer_cast<const Leaf>(f0)->constant(),
+                     nrAssignments));
         return newLeaf;
       } else
@@ -189,15 +208,13 @@ namespace gtsam {
     bool isLeaf() const override { return false; }
-    /** Constructor, given choice label and mandatory expected branch count */
+    /// Constructor, given choice label and mandatory expected branch count.
     Choice(const L& label, size_t count) :
       label_(label), allSame_(true) {
-    /**
-     * Construct from applying binary op to two Choice nodes
-     */
+    /// Construct from applying binary op to two Choice nodes.
     Choice(const Choice& f, const Choice& g, const Binary& op) :
       allSame_(true) {
       // Choose what to do based on label
@@ -225,6 +242,7 @@ namespace gtsam {
+    /// Return the label of this choice node.
     const L& label() const {
       return label_;
@@ -246,7 +264,7 @@ namespace gtsam {
-    /** print (as a tree) */
+    /// print (as a tree).
     void print(const std::string& s, const LabelFormatter& labelFormatter,
                const ValueFormatter& valueFormatter) const override {
       std::cout << s << " Choice(";
@@ -292,7 +310,7 @@ namespace gtsam {
       return (q.isLeaf() && q.sameLeaf(*this));
-    /** equality */
+    /// equality
     bool equals(const Node& q, const CompareFunc& compare) const override {
       const Choice* other = dynamic_cast<const Choice*>(&q);
       if (!other) return false;
@@ -305,7 +323,7 @@ namespace gtsam {
       return true;
-    /** evaluate */
+    /// evaluate
     const Y& operator()(const Assignment<L>& x) const override {
 #ifndef NDEBUG
       typename Assignment<L>::const_iterator it = x.find(label_);
@@ -320,13 +338,13 @@ namespace gtsam {
       return (*child)(x);
-    /**
-     * Construct from applying unary op to a Choice node
-     */
+    /// Construct from applying unary op to a Choice node.
     Choice(const L& label, const Choice& f, const Unary& op) :
       label_(label), allSame_(true) {
       branches_.reserve(f.branches_.size());  // reserve space
-      for (const NodePtr& branch : f.branches_) push_back(branch->apply(op));
+      for (const NodePtr& branch : f.branches_) {
+        push_back(branch->apply(op));
+      }
@@ -337,28 +355,28 @@ namespace gtsam {
      * @param f The original choice node to apply the op on.
      * @param op Function to apply on the choice node. Takes Assignment and
      * value as arguments.
-     * @param choices The Assignment that will go to op.
+     * @param assignment The Assignment that will go to op.
     Choice(const L& label, const Choice& f, const UnaryAssignment& op,
-           const Assignment<L>& choices)
+           const Assignment<L>& assignment)
         : label_(label), allSame_(true) {
       branches_.reserve(f.branches_.size());  // reserve space
-      Assignment<L> choices_ = choices;
+      Assignment<L> assignment_ = assignment;
       for (size_t i = 0; i < f.branches_.size(); i++) {
-        choices_[label_] = i;  // Set assignment for label to i
+        assignment_[label_] = i;  // Set assignment for label to i
         const NodePtr branch = f.branches_[i];
-        push_back(branch->apply(op, choices_));
+        push_back(branch->apply(op, assignment_));
-        // Remove the choice so we are backtracking
-        auto choice_it = choices_.find(label_);
-        choices_.erase(choice_it);
+        // Remove the assignment so we are backtracking
+        auto assignment_it = assignment_.find(label_);
+        assignment_.erase(assignment_it);
-    /** apply unary operator */
+    /// apply unary operator.
     NodePtr apply(const Unary& op) const override {
       auto r = boost::make_shared<Choice>(label_, *this, op);
       return Unique(r);
@@ -366,8 +384,8 @@ namespace gtsam {
     /// Apply unary operator with assignment
     NodePtr apply(const UnaryAssignment& op,
-                  const Assignment<L>& choices) const override {
-      auto r = boost::make_shared<Choice>(label_, *this, op, choices);
+                  const Assignment<L>& assignment) const override {
+      auto r = boost::make_shared<Choice>(label_, *this, op, assignment);
       return Unique(r);
@@ -640,7 +658,7 @@ namespace gtsam {
     // If leaf, apply unary conversion "op" and create a unique leaf.
     using MXLeaf = typename DecisionTree<M, X>::Leaf;
     if (auto leaf = boost::dynamic_pointer_cast<const MXLeaf>(f)) {
-      return NodePtr(new Leaf(Y_of_X(leaf->constant())));
+      return NodePtr(new Leaf(Y_of_X(leaf->constant()), leaf->nrAssignments()));
     // Check if Choice
@@ -662,7 +680,16 @@ namespace gtsam {
-  // Functor performing depth-first visit without Assignment<L> argument.
+  /**
+   * Functor performing depth-first visit to each leaf with the leaf value as
+   * the argument.
+   *
+   * NOTE: We differentiate between leaves and assignments. Concretely, a 3
+   * binary variable tree will have 2^3=8 assignments, but based on pruning, it
+   * can have less than 8 leaves. For example, if a tree has all assignment
+   * values as 1, then pruning will cause the tree to have only 1 leaf yet 8
+   * assignments.
+   */
   template <typename L, typename Y>
   struct Visit {
     using F = std::function<void(const Y&)>;
@@ -691,33 +718,74 @@ namespace gtsam {
-  // Functor performing depth-first visit with Assignment<L> argument.
+  /**
+   * Functor performing depth-first visit to each leaf with the Leaf object
+   * passed as an argument.
+   *
+   * NOTE: We differentiate between leaves and assignments. Concretely, a 3
+   * binary variable tree will have 2^3=8 assignments, but based on pruning, it
+   * can have <8 leaves. For example, if a tree has all assignment values as 1,
+   * then pruning will cause the tree to have only 1 leaf yet 8 assignments.
+   */
+  template <typename L, typename Y>
+  struct VisitLeaf {
+    using F = std::function<void(const typename DecisionTree<L, Y>::Leaf&)>;
+    explicit VisitLeaf(F f) : f(f) {}  ///< Construct from folding function.
+    F f;                           ///< folding function object.
+    /// Do a depth-first visit on the tree rooted at node.
+    void operator()(const typename DecisionTree<L, Y>::NodePtr& node) const {
+      using Leaf = typename DecisionTree<L, Y>::Leaf;
+      if (auto leaf = boost::dynamic_pointer_cast<const Leaf>(node))
+        return f(*leaf);
+      using Choice = typename DecisionTree<L, Y>::Choice;
+      auto choice = boost::dynamic_pointer_cast<const Choice>(node);
+      if (!choice)
+        throw std::invalid_argument("DecisionTree::VisitLeaf: Invalid NodePtr");
+      for (auto&& branch : choice->branches()) (*this)(branch);  // recurse!
+    }
+  };
+  template <typename L, typename Y>
+  template <typename Func>
+  void DecisionTree<L, Y>::visitLeaf(Func f) const {
+    VisitLeaf<L, Y> visit(f);
+    visit(root_);
+  }
+  /****************************************************************************/
+  /**
+   * Functor performing depth-first visit to each leaf with the leaf's
+   * `Assignment<L>` and value passed as arguments.
+   *
+   * NOTE: Follows the same pruning semantics as `visit`.
+   */
   template <typename L, typename Y>
   struct VisitWith {
-    using Choices = Assignment<L>;
-    using F = std::function<void(const Choices&, const Y&)>;
+    using F = std::function<void(const Assignment<L>&, const Y&)>;
     explicit VisitWith(F f) : f(f) {}  ///< Construct from folding function.
-    Choices choices;  ///< Assignment, mutating through recursion.
-    F f;              ///< folding function object.
+    Assignment<L> assignment;  ///< Assignment, mutating through recursion.
+    F f;                       ///< folding function object.
     /// Do a depth-first visit on the tree rooted at node.
     void operator()(const typename DecisionTree<L, Y>::NodePtr& node) {
       using Leaf = typename DecisionTree<L, Y>::Leaf;
       if (auto leaf = boost::dynamic_pointer_cast<const Leaf>(node))
-        return f(choices, leaf->constant());
+        return f(assignment, leaf->constant());
       using Choice = typename DecisionTree<L, Y>::Choice;
       auto choice = boost::dynamic_pointer_cast<const Choice>(node);
       if (!choice)
         throw std::invalid_argument("DecisionTree::VisitWith: Invalid NodePtr");
       for (size_t i = 0; i < choice->nrChoices(); i++) {
-        choices[choice->label()] = i;  // Set assignment for label to i
+        assignment[choice->label()] = i;  // Set assignment for label to i
         (*this)(choice->branches()[i]);  // recurse!
         // Remove the choice so we are backtracking
-        auto choice_it = choices.find(choice->label());
-        choices.erase(choice_it);
+        auto choice_it = assignment.find(choice->label());
+        assignment.erase(choice_it);
@@ -747,12 +815,26 @@ namespace gtsam {
-  // labels is just done with a visit
+  /**
+   * Get (partial) labels by performing a visit.
+   *
+   * This method performs a depth-first search to go to every leaf and records
+   * the keys assignment which leads to that leaf. Since the tree can be pruned,
+   * there might be a leaf at a lower depth which results in a partial
+   * assignment (i.e. not all keys are specified).
+   *
+   * E.g. given a tree with 3 keys, there may be a branch where the 3rd key has
+   * the same values for all the leaves. This leads to the branch being pruned
+   * so we get a leaf which is arrived at by just the first 2 keys and their
+   * assignments.
+   */
   template <typename L, typename Y>
   std::set<L> DecisionTree<L, Y>::labels() const {
     std::set<L> unique;
-    auto f = [&](const Assignment<L>& choices, const Y&) {
-      for (auto&& kv : choices) unique.insert(kv.first);
+    auto f = [&](const Assignment<L>& assignment, const Y&) {
+      for (auto&& kv : assignment) {
+        unique.insert(kv.first);
+      }
     return unique;
@@ -801,8 +883,8 @@ namespace gtsam {
       throw std::runtime_error(
           "DecisionTree::apply(unary op) undefined for empty tree.");
-    Assignment<L> choices;
-    return DecisionTree(root_->apply(op, choices));
+    Assignment<L> assignment;
+    return DecisionTree(root_->apply(op, assignment));
diff --git a/gtsam/discrete/DecisionTree.h b/gtsam/discrete/DecisionTree.h
index c0a2a7a1c6..1f45d320b9 100644
--- a/gtsam/discrete/DecisionTree.h
+++ b/gtsam/discrete/DecisionTree.h
@@ -105,7 +105,7 @@ namespace gtsam {
       virtual const Y& operator()(const Assignment<L>& x) const = 0;
       virtual Ptr apply(const Unary& op) const = 0;
       virtual Ptr apply(const UnaryAssignment& op,
-                        const Assignment<L>& choices) const = 0;
+                        const Assignment<L>& assignment) const = 0;
       virtual Ptr apply_f_op_g(const Node&, const Binary&) const = 0;
       virtual Ptr apply_g_op_fL(const Leaf&, const Binary&) const = 0;
       virtual Ptr apply_g_op_fC(const Choice&, const Binary&) const = 0;
@@ -153,7 +153,7 @@ namespace gtsam {
     /** Create a constant */
     explicit DecisionTree(const Y& y);
-    /** Create a new leaf function splitting on a variable */
+    /// Create tree with 2 assignments `y1`, `y2`, splitting on variable `label`
     DecisionTree(const L& label, const Y& y1, const Y& y2);
     /** Allow Label+Cardinality for convenience */
@@ -219,9 +219,8 @@ namespace gtsam {
     /// @name Standard Interface
     /// @{
-    /** Make virtual */
-    virtual ~DecisionTree() {
-    }
+    /// Make virtual
+    virtual ~DecisionTree() {}
     /// Check if tree is empty.
     bool empty() const { return !root_; }
@@ -234,11 +233,13 @@ namespace gtsam {
      * @brief Visit all leaves in depth-first fashion.
-     * 
-     * @param f side-effect taking a value.
-     * 
-     * @note Due to pruning, leaves might not exhaust choices.
-     * 
+     *
+     * @param f (side-effect) Function taking a value.
+     *
+     * @note Due to pruning, the number of leaves may not be the same as the
+     * number of assignments. E.g. if we have a tree on 2 binary variables with
+     * all values being 1, then there are 2^2=4 assignments, but only 1 leaf.
+     *
      * Example:
      *   int sum = 0;
      *   auto visitor = [&](int y) { sum += y; };
@@ -249,14 +250,33 @@ namespace gtsam {
      * @brief Visit all leaves in depth-first fashion.
-     * 
-     * @param f side-effect taking an assignment and a value.
-     * 
-     * @note Due to pruning, leaves might not exhaust choices.
-     * 
+     *
+     * @param f (side-effect) Function taking the leaf node pointer.
+     *
+     * @note Due to pruning, the number of leaves may not be the same as the
+     * number of assignments. E.g. if we have a tree on 2 binary variables with
+     * all values being 1, then there are 2^2=4 assignments, but only 1 leaf.
+     *
      * Example:
      *   int sum = 0;
-     *   auto visitor = [&](const Assignment<L>& choices, int y) { sum += y; };
+     *   auto visitor = [&](int y) { sum += y; };
+     *   tree.visitWith(visitor);
+     */
+    template <typename Func>
+    void visitLeaf(Func f) const;
+    /**
+     * @brief Visit all leaves in depth-first fashion.
+     *
+     * @param f (side-effect) Function taking an assignment and a value.
+     *
+     * @note Due to pruning, the number of leaves may not be the same as the
+     * number of assignments. E.g. if we have a tree on 2 binary variables with
+     * all values being 1, then there are 2^2=4 assignments, but only 1 leaf.
+     *
+     * Example:
+     *   int sum = 0;
+     *   auto visitor = [&](const Assignment<L>& assignment, int y) { sum += y; };
      *   tree.visitWith(visitor);
     template <typename Func>
@@ -275,7 +295,7 @@ namespace gtsam {
      * @note X is always passed by value.
      * @note Due to pruning, leaves might not exhaust choices.
-     * 
+     *
      * Example:
      *   auto add = [](const double& y, double x) { return y + x; };
      *   double sum = tree.fold(add, 0.0);
diff --git a/gtsam/discrete/DecisionTreeFactor.cpp b/gtsam/discrete/DecisionTreeFactor.cpp
index e95b8fe374..4e16fc689e 100644
--- a/gtsam/discrete/DecisionTreeFactor.cpp
+++ b/gtsam/discrete/DecisionTreeFactor.cpp
@@ -286,5 +286,43 @@ namespace gtsam {
         AlgebraicDecisionTree<Key>(keys, table),
         cardinalities_(keys.cardinalities()) {}
+  /* ************************************************************************ */
+  DecisionTreeFactor DecisionTreeFactor::prune(size_t maxNrAssignments) const {
+    const size_t N = maxNrAssignments;
+    // Get the probabilities in the decision tree so we can threshold.
+    std::vector<double> probabilities;
+    this->visitLeaf([&](const Leaf& leaf) {
+      size_t nrAssignments = leaf.nrAssignments();
+      double prob = leaf.constant();
+      probabilities.insert(probabilities.end(), nrAssignments, prob);
+    });
+    // The number of probabilities can be lower than max_leaves
+    if (probabilities.size() <= N) {
+      return *this;
+    }
+    std::sort(probabilities.begin(), probabilities.end(),
+              std::greater<double>{});
+    double threshold = probabilities[N - 1];
+    // Now threshold the decision tree
+    size_t total = 0;
+    auto thresholdFunc = [threshold, &total, N](const double& value) {
+      if (value < threshold || total >= N) {
+        return 0.0;
+      } else {
+        total += 1;
+        return value;
+      }
+    };
+    DecisionTree<Key, double> thresholded(*this, thresholdFunc);
+    // Create pruned decision tree factor and return.
+    return DecisionTreeFactor(this->discreteKeys(), thresholded);
+  }
   /* ************************************************************************ */
 }  // namespace gtsam
diff --git a/gtsam/discrete/DecisionTreeFactor.h b/gtsam/discrete/DecisionTreeFactor.h
index 91fa7c4849..86fa446498 100644
--- a/gtsam/discrete/DecisionTreeFactor.h
+++ b/gtsam/discrete/DecisionTreeFactor.h
@@ -170,6 +170,26 @@ namespace gtsam {
     /// Return all the discrete keys associated with this factor.
     DiscreteKeys discreteKeys() const;
+    /**
+     * @brief Prune the decision tree of discrete variables.
+     *
+     * Pruning will set the leaves to be "pruned" to 0 indicating a 0
+     * probability. An assignment is pruned if it is not in the top
+     * `maxNrAssignments` values.
+     *
+     * A violation can occur if there are more
+     * duplicate values than `maxNrAssignments`. A violation here is the need to
+     * un-prune the decision tree (e.g. all assignment values are 1.0). We could
+     * have another case where some subset of duplicates exist (e.g. for a tree
+     * with 8 assignments we have 1, 1, 1, 1, 0.8, 0.7, 0.6, 0.5), but this is
+     * not a violation since the for `maxNrAssignments=5` the top values are (1,
+     * 0.8).
+     *
+     * @param maxNrAssignments The maximum number of assignments to keep.
+     * @return DecisionTreeFactor
+     */
+    DecisionTreeFactor prune(size_t maxNrAssignments) const;
     /// @}
     /// @name Wrapper support
     /// @{
diff --git a/gtsam/discrete/tests/testAlgebraicDecisionTree.cpp b/gtsam/discrete/tests/testAlgebraicDecisionTree.cpp
index c800321d63..6a3fb23884 100644
--- a/gtsam/discrete/tests/testAlgebraicDecisionTree.cpp
+++ b/gtsam/discrete/tests/testAlgebraicDecisionTree.cpp
@@ -20,7 +20,7 @@
 #include <gtsam/discrete/DiscreteKey.h>  // make sure we have traits
 #include <gtsam/discrete/DiscreteValues.h>
 // headers first to make sure no missing headers
-//#define DT_NO_PRUNING
 #include <gtsam/discrete/AlgebraicDecisionTree.h>
 #include <gtsam/discrete/DecisionTree-inl.h>  // for convert only
diff --git a/gtsam/discrete/tests/testDecisionTree.cpp b/gtsam/discrete/tests/testDecisionTree.cpp
index f234905e33..5ccbcf9162 100644
--- a/gtsam/discrete/tests/testDecisionTree.cpp
+++ b/gtsam/discrete/tests/testDecisionTree.cpp
@@ -18,7 +18,7 @@
 // #define DT_DEBUG_MEMORY
-// #define DT_NO_PRUNING
 #define DISABLE_DOT
 #include <gtsam/discrete/DecisionTree-inl.h>
@@ -323,6 +323,49 @@ TEST(DecisionTree, Containers) {
   StringContainerTree converted(stringIntTree, container_of_int);
+/* ************************************************************************** */
+// Test nrAssignments.
+TEST(DecisionTree, NrAssignments) {
+  pair<string, size_t> A("A", 2), B("B", 2), C("C", 2);
+  DT tree({A, B, C}, "1 1 1 1 1 1 1 1");
+  EXPECT(tree.root_->isLeaf());
+  auto leaf = boost::dynamic_pointer_cast<const DT::Leaf>(tree.root_);
+  EXPECT_LONGS_EQUAL(8, leaf->nrAssignments());
+  DT tree2({C, B, A}, "1 1 1 2 3 4 5 5");
+  /* The tree is
+    Choice(C) 
+    0 Choice(B) 
+    0 0 Leaf 1
+    0 1 Choice(A) 
+    0 1 0 Leaf 1
+    0 1 1 Leaf 2
+    1 Choice(B) 
+    1 0 Choice(A) 
+    1 0 0 Leaf 3
+    1 0 1 Leaf 4
+    1 1 Leaf 5
+  */
+  auto root = boost::dynamic_pointer_cast<const DT::Choice>(tree2.root_);
+  CHECK(root);
+  auto choice0 = boost::dynamic_pointer_cast<const DT::Choice>(root->branches()[0]);
+  CHECK(choice0);
+  EXPECT(choice0->branches()[0]->isLeaf());
+  auto choice00 = boost::dynamic_pointer_cast<const DT::Leaf>(choice0->branches()[0]);
+  CHECK(choice00);
+  EXPECT_LONGS_EQUAL(2, choice00->nrAssignments());
+  auto choice1 = boost::dynamic_pointer_cast<const DT::Choice>(root->branches()[1]);
+  CHECK(choice1);
+  auto choice10 = boost::dynamic_pointer_cast<const DT::Choice>(choice1->branches()[0]);
+  CHECK(choice10);
+  auto choice11 = boost::dynamic_pointer_cast<const DT::Leaf>(choice1->branches()[1]);
+  CHECK(choice11);
+  EXPECT(choice11->isLeaf());
+  EXPECT_LONGS_EQUAL(2, choice11->nrAssignments());
 /* ************************************************************************** */
 // Test visit.
 TEST(DecisionTree, visit) {
diff --git a/gtsam/discrete/tests/testDecisionTreeFactor.cpp b/gtsam/discrete/tests/testDecisionTreeFactor.cpp
index 846653c383..84e45a0f54 100644
--- a/gtsam/discrete/tests/testDecisionTreeFactor.cpp
+++ b/gtsam/discrete/tests/testDecisionTreeFactor.cpp
@@ -106,13 +106,48 @@ TEST(DecisionTreeFactor, enumerate) {
   EXPECT(actual == expected);
+/* ************************************************************************* */
+// Check pruning of the decision tree works as expected.
+TEST(DecisionTreeFactor, Prune) {
+  DiscreteKey A(1, 2), B(2, 2), C(3, 2);
+  DecisionTreeFactor f(A & B & C, "1 5 3 7 2 6 4 8");
+  // Only keep the leaves with the top 5 values.
+  size_t maxNrAssignments = 5;
+  auto pruned5 = f.prune(maxNrAssignments);
+  // Pruned leaves should be 0
+  DecisionTreeFactor expected(A & B & C, "0 5 0 7 0 6 4 8");
+  EXPECT(assert_equal(expected, pruned5));
+  // Check for more extreme pruning where we only keep the top 2 leaves
+  maxNrAssignments = 2;
+  auto pruned2 = f.prune(maxNrAssignments);
+  DecisionTreeFactor expected2(A & B & C, "0 0 0 7 0 0 0 8");
+  EXPECT(assert_equal(expected2, pruned2));
+  DiscreteKey D(4, 2);
+  DecisionTreeFactor factor(
+      D & C & B & A,
+      "0.0 0.0 0.0 0.60658897 0.61241912 0.61241969 0.61247685 0.61247742 0.0 "
+      "0.0 0.0 0.99995287 1.0 1.0 1.0 1.0");
+  DecisionTreeFactor expected3(
+      D & C & B & A,
+      "0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 "
+      "0.999952870000 1.0 1.0 1.0 1.0");
+  maxNrAssignments = 5;
+  auto pruned3 = factor.prune(maxNrAssignments);
+  EXPECT(assert_equal(expected3, pruned3));
 /* ************************************************************************* */
 TEST(DecisionTreeFactor, DotWithNames) {
   DiscreteKey A(12, 3), B(5, 2);
   DecisionTreeFactor f(A & B, "1 2  3 4  5 6");
   auto formatter = [](Key key) { return key == 12 ? "A" : "B"; };
-  for (bool showZero:{true, false}) {  
+  for (bool showZero:{true, false}) {
     string actual = f.dot(formatter, showZero);
     // pretty weak test, as ids are pointers and not stable across platforms.
     string expected = "digraph G {";
@@ -194,4 +229,3 @@ int main() {
   return TestRegistry::runAllTests(tr);
 /* ************************************************************************* */
diff --git a/gtsam/discrete/tests/testDiscreteFactorGraph.cpp b/gtsam/discrete/tests/testDiscreteFactorGraph.cpp
index 0a7d869ec5..3d9621affa 100644
--- a/gtsam/discrete/tests/testDiscreteFactorGraph.cpp
+++ b/gtsam/discrete/tests/testDiscreteFactorGraph.cpp
@@ -415,16 +415,16 @@ TEST(DiscreteFactorGraph, DotWithNames) {
       "graph {\n"
       "  size=\"5,5\";\n"
-      "  varC[label=\"C\"];\n"
-      "  varA[label=\"A\"];\n"
-      "  varB[label=\"B\"];\n"
+      "  var0[label=\"C\"];\n"
+      "  var1[label=\"A\"];\n"
+      "  var2[label=\"B\"];\n"
       "  factor0[label=\"\", shape=point];\n"
-      "  varC--factor0;\n"
-      "  varA--factor0;\n"
+      "  var0--factor0;\n"
+      "  var1--factor0;\n"
       "  factor1[label=\"\", shape=point];\n"
-      "  varC--factor1;\n"
-      "  varB--factor1;\n"
+      "  var0--factor1;\n"
+      "  var2--factor1;\n"
   EXPECT(actual == expected);
diff --git a/gtsam/geometry/Point2.cpp b/gtsam/geometry/Point2.cpp
index d8060cfcfd..06c32526b0 100644
--- a/gtsam/geometry/Point2.cpp
+++ b/gtsam/geometry/Point2.cpp
@@ -113,6 +113,18 @@ list<Point2> circleCircleIntersection(Point2 c1, double r1, Point2 c2,
   return circleCircleIntersection(c1, c2, fh);
+Point2Pair means(const std::vector<Point2Pair> &abPointPairs) {
+  const size_t n = abPointPairs.size();
+  if (n == 0) throw std::invalid_argument("Point2::mean input Point2Pair vector is empty");
+  Point2 aSum(0, 0), bSum(0, 0);
+  for (const Point2Pair &abPair : abPointPairs) {
+    aSum += abPair.first;
+    bSum += abPair.second;
+  }
+  const double f = 1.0 / n;
+  return {aSum * f, bSum * f};
 /* ************************************************************************* */
 ostream &operator<<(ostream &os, const gtsam::Point2Pair &p) {
   os << p.first << " <-> " << p.second;
diff --git a/gtsam/geometry/Point2.h b/gtsam/geometry/Point2.h
index cdb9f44809..d8b6daca80 100644
--- a/gtsam/geometry/Point2.h
+++ b/gtsam/geometry/Point2.h
@@ -71,6 +71,9 @@ GTSAM_EXPORT boost::optional<Point2> circleCircleIntersection(double R_d, double
  * @return list of solutions (0,1, or 2). Identical circles will return empty list, as well.
 GTSAM_EXPORT std::list<Point2> circleCircleIntersection(Point2 c1, Point2 c2, boost::optional<Point2> fh);
+/// Calculate the two means of a set of Point2 pairs
+GTSAM_EXPORT Point2Pair means(const std::vector<Point2Pair> &abPointPairs);
  * @brief Intersect 2 circles
diff --git a/gtsam/geometry/Pose2.cpp b/gtsam/geometry/Pose2.cpp
index cc7f8e474e..b37674b925 100644
--- a/gtsam/geometry/Pose2.cpp
+++ b/gtsam/geometry/Pose2.cpp
@@ -309,54 +309,77 @@ double Pose2::range(const Pose2& pose,
 /* *************************************************************************
- * New explanation, from scan.ml
- * It finds the angle using a linear method:
- * q = Pose2::transformFrom(p) = t + R*p
+ * Align finds the angle using a linear method:
+ * a = Pose2::transformFrom(b) = t + R*b
  * We need to remove the centroids from the data to find the rotation
- * using dp=[dpx;dpy] and q=[dqx;dqy] we have
- *  |dqx|   |c  -s|     |dpx|     |dpx -dpy|     |c|
+ * using db=[dbx;dby] and a=[dax;day] we have
+ *  |dax|   |c  -s|     |dbx|     |dbx -dby|     |c|
  *  |   | = |     |  *  |   |  =  |        |  *  | | = H_i*cs
- *  |dqy|   |s   c|     |dpy|     |dpy  dpx|     |s|
+ *  |day|   |s   c|     |dby|     |dby  dbx|     |s|
  * where the Hi are the 2*2 matrices. Then we will minimize the criterion
- * J = \sum_i norm(q_i - H_i * cs)
+ * J = \sum_i norm(a_i - H_i * cs)
  * Taking the derivative with respect to cs and setting to zero we have
- * cs = (\sum_i H_i' * q_i)/(\sum H_i'*H_i)
+ * cs = (\sum_i H_i' * a_i)/(\sum H_i'*H_i)
  * The hessian is diagonal and just divides by a constant, but this
  * normalization constant is irrelevant, since we take atan2.
- * i.e., cos ~ sum(dpx*dqx + dpy*dqy) and sin ~ sum(-dpy*dqx + dpx*dqy)
+ * i.e., cos ~ sum(dbx*dax + dby*day) and sin ~ sum(-dby*dax + dbx*day)
  * The translation is then found from the centroids
- * as they also satisfy cq = t + R*cp, hence t = cq - R*cp
+ * as they also satisfy ca = t + R*cb, hence t = ca - R*cb
-boost::optional<Pose2> align(const vector<Point2Pair>& pairs) {
-  size_t n = pairs.size();
-  if (n<2) return boost::none; // we need at least two pairs
+boost::optional<Pose2> Pose2::Align(const Point2Pairs &ab_pairs) {
+  const size_t n = ab_pairs.size();
+  if (n < 2) {
+    return boost::none;  // we need at least 2 pairs
+  }
   // calculate centroids
-  Point2 cp(0,0), cq(0,0);
-  for(const Point2Pair& pair: pairs) {
-    cp += pair.first;
-    cq += pair.second;
+  Point2 ca(0, 0), cb(0, 0);
+  for (const Point2Pair& pair : ab_pairs) {
+    ca += pair.first;
+    cb += pair.second;
-  double f = 1.0/n;
-  cp *= f; cq *= f;
+  const double f = 1.0/n;
+  ca *= f;
+  cb *= f;
   // calculate cos and sin
-  double c=0,s=0;
-  for(const Point2Pair& pair: pairs) {
-    Point2 dp = pair.first - cp;
-    Point2 dq = pair.second - cq;
-    c += dp.x() * dq.x() + dp.y() * dq.y();
-    s += -dp.y() * dq.x() + dp.x() * dq.y();
+  double c = 0, s = 0;
+  for (const Point2Pair& pair : ab_pairs) {
+    Point2 da = pair.first - ca;
+    Point2 db = pair.second - cb;
+    c += db.x() * da.x() + db.y() * da.y();
+    s += -db.y() * da.x() + db.x() * da.y();
   // calculate angle and translation
-  double theta = atan2(s,c);
-  Rot2 R = Rot2::fromAngle(theta);
-  Point2 t = cq - R*cp;
+  const double theta = atan2(s, c);
+  const Rot2 R = Rot2::fromAngle(theta);
+  const Point2 t = ca - R*cb;
   return Pose2(R, t);
+boost::optional<Pose2> Pose2::Align(const Matrix& a, const Matrix& b) {
+  if (a.rows() != 2 || b.rows() != 2 || a.cols() != b.cols()) {
+    throw std::invalid_argument(
+      "Pose2:Align expects 2*N matrices of equal shape.");
+  }
+  Point2Pairs ab_pairs;
+  for (Eigen::Index j = 0; j < a.cols(); j++) {
+    ab_pairs.emplace_back(a.col(j), b.col(j));
+  }
+  return Pose2::Align(ab_pairs);
+boost::optional<Pose2> align(const Point2Pairs& ba_pairs) {
+  Point2Pairs ab_pairs;
+  for (const Point2Pair &baPair : ba_pairs) {
+    ab_pairs.emplace_back(baPair.second, baPair.first);
+  }
+  return Pose2::Align(ab_pairs);
 /* ************************************************************************* */
 } // namespace gtsam
diff --git a/gtsam/geometry/Pose2.h b/gtsam/geometry/Pose2.h
index 1e79836f5e..466c5a42ad 100644
--- a/gtsam/geometry/Pose2.h
+++ b/gtsam/geometry/Pose2.h
@@ -92,6 +92,18 @@ class Pose2: public LieGroup<Pose2, 3> {
     *this = Expmap(v);
+  /**
+   *  Create Pose2 by aligning two point pairs
+   *  A pose aTb is estimated between pairs (a_point, b_point) such that 
+   *    a_point = aTb * b_point
+   *  Note this allows for noise on the points but in that case the mapping 
+   *  will not be exact.
+   */
+  static boost::optional<Pose2> Align(const Point2Pairs& abPointPairs);
+  // Version of Pose2::Align that takes 2 matrices.
+  static boost::optional<Pose2> Align(const Matrix& a, const Matrix& b);
   /// @}
   /// @name Testable
   /// @{
@@ -331,12 +343,19 @@ inline Matrix wedge<Pose2>(const Vector& xi) {
   return Matrix(Pose2::wedge(xi(0),xi(1),xi(2))).eval();
+ * @deprecated Use static constructor (with reversed pairs!)
  * Calculate pose between a vector of 2D point correspondences (p,q)
  * where q = Pose2::transformFrom(p) = t + R*p
-typedef std::pair<Point2,Point2> Point2Pair;
-GTSAM_EXPORT boost::optional<Pose2> align(const std::vector<Point2Pair>& pairs);
+GTSAM_EXPORT boost::optional<Pose2> 
+GTSAM_DEPRECATED align(const Point2Pairs& pairs);
+// Convenience typedef
+using Pose2Pair = std::pair<Pose2, Pose2>;
+using Pose2Pairs = std::vector<Pose2Pair>;
 template <>
 struct traits<Pose2> : public internal::LieGroup<Pose2> {};
diff --git a/gtsam/geometry/Pose3.cpp b/gtsam/geometry/Pose3.cpp
index 5369475976..2da51a6251 100644
--- a/gtsam/geometry/Pose3.cpp
+++ b/gtsam/geometry/Pose3.cpp
@@ -473,12 +473,13 @@ boost::optional<Pose3> Pose3::Align(const Matrix& a, const Matrix& b) {
       "Pose3:Align expects 3*N matrices of equal shape.");
   Point3Pairs abPointPairs;
-  for (size_t j=0; j < a.cols(); j++) {
+  for (Eigen::Index j = 0; j < a.cols(); j++) {
     abPointPairs.emplace_back(a.col(j), b.col(j));
   return Pose3::Align(abPointPairs);
 boost::optional<Pose3> align(const Point3Pairs &baPointPairs) {
   Point3Pairs abPointPairs;
   for (const Point3Pair &baPair : baPointPairs) {
@@ -486,6 +487,7 @@ boost::optional<Pose3> align(const Point3Pairs &baPointPairs) {
   return Pose3::Align(abPointPairs);
 /* ************************************************************************* */
 std::ostream &operator<<(std::ostream &os, const Pose3& pose) {
diff --git a/gtsam/geometry/Rot2.cpp b/gtsam/geometry/Rot2.cpp
index 283147e4cc..9bf631e50e 100644
--- a/gtsam/geometry/Rot2.cpp
+++ b/gtsam/geometry/Rot2.cpp
@@ -129,6 +129,19 @@ Rot2 Rot2::relativeBearing(const Point2& d, OptionalJacobian<1, 2> H) {
+/* ************************************************************************* */
+Rot2 Rot2::ClosestTo(const Matrix2& M) {
+  Eigen::JacobiSVD<Matrix2> svd(M, Eigen::ComputeFullU | Eigen::ComputeFullV);
+  const Matrix2& U = svd.matrixU();
+  const Matrix2& V = svd.matrixV();
+  const double det = (U * V.transpose()).determinant();
+  Matrix2 M_prime = (U * Vector2(1, det).asDiagonal() * V.transpose());
+  double c = M_prime(0, 0);
+  double s = M_prime(1, 0);
+  return Rot2::fromCosSin(c, s);
 /* ************************************************************************* */
 } // gtsam
diff --git a/gtsam/geometry/Rot2.h b/gtsam/geometry/Rot2.h
index ec30c66576..2690ca2481 100644
--- a/gtsam/geometry/Rot2.h
+++ b/gtsam/geometry/Rot2.h
@@ -14,6 +14,7 @@
  * @brief 2D rotation
  * @date Dec 9, 2009
  * @author Frank Dellaert
+ * @author John Lambert
 #pragma once
@@ -209,6 +210,9 @@ namespace gtsam {
     /** return 2*2 transpose (inverse) rotation matrix   */
     Matrix2 transpose() const;
+    /** Find closest valid rotation matrix, given a 2x2 matrix */
+    static Rot2 ClosestTo(const Matrix2& M);
     /** Serialization function */
     friend class boost::serialization::access;
diff --git a/gtsam/geometry/Similarity2.cpp b/gtsam/geometry/Similarity2.cpp
new file mode 100644
index 0000000000..4ed3351f8b
--- /dev/null
+++ b/gtsam/geometry/Similarity2.cpp
@@ -0,0 +1,242 @@
+/* ----------------------------------------------------------------------------
+ * GTSAM Copyright 2010, Georgia Tech Research Corporation,
+ * Atlanta, Georgia 30332-0415
+ * All Rights Reserved
+ * Authors: Frank Dellaert, et al. (see THANKS for the full author list)
+ * See LICENSE for the license information
+ * -------------------------------------------------------------------------- */
+ * @file   Similarity2.cpp
+ * @brief  Implementation of Similarity2 transform
+ * @author John Lambert, Varun Agrawal
+ */
+#include <gtsam/base/Manifold.h>
+#include <gtsam/geometry/Pose2.h>
+#include <gtsam/geometry/Rot3.h>
+#include <gtsam/geometry/Similarity2.h>
+#include <gtsam/slam/KarcherMeanFactor-inl.h>
+namespace gtsam {
+using std::vector;
+namespace internal {
+/// Subtract centroids from point pairs.
+static Point2Pairs SubtractCentroids(const Point2Pairs& abPointPairs,
+                                     const Point2Pair& centroids) {
+  Point2Pairs d_abPointPairs;
+  for (const Point2Pair& abPair : abPointPairs) {
+    Point2 da = abPair.first - centroids.first;
+    Point2 db = abPair.second - centroids.second;
+    d_abPointPairs.emplace_back(da, db);
+  }
+  return d_abPointPairs;
+/// Form inner products x and y and calculate scale.
+static double CalculateScale(const Point2Pairs& d_abPointPairs,
+                             const Rot2& aRb) {
+  double x = 0, y = 0;
+  Point2 da, db;
+  for (const Point2Pair& d_abPair : d_abPointPairs) {
+    std::tie(da, db) = d_abPair;
+    const Vector2 da_prime = aRb * db;
+    y += da.transpose() * da_prime;
+    x += da_prime.transpose() * da_prime;
+  }
+  const double s = y / x;
+  return s;
+/// Form outer product H.
+static Matrix2 CalculateH(const Point2Pairs& d_abPointPairs) {
+  Matrix2 H = Z_2x2;
+  for (const Point2Pair& d_abPair : d_abPointPairs) {
+    H += d_abPair.first * d_abPair.second.transpose();
+  }
+  return H;
+ * @brief This method estimates the similarity transform from differences point
+ * pairs, given a known or estimated rotation and point centroids.
+ *
+ * @param d_abPointPairs
+ * @param aRb
+ * @param centroids
+ * @return Similarity2
+ */
+static Similarity2 Align(const Point2Pairs& d_abPointPairs, const Rot2& aRb,
+                         const Point2Pair& centroids) {
+  const double s = CalculateScale(d_abPointPairs, aRb);
+  // dividing aTb by s is required because the registration cost function
+  // minimizes ||a - sRb - t||, whereas Sim(2) computes s(Rb + t)
+  const Point2 aTb = (centroids.first - s * (aRb * centroids.second)) / s;
+  return Similarity2(aRb, aTb, s);
+ * @brief This method estimates the similarity transform from point pairs,
+ * given a known or estimated rotation.
+ * Refer to:
+ * http://www5.informatik.uni-erlangen.de/Forschung/Publikationen/2005/Zinsser05-PSR.pdf
+ * Chapter 3
+ *
+ * @param abPointPairs
+ * @param aRb
+ * @return Similarity2
+ */
+static Similarity2 AlignGivenR(const Point2Pairs& abPointPairs,
+                               const Rot2& aRb) {
+  auto centroids = means(abPointPairs);
+  auto d_abPointPairs = internal::SubtractCentroids(abPointPairs, centroids);
+  return internal::Align(d_abPointPairs, aRb, centroids);
+}  // namespace internal
+Similarity2::Similarity2() : t_(0, 0), s_(1) {}
+Similarity2::Similarity2(double s) : t_(0, 0), s_(s) {}
+Similarity2::Similarity2(const Rot2& R, const Point2& t, double s)
+    : R_(R), t_(t), s_(s) {}
+Similarity2::Similarity2(const Matrix2& R, const Vector2& t, double s)
+    : R_(Rot2::ClosestTo(R)), t_(t), s_(s) {}
+Similarity2::Similarity2(const Matrix3& T)
+    : R_(Rot2::ClosestTo(T.topLeftCorner<2, 2>())),
+      t_(T.topRightCorner<2, 1>()),
+      s_(1.0 / T(2, 2)) {}
+bool Similarity2::equals(const Similarity2& other, double tol) const {
+  return R_.equals(other.R_, tol) &&
+         traits<Point2>::Equals(t_, other.t_, tol) && s_ < (other.s_ + tol) &&
+         s_ > (other.s_ - tol);
+bool Similarity2::operator==(const Similarity2& other) const {
+  return R_.matrix() == other.R_.matrix() && t_ == other.t_ && s_ == other.s_;
+void Similarity2::print(const std::string& s) const {
+  std::cout << std::endl;
+  std::cout << s;
+  rotation().print("\nR:\n");
+  std::cout << "t: " << translation().transpose() << " s: " << scale()
+            << std::endl;
+Similarity2 Similarity2::identity() { return Similarity2(); }
+Similarity2 Similarity2::operator*(const Similarity2& S) const {
+  return Similarity2(R_ * S.R_, ((1.0 / S.s_) * t_) + R_ * S.t_, s_ * S.s_);
+Similarity2 Similarity2::inverse() const {
+  const Rot2 Rt = R_.inverse();
+  const Point2 sRt = Rt * (-s_ * t_);
+  return Similarity2(Rt, sRt, 1.0 / s_);
+Point2 Similarity2::transformFrom(const Point2& p) const {
+  const Point2 q = R_ * p + t_;
+  return s_ * q;
+Pose2 Similarity2::transformFrom(const Pose2& T) const {
+  Rot2 R = R_.compose(T.rotation());
+  Point2 t = Point2(s_ * (R_ * T.translation() + t_));
+  return Pose2(R, t);
+Point2 Similarity2::operator*(const Point2& p) const {
+  return transformFrom(p);
+Similarity2 Similarity2::Align(const Point2Pairs& abPointPairs) {
+  // Refer to Chapter 3 of
+  // http://www5.informatik.uni-erlangen.de/Forschung/Publikationen/2005/Zinsser05-PSR.pdf
+  if (abPointPairs.size() < 2)
+    throw std::runtime_error("input should have at least 2 pairs of points");
+  auto centroids = means(abPointPairs);
+  auto d_abPointPairs = internal::SubtractCentroids(abPointPairs, centroids);
+  Matrix2 H = internal::CalculateH(d_abPointPairs);
+  // ClosestTo finds rotation matrix closest to H in Frobenius sense
+  Rot2 aRb = Rot2::ClosestTo(H);
+  return internal::Align(d_abPointPairs, aRb, centroids);
+Similarity2 Similarity2::Align(const Pose2Pairs& abPosePairs) {
+  const size_t n = abPosePairs.size();
+  if (n < 2)
+    throw std::runtime_error("input should have at least 2 pairs of poses");
+  // calculate rotation
+  vector<Rot2> rotations;
+  Point2Pairs abPointPairs;
+  rotations.reserve(n);
+  abPointPairs.reserve(n);
+  // Below denotes the pose of the i'th object/camera/etc
+  // in frame "a" or frame "b".
+  Pose2 aTi, bTi;
+  for (const Pose2Pair& abPair : abPosePairs) {
+    std::tie(aTi, bTi) = abPair;
+    const Rot2 aRb = aTi.rotation().compose(bTi.rotation().inverse());
+    rotations.emplace_back(aRb);
+    abPointPairs.emplace_back(aTi.translation(), bTi.translation());
+  }
+  const Rot2 aRb_estimate = FindKarcherMean<Rot2>(rotations);
+  return internal::AlignGivenR(abPointPairs, aRb_estimate);
+Vector4 Similarity2::Logmap(const Similarity2& S,  //
+                            OptionalJacobian<4, 4> Hm) {
+  const Vector2 u = S.t_;
+  const Vector1 w = Rot2::Logmap(S.R_);
+  const double s = log(S.s_);
+  Vector4 result;
+  result << u, w, s;
+  if (Hm) {
+    throw std::runtime_error("Similarity2::Logmap: derivative not implemented");
+  }
+  return result;
+Similarity2 Similarity2::Expmap(const Vector4& v,  //
+                                OptionalJacobian<4, 4> Hm) {
+  const Vector2 t = v.head<2>();
+  const Rot2 R = Rot2::Expmap(v.segment<1>(2));
+  const double s = v[3];
+  if (Hm) {
+    throw std::runtime_error("Similarity2::Expmap: derivative not implemented");
+  }
+  return Similarity2(R, t, s);
+Matrix4 Similarity2::AdjointMap() const {
+  throw std::runtime_error("Similarity2::AdjointMap not implemented");
+std::ostream& operator<<(std::ostream& os, const Similarity2& p) {
+  os << "[" << p.rotation().theta() << " " << p.translation().transpose() << " "
+     << p.scale() << "]\';";
+  return os;
+Matrix3 Similarity2::matrix() const {
+  Matrix3 T;
+  T.topRows<2>() << R_.matrix(), t_;
+  T.bottomRows<1>() << 0, 0, 1.0 / s_;
+  return T;
+}  // namespace gtsam
diff --git a/gtsam/geometry/Similarity2.h b/gtsam/geometry/Similarity2.h
new file mode 100644
index 0000000000..05f10d1493
--- /dev/null
+++ b/gtsam/geometry/Similarity2.h
@@ -0,0 +1,200 @@
+/* ----------------------------------------------------------------------------
+ * GTSAM Copyright 2010, Georgia Tech Research Corporation,
+ * Atlanta, Georgia 30332-0415
+ * All Rights Reserved
+ * Authors: Frank Dellaert, et al. (see THANKS for the full author list)
+ * See LICENSE for the license information
+ * -------------------------------------------------------------------------- */
+ * @file   Similarity2.h
+ * @brief  Implementation of Similarity2 transform
+ * @author John Lambert, Varun Agrawal
+ */
+#pragma once
+#include <gtsam/base/Lie.h>
+#include <gtsam/base/Manifold.h>
+#include <gtsam/dllexport.h>
+#include <gtsam/geometry/Point2.h>
+#include <gtsam/geometry/Pose2.h>
+#include <gtsam/geometry/Rot2.h>
+namespace gtsam {
+// Forward declarations
+class Pose2;
+ * 2D similarity transform
+ */
+class GTSAM_EXPORT Similarity2 : public LieGroup<Similarity2, 4> {
+  /// @name Pose Concept
+  /// @{
+  typedef Rot2 Rotation;
+  typedef Point2 Translation;
+  /// @}
+ private:
+  Rot2 R_;
+  Point2 t_;
+  double s_;
+ public:
+  /// @name Constructors
+  /// @{
+  /// Default constructor
+  Similarity2();
+  /// Construct pure scaling
+  Similarity2(double s);
+  /// Construct from GTSAM types
+  Similarity2(const Rot2& R, const Point2& t, double s);
+  /// Construct from Eigen types
+  Similarity2(const Matrix2& R, const Vector2& t, double s);
+  /// Construct from matrix [R t; 0 s^-1]
+  Similarity2(const Matrix3& T);
+  /// @}
+  /// @name Testable
+  /// @{
+  /// Compare with tolerance
+  bool equals(const Similarity2& sim, double tol) const;
+  /// Exact equality
+  bool operator==(const Similarity2& other) const;
+  /// Print with optional string
+  void print(const std::string& s) const;
+  friend std::ostream& operator<<(std::ostream& os, const Similarity2& p);
+  /// @}
+  /// @name Group
+  /// @{
+  /// Return an identity transform
+  static Similarity2 identity();
+  /// Composition
+  Similarity2 operator*(const Similarity2& S) const;
+  /// Return the inverse
+  Similarity2 inverse() const;
+  /// @}
+  /// @name Group action on Point2
+  /// @{
+  /// Action on a point p is s*(R*p+t)
+  Point2 transformFrom(const Point2& p) const;
+  /**
+   * Action on a pose T.
+   * |Rs  ts|   |R t|   |Rs*R Rs*t+ts|
+   * |0  1/s| * |0 1| = | 0      1/s |, the result is still a Sim2 object.
+   * To retrieve a Pose2, we normalized the scale value into 1.
+   * |Rs*R Rs*t+ts|   |Rs*R s(Rs*t+ts)|
+   * | 0      1/s | = |  0       1    |
+   *
+   * This group action satisfies the compatibility condition.
+   * For more details, refer to: https://en.wikipedia.org/wiki/Group_action
+   */
+  Pose2 transformFrom(const Pose2& T) const;
+  /* syntactic sugar for transformFrom */
+  Point2 operator*(const Point2& p) const;
+  /**
+   *  Create Similarity2 by aligning at least two point pairs
+   */
+  static Similarity2 Align(const Point2Pairs& abPointPairs);
+  /**
+   * Create the Similarity2 object that aligns at least two pose pairs.
+   * Each pair is of the form (aTi, bTi).
+   * Given a list of pairs in frame a, and a list of pairs in frame b,
+   Align()
+   * will compute the best-fit Similarity2 aSb transformation to align them.
+   * First, the rotation aRb will be computed as the average (Karcher mean)
+   of
+   * many estimates aRb (from each pair). Afterwards, the scale factor will
+   be computed
+   * using the algorithm described here:
+   * http://www5.informatik.uni-erlangen.de/Forschung/Publikationen/2005/Zinsser05-PSR.pdf
+   */
+  static Similarity2 Align(const std::vector<Pose2Pair>& abPosePairs);
+  /// @}
+  /// @name Lie Group
+  /// @{
+  /**
+   * Log map at the identity
+   * \f$ [t_x, t_y, \delta, \lambda] \f$
+   */
+  static Vector4 Logmap(const Similarity2& S,  //
+                        OptionalJacobian<4, 4> Hm = boost::none);
+  /// Exponential map at the identity
+  static Similarity2 Expmap(const Vector4& v,  //
+                            OptionalJacobian<4, 4> Hm = boost::none);
+  /// Chart at the origin
+  struct ChartAtOrigin {
+    static Similarity2 Retract(const Vector4& v,
+                               ChartJacobian H = boost::none) {
+      return Similarity2::Expmap(v, H);
+    }
+    static Vector4 Local(const Similarity2& other,
+                         ChartJacobian H = boost::none) {
+      return Similarity2::Logmap(other, H);
+    }
+  };
+  /// Project from one tangent space to another
+  Matrix4 AdjointMap() const;
+  using LieGroup<Similarity2, 4>::inverse;
+  /// @}
+  /// @name Standard interface
+  /// @{
+  /// Calculate 4*4 matrix group equivalent
+  Matrix3 matrix() const;
+  /// Return a GTSAM rotation
+  Rot2 rotation() const { return R_; }
+  /// Return a GTSAM translation
+  Point2 translation() const { return t_; }
+  /// Return the scale
+  double scale() const { return s_; }
+  /// Dimensionality of tangent space = 4 DOF - used to autodetect sizes
+  inline static size_t Dim() { return 4; }
+  /// Dimensionality of tangent space = 4 DOF
+  inline size_t dim() const { return 4; }
+  /// @}
+template <>
+struct traits<Similarity2> : public internal::LieGroup<Similarity2> {};
+template <>
+struct traits<const Similarity2> : public internal::LieGroup<Similarity2> {};
+}  // namespace gtsam
diff --git a/gtsam/geometry/Similarity3.cpp b/gtsam/geometry/Similarity3.cpp
index e8d6e75106..7fde974c55 100644
--- a/gtsam/geometry/Similarity3.cpp
+++ b/gtsam/geometry/Similarity3.cpp
@@ -26,7 +26,7 @@ namespace gtsam {
 using std::vector;
-namespace {
+namespace internal {
 /// Subtract centroids from point pairs.
 static Point3Pairs subtractCentroids(const Point3Pairs &abPointPairs,
                                     const Point3Pair &centroids) {
@@ -81,10 +81,10 @@ static Similarity3 align(const Point3Pairs &d_abPointPairs, const Rot3 &aRb,
 static Similarity3 alignGivenR(const Point3Pairs &abPointPairs,
                                const Rot3 &aRb) {
   auto centroids = means(abPointPairs);
-  auto d_abPointPairs = subtractCentroids(abPointPairs, centroids);
+  auto d_abPointPairs = internal::subtractCentroids(abPointPairs, centroids);
   return align(d_abPointPairs, aRb, centroids);
-}  // namespace
+}  // namespace internal
 Similarity3::Similarity3() :
     t_(0,0,0), s_(1) {
@@ -165,11 +165,11 @@ Similarity3 Similarity3::Align(const Point3Pairs &abPointPairs) {
   if (abPointPairs.size() < 3)
     throw std::runtime_error("input should have at least 3 pairs of points");
   auto centroids = means(abPointPairs);
-  auto d_abPointPairs = subtractCentroids(abPointPairs, centroids);
-  Matrix3 H = calculateH(d_abPointPairs);
+  auto d_abPointPairs = internal::subtractCentroids(abPointPairs, centroids);
+  Matrix3 H = internal::calculateH(d_abPointPairs);
   // ClosestTo finds rotation matrix closest to H in Frobenius sense
   Rot3 aRb = Rot3::ClosestTo(H);
-  return align(d_abPointPairs, aRb, centroids);
+  return internal::align(d_abPointPairs, aRb, centroids);
 Similarity3 Similarity3::Align(const vector<Pose3Pair> &abPosePairs) {
@@ -192,7 +192,7 @@ Similarity3 Similarity3::Align(const vector<Pose3Pair> &abPosePairs) {
   const Rot3 aRb_estimate = FindKarcherMean<Rot3>(rotations);
-  return alignGivenR(abPointPairs, aRb_estimate);
+  return internal::alignGivenR(abPointPairs, aRb_estimate);
 Matrix4 Similarity3::wedge(const Vector7 &xi) {
@@ -283,15 +283,11 @@ std::ostream &operator<<(std::ostream &os, const Similarity3& p) {
   return os;
-const Matrix4 Similarity3::matrix() const {
+Matrix4 Similarity3::matrix() const {
   Matrix4 T;
   T.topRows<3>() << R_.matrix(), t_;
   T.bottomRows<1>() << 0, 0, 0, 1.0 / s_;
   return T;
-Similarity3::operator Pose3() const {
-  return Pose3(R_, s_ * t_);
 } // namespace gtsam
diff --git a/gtsam/geometry/Similarity3.h b/gtsam/geometry/Similarity3.h
index 0ef787b059..845d4c810d 100644
--- a/gtsam/geometry/Similarity3.h
+++ b/gtsam/geometry/Similarity3.h
@@ -18,13 +18,12 @@
 #pragma once
-#include <gtsam/geometry/Rot3.h>
-#include <gtsam/geometry/Point3.h>
-#include <gtsam/geometry/Pose3.h>
 #include <gtsam/base/Lie.h>
 #include <gtsam/base/Manifold.h>
 #include <gtsam/dllexport.h>
+#include <gtsam/geometry/Point3.h>
+#include <gtsam/geometry/Pose3.h>
+#include <gtsam/geometry/Rot3.h>
 namespace gtsam {
@@ -34,108 +33,106 @@ class Pose3;
  * 3D similarity transform
-class Similarity3: public LieGroup<Similarity3, 7> {
+class GTSAM_EXPORT Similarity3 : public LieGroup<Similarity3, 7> {
   /// @name Pose Concept
   /// @{
   typedef Rot3 Rotation;
   typedef Point3 Translation;
   /// @}
+ private:
   Rot3 R_;
   Point3 t_;
   double s_;
+ public:
   /// @name Constructors
   /// @{
   /// Default constructor
-  GTSAM_EXPORT Similarity3();
+  Similarity3();
   /// Construct pure scaling
-  GTSAM_EXPORT Similarity3(double s);
+  Similarity3(double s);
   /// Construct from GTSAM types
-  GTSAM_EXPORT Similarity3(const Rot3& R, const Point3& t, double s);
+  Similarity3(const Rot3& R, const Point3& t, double s);
   /// Construct from Eigen types
-  GTSAM_EXPORT Similarity3(const Matrix3& R, const Vector3& t, double s);
+  Similarity3(const Matrix3& R, const Vector3& t, double s);
   /// Construct from matrix [R t; 0 s^-1]
-  GTSAM_EXPORT Similarity3(const Matrix4& T);
+  Similarity3(const Matrix4& T);
   /// @}
   /// @name Testable
   /// @{
   /// Compare with tolerance
-  GTSAM_EXPORT bool equals(const Similarity3& sim, double tol) const;
+  bool equals(const Similarity3& sim, double tol) const;
   /// Exact equality
-  GTSAM_EXPORT bool operator==(const Similarity3& other) const;
+  bool operator==(const Similarity3& other) const;
   /// Print with optional string
-  GTSAM_EXPORT void print(const std::string& s) const;
+  void print(const std::string& s) const;
-  GTSAM_EXPORT friend std::ostream &operator<<(std::ostream &os, const Similarity3& p);
+  friend std::ostream& operator<<(std::ostream& os, const Similarity3& p);
   /// @}
   /// @name Group
   /// @{
   /// Return an identity transform
-  GTSAM_EXPORT static Similarity3 identity();
+  static Similarity3 identity();
   /// Composition
-  GTSAM_EXPORT Similarity3 operator*(const Similarity3& S) const;
+  Similarity3 operator*(const Similarity3& S) const;
   /// Return the inverse
-  GTSAM_EXPORT Similarity3 inverse() const;
+  Similarity3 inverse() const;
   /// @}
   /// @name Group action on Point3
   /// @{
   /// Action on a point p is s*(R*p+t)
-  GTSAM_EXPORT Point3 transformFrom(const Point3& p, //
-      OptionalJacobian<3, 7> H1 = boost::none, //
-      OptionalJacobian<3, 3> H2 = boost::none) const;
+  Point3 transformFrom(const Point3& p,                          //
+                       OptionalJacobian<3, 7> H1 = boost::none,  //
+                       OptionalJacobian<3, 3> H2 = boost::none) const;
-  /** 
+  /**
    * Action on a pose T.
-   * |Rs  ts|   |R t|   |Rs*R Rs*t+ts| 
+   * |Rs  ts|   |R t|   |Rs*R Rs*t+ts|
    * |0  1/s| * |0 1| = | 0      1/s |, the result is still a Sim3 object.
    * To retrieve a Pose3, we normalized the scale value into 1.
    * |Rs*R Rs*t+ts|   |Rs*R s(Rs*t+ts)|
    * | 0      1/s | = |  0       1    |
-   * 
-   * This group action satisfies the compatibility condition. 
+   *
+   * This group action satisfies the compatibility condition.
    * For more details, refer to: https://en.wikipedia.org/wiki/Group_action
-  GTSAM_EXPORT Pose3 transformFrom(const Pose3& T) const;
+  Pose3 transformFrom(const Pose3& T) const;
   /** syntactic sugar for transformFrom */
-  GTSAM_EXPORT Point3 operator*(const Point3& p) const;
+  Point3 operator*(const Point3& p) const;
    *  Create Similarity3 by aligning at least three point pairs
-  GTSAM_EXPORT static Similarity3 Align(const std::vector<Point3Pair>& abPointPairs);
+  static Similarity3 Align(const std::vector<Point3Pair>& abPointPairs);
    * Create the Similarity3 object that aligns at least two pose pairs.
    * Each pair is of the form (aTi, bTi).
    * Given a list of pairs in frame a, and a list of pairs in frame b, Align()
    * will compute the best-fit Similarity3 aSb transformation to align them.
    * First, the rotation aRb will be computed as the average (Karcher mean) of
-   * many estimates aRb (from each pair). Afterwards, the scale factor will be computed
-   * using the algorithm described here:
+   * many estimates aRb (from each pair). Afterwards, the scale factor will be
+   * computed using the algorithm described here:
    * http://www5.informatik.uni-erlangen.de/Forschung/Publikationen/2005/Zinsser05-PSR.pdf
-  GTSAM_EXPORT static Similarity3 Align(const std::vector<Pose3Pair>& abPosePairs);
+  static Similarity3 Align(const std::vector<Pose3Pair>& abPosePairs);
   /// @}
   /// @name Lie Group
@@ -144,20 +141,22 @@ class Similarity3: public LieGroup<Similarity3, 7> {
   /** Log map at the identity
    * \f$ [R_x,R_y,R_z, t_x, t_y, t_z, \lambda] \f$
-  GTSAM_EXPORT static Vector7 Logmap(const Similarity3& s, //
-      OptionalJacobian<7, 7> Hm = boost::none);
+  static Vector7 Logmap(const Similarity3& s,  //
+                        OptionalJacobian<7, 7> Hm = boost::none);
   /** Exponential map at the identity
-  GTSAM_EXPORT static Similarity3 Expmap(const Vector7& v, //
-      OptionalJacobian<7, 7> Hm = boost::none);
+  static Similarity3 Expmap(const Vector7& v,  //
+                            OptionalJacobian<7, 7> Hm = boost::none);
   /// Chart at the origin
   struct ChartAtOrigin {
-    static Similarity3 Retract(const Vector7& v, ChartJacobian H = boost::none) {
+    static Similarity3 Retract(const Vector7& v,
+                               ChartJacobian H = boost::none) {
       return Similarity3::Expmap(v, H);
-    static Vector7 Local(const Similarity3& other, ChartJacobian H = boost::none) {
+    static Vector7 Local(const Similarity3& other,
+                         ChartJacobian H = boost::none) {
       return Similarity3::Logmap(other, H);
@@ -170,67 +169,53 @@ class Similarity3: public LieGroup<Similarity3, 7> {
    * @return 4*4 element of Lie algebra that can be exponentiated
    * TODO(frank): rename to Hat, make part of traits
-  GTSAM_EXPORT static Matrix4 wedge(const Vector7& xi);
+  static Matrix4 wedge(const Vector7& xi);
   /// Project from one tangent space to another
-  GTSAM_EXPORT  Matrix7 AdjointMap() const;
+  Matrix7 AdjointMap() const;
   /// @}
   /// @name Standard interface
   /// @{
   /// Calculate 4*4 matrix group equivalent
-  GTSAM_EXPORT const Matrix4 matrix() const;
+  Matrix4 matrix() const;
   /// Return a GTSAM rotation
-  const Rot3& rotation() const {
-    return R_;
-  }
+  Rot3 rotation() const { return R_; }
   /// Return a GTSAM translation
-  const Point3& translation() const {
-    return t_;
-  }
+  Point3 translation() const { return t_; }
   /// Return the scale
-  double scale() const {
-    return s_;
-  }
-  /// Convert to a rigid body pose (R, s*t)
-  /// TODO(frank): why is this here? Red flag! Definitely don't have it as a cast.
-  GTSAM_EXPORT operator Pose3() const;
+  double scale() const { return s_; }
   /// Dimensionality of tangent space = 7 DOF - used to autodetect sizes
-  inline static size_t Dim() {
-    return 7;
-  }
+  inline static size_t Dim() { return 7; }
   /// Dimensionality of tangent space = 7 DOF
-  inline size_t dim() const {
-    return 7;
-  }
+  inline size_t dim() const { return 7; }
   /// @}
   /// @name Helper functions
   /// @{
+ private:
   /// Calculate expmap and logmap coefficients.
   static Matrix3 GetV(Vector3 w, double lambda);
   /// @}
+template <>
 inline Matrix wedge<Similarity3>(const Vector& xi) {
   return Similarity3::wedge(xi);
+template <>
 struct traits<Similarity3> : public internal::LieGroup<Similarity3> {};
+template <>
 struct traits<const Similarity3> : public internal::LieGroup<Similarity3> {};
-} // namespace gtsam
+}  // namespace gtsam
diff --git a/gtsam/geometry/geometry.i b/gtsam/geometry/geometry.i
index 5aeac37ec5..8e3c93224e 100644
--- a/gtsam/geometry/geometry.i
+++ b/gtsam/geometry/geometry.i
@@ -372,6 +372,9 @@ class Pose2 {
   Pose2(const gtsam::Rot2& r, const gtsam::Point2& t);
   Pose2(Vector v);
+  static boost::optional<gtsam::Pose2> Align(const gtsam::Point2Pairs& abPointPairs);
+  static boost::optional<gtsam::Pose2> Align(const gtsam::Matrix& a, const gtsam::Matrix& b);
   // Testable
   void print(string s = "") const;
   bool equals(const gtsam::Pose2& pose, double tol) const;
@@ -424,8 +427,6 @@ class Pose2 {
   void serialize() const;
-boost::optional<gtsam::Pose2> align(const gtsam::Point2Pairs& pairs);
 #include <gtsam/geometry/Pose3.h>
 class Pose3 {
   // Standard Constructors
@@ -546,6 +547,12 @@ class EssentialMatrix {
   // Standard Constructors
   EssentialMatrix(const gtsam::Rot3& aRb, const gtsam::Unit3& aTb);
+  // Constructors from Pose3
+  gtsam::EssentialMatrix FromPose3(const gtsam::Pose3& _1P2_);
+  gtsam::EssentialMatrix FromPose3(const gtsam::Pose3& _1P2_,
+                            Eigen::Ref<Eigen::MatrixXd> H);
   // Testable
   void print(string s = "") const;
   bool equals(const gtsam::EssentialMatrix& pose, double tol) const;
@@ -583,7 +590,13 @@ class Cal3_S2 {
   // Action on Point2
   gtsam::Point2 calibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 calibrate(const gtsam::Point2& p,
+                          Eigen::Ref<Eigen::MatrixXd> Dcal,
+                          Eigen::Ref<Eigen::MatrixXd> Dp) const;
   gtsam::Point2 uncalibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 uncalibrate(const gtsam::Point2& p,
+                            Eigen::Ref<Eigen::MatrixXd> Dcal,
+                            Eigen::Ref<Eigen::MatrixXd> Dp) const;
   // Standard Interface
   double fx() const;
@@ -622,7 +635,13 @@ virtual class Cal3DS2_Base {
   // Action on Point2
   gtsam::Point2 uncalibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 uncalibrate(const gtsam::Point2& p,
+                            Eigen::Ref<Eigen::MatrixXd> Dcal,
+                            Eigen::Ref<Eigen::MatrixXd> Dp) const;
   gtsam::Point2 calibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 calibrate(const gtsam::Point2& p,
+                          Eigen::Ref<Eigen::MatrixXd> Dcal,
+                          Eigen::Ref<Eigen::MatrixXd> Dp) const;
   // enabling serialization functionality
   void serialize() const;
@@ -679,7 +698,13 @@ virtual class Cal3Unified : gtsam::Cal3DS2_Base {
   // Note: the signature of this functions differ from the functions
   // with equal name in the base class.
   gtsam::Point2 calibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 calibrate(const gtsam::Point2& p,
+                          Eigen::Ref<Eigen::MatrixXd> Dcal,
+                          Eigen::Ref<Eigen::MatrixXd> Dp) const;
   gtsam::Point2 uncalibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 uncalibrate(const gtsam::Point2& p,
+                            Eigen::Ref<Eigen::MatrixXd> Dcal,
+                            Eigen::Ref<Eigen::MatrixXd> Dp) const;
   // enabling serialization functionality
   void serialize() const;
@@ -705,7 +730,13 @@ class Cal3Fisheye {
   // Action on Point2
   gtsam::Point2 calibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 calibrate(const gtsam::Point2& p,
+                          Eigen::Ref<Eigen::MatrixXd> Dcal,
+                          Eigen::Ref<Eigen::MatrixXd> Dp) const;
   gtsam::Point2 uncalibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 uncalibrate(const gtsam::Point2& p,
+                            Eigen::Ref<Eigen::MatrixXd> Dcal,
+                            Eigen::Ref<Eigen::MatrixXd> Dp) const;
   // Standard Interface
   double fx() const;
@@ -768,7 +799,13 @@ class Cal3Bundler {
   // Action on Point2
   gtsam::Point2 calibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 calibrate(const gtsam::Point2& p,
+                          Eigen::Ref<Eigen::MatrixXd> Dcal,
+                          Eigen::Ref<Eigen::MatrixXd> Dp) const;
   gtsam::Point2 uncalibrate(const gtsam::Point2& p) const;
+  gtsam::Point2 uncalibrate(const gtsam::Point2& p,
+                            Eigen::Ref<Eigen::MatrixXd> Dcal,
+                            Eigen::Ref<Eigen::MatrixXd> Dp) const;
   // Standard Interface
   double fx() const;
@@ -806,12 +843,25 @@ class CalibratedCamera {
   // Action on Point3
   gtsam::Point2 project(const gtsam::Point3& point) const;
+  gtsam::Point2 project(const gtsam::Point3& point,
+                        Eigen::Ref<Eigen::MatrixXd> Dcamera,
+                        Eigen::Ref<Eigen::MatrixXd> Dpoint);
+  gtsam::Point3 backproject(const gtsam::Point2& p, double depth) const;
+  gtsam::Point3 backproject(const gtsam::Point2& p, double depth,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_dpose,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_dp,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_ddepth);
   static gtsam::Point2 Project(const gtsam::Point3& cameraPoint);
   // Standard Interface
   gtsam::Pose3 pose() const;
   double range(const gtsam::Point3& point) const;
+  double range(const gtsam::Point3& point, Eigen::Ref<Eigen::MatrixXd> Dcamera,
+               Eigen::Ref<Eigen::MatrixXd> Dpoint);
   double range(const gtsam::Pose3& pose) const;
+  double range(const gtsam::Pose3& point, Eigen::Ref<Eigen::MatrixXd> Dcamera,
+               Eigen::Ref<Eigen::MatrixXd> Dpose);
   double range(const gtsam::CalibratedCamera& camera) const;
   // enabling serialization functionality
@@ -823,6 +873,7 @@ template <CALIBRATION>
 class PinholeCamera {
   // Standard Constructors and Named Constructors
+  PinholeCamera(const gtsam::PinholeCamera<CALIBRATION> other);
   PinholeCamera(const gtsam::Pose3& pose);
   PinholeCamera(const gtsam::Pose3& pose, const CALIBRATION& K);
   static This Level(const CALIBRATION& K, const gtsam::Pose2& pose,
@@ -849,14 +900,123 @@ class PinholeCamera {
   static gtsam::Point2 Project(const gtsam::Point3& cameraPoint);
   pair<gtsam::Point2, bool> projectSafe(const gtsam::Point3& pw) const;
   gtsam::Point2 project(const gtsam::Point3& point);
+  gtsam::Point2 project(const gtsam::Point3& point,
+                        Eigen::Ref<Eigen::MatrixXd> Dpose,
+                        Eigen::Ref<Eigen::MatrixXd> Dpoint,
+                        Eigen::Ref<Eigen::MatrixXd> Dcal);
+  gtsam::Point3 backproject(const gtsam::Point2& p, double depth) const;
+  gtsam::Point3 backproject(const gtsam::Point2& p, double depth,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_dpose,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_dp,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_ddepth,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_dcal);
+  gtsam::Point2 reprojectionError(const gtsam::Point3& pw, const gtsam::Point2& measured,
+                                  Eigen::Ref<Eigen::MatrixXd> Dpose,
+                                  Eigen::Ref<Eigen::MatrixXd> Dpoint,
+                                  Eigen::Ref<Eigen::MatrixXd> Dcal);
+  double range(const gtsam::Point3& point);
+  double range(const gtsam::Point3& point, Eigen::Ref<Eigen::MatrixXd> Dcamera,
+               Eigen::Ref<Eigen::MatrixXd> Dpoint);
+  double range(const gtsam::Pose3& pose);
+  double range(const gtsam::Pose3& pose, Eigen::Ref<Eigen::MatrixXd> Dcamera,
+               Eigen::Ref<Eigen::MatrixXd> Dpose);
+  // enabling serialization functionality
+  void serialize() const;
+// Forward declaration of PinholeCameraCalX is defined here.
+#include <gtsam/geometry/SimpleCamera.h>
+// Some typedefs for common camera types
+// PinholeCameraCal3_S2 is the same as SimpleCamera above
+typedef gtsam::PinholeCamera<gtsam::Cal3_S2> PinholeCameraCal3_S2;
+typedef gtsam::PinholeCamera<gtsam::Cal3DS2> PinholeCameraCal3DS2;
+typedef gtsam::PinholeCamera<gtsam::Cal3Unified> PinholeCameraCal3Unified;
+typedef gtsam::PinholeCamera<gtsam::Cal3Bundler> PinholeCameraCal3Bundler;
+typedef gtsam::PinholeCamera<gtsam::Cal3Fisheye> PinholeCameraCal3Fisheye;
+#include <gtsam/geometry/PinholePose.h>
+template <CALIBRATION>
+class PinholePose {
+  // Standard Constructors and Named Constructors
+  PinholePose();
+  PinholePose(const gtsam::PinholePose<CALIBRATION> other);
+  PinholePose(const gtsam::Pose3& pose);
+  PinholePose(const gtsam::Pose3& pose, const CALIBRATION* K);
+  static This Level(const gtsam::Pose2& pose, double height);
+  static This Lookat(const gtsam::Point3& eye, const gtsam::Point3& target,
+                     const gtsam::Point3& upVector, const CALIBRATION* K);
+  // Testable
+  void print(string s = "PinholePose") const;
+  bool equals(const This& camera, double tol) const;
+  // Standard Interface
+  gtsam::Pose3 pose() const;
+  CALIBRATION calibration() const;
+  // Manifold
+  This retract(Vector d) const;
+  Vector localCoordinates(const This& T2) const;
+  size_t dim() const;
+  static size_t Dim();
+  // Transformations and measurement functions
+  static gtsam::Point2 Project(const gtsam::Point3& cameraPoint);
+  pair<gtsam::Point2, bool> projectSafe(const gtsam::Point3& pw) const;
+  gtsam::Point2 project(const gtsam::Point3& point);
+  gtsam::Point2 project(const gtsam::Point3& point,
+                        Eigen::Ref<Eigen::MatrixXd> Dpose,
+                        Eigen::Ref<Eigen::MatrixXd> Dpoint,
+                        Eigen::Ref<Eigen::MatrixXd> Dcal);
   gtsam::Point3 backproject(const gtsam::Point2& p, double depth) const;
+  gtsam::Point3 backproject(const gtsam::Point2& p, double depth,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_dpose,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_dp,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_ddepth,
+                            Eigen::Ref<Eigen::MatrixXd> Dresult_dcal);
   double range(const gtsam::Point3& point);
+  double range(const gtsam::Point3& point, Eigen::Ref<Eigen::MatrixXd> Dcamera,
+               Eigen::Ref<Eigen::MatrixXd> Dpoint);
   double range(const gtsam::Pose3& pose);
+  double range(const gtsam::Pose3& pose, Eigen::Ref<Eigen::MatrixXd> Dcamera,
+               Eigen::Ref<Eigen::MatrixXd> Dpose);
   // enabling serialization functionality
   void serialize() const;
+typedef gtsam::PinholePose<gtsam::Cal3_S2> PinholePoseCal3_S2;
+typedef gtsam::PinholePose<gtsam::Cal3DS2> PinholePoseCal3DS2;
+typedef gtsam::PinholePose<gtsam::Cal3Unified> PinholePoseCal3Unified;
+typedef gtsam::PinholePose<gtsam::Cal3Bundler> PinholePoseCal3Bundler;
+typedef gtsam::PinholePose<gtsam::Cal3Fisheye> PinholePoseCal3Fisheye;
+#include <gtsam/geometry/Similarity2.h>
+class Similarity2 {
+  // Standard Constructors
+  Similarity2();
+  Similarity2(double s);
+  Similarity2(const gtsam::Rot2& R, const gtsam::Point2& t, double s);
+  Similarity2(const Matrix& R, const Vector& t, double s);
+  Similarity2(const Matrix& T);
+  gtsam::Point2 transformFrom(const gtsam::Point2& p) const;
+  gtsam::Pose2 transformFrom(const gtsam::Pose2& T);
+  static gtsam::Similarity2 Align(const gtsam::Point2Pairs& abPointPairs);
+  static gtsam::Similarity2 Align(const gtsam::Pose2Pairs& abPosePairs);
+  // Standard Interface
+  bool equals(const gtsam::Similarity2& sim, double tol) const;
+  Matrix matrix() const;
+  gtsam::Rot2& rotation();
+  gtsam::Point2& translation();
+  double scale() const;
 #include <gtsam/geometry/Similarity3.h>
 class Similarity3 {
   // Standard Constructors
@@ -873,22 +1033,13 @@ class Similarity3 {
   static gtsam::Similarity3 Align(const gtsam::Pose3Pairs& abPosePairs);
   // Standard Interface
-  const Matrix matrix() const;
-  const gtsam::Rot3& rotation();
-  const gtsam::Point3& translation();
+  bool equals(const gtsam::Similarity3& sim, double tol) const;
+  Matrix matrix() const;
+  gtsam::Rot3& rotation();
+  gtsam::Point3& translation();
   double scale() const;
-// Forward declaration of PinholeCameraCalX is defined here.
-#include <gtsam/geometry/SimpleCamera.h>
-// Some typedefs for common camera types
-// PinholeCameraCal3_S2 is the same as SimpleCamera above
-typedef gtsam::PinholeCamera<gtsam::Cal3_S2> PinholeCameraCal3_S2;
-typedef gtsam::PinholeCamera<gtsam::Cal3DS2> PinholeCameraCal3DS2;
-typedef gtsam::PinholeCamera<gtsam::Cal3Unified> PinholeCameraCal3Unified;
-typedef gtsam::PinholeCamera<gtsam::Cal3Bundler> PinholeCameraCal3Bundler;
-typedef gtsam::PinholeCamera<gtsam::Cal3Fisheye> PinholeCameraCal3Fisheye;
 template <T>
 class CameraSet {
@@ -920,33 +1071,102 @@ class StereoCamera {
   static size_t Dim();
   // Transformations and measurement functions
-  gtsam::StereoPoint2 project(const gtsam::Point3& point);
+  gtsam::StereoPoint2 project(const gtsam::Point3& point) const;
   gtsam::Point3 backproject(const gtsam::StereoPoint2& p) const;
+  // project with Jacobian
+  gtsam::StereoPoint2 project2(const gtsam::Point3& point,
+                              Eigen::Ref<Eigen::MatrixXd> H1,
+                              Eigen::Ref<Eigen::MatrixXd> H2) const;
+  gtsam::Point3 backproject2(const gtsam::StereoPoint2& p,
+                             Eigen::Ref<Eigen::MatrixXd> H1,
+                             Eigen::Ref<Eigen::MatrixXd> H2) const;
   // enabling serialization functionality
   void serialize() const;
 #include <gtsam/geometry/triangulation.h>
+class TriangulationResult {
+  Status status;
+  TriangulationResult(const gtsam::Point3& p);
+  const gtsam::Point3& get() const;
+  static TriangulationResult Degenerate();
+  static TriangulationResult Outlier();
+  static TriangulationResult FarPoint();
+  static TriangulationResult BehindCamera();
+  bool valid() const;
+  bool degenerate() const;
+  bool outlier() const;
+  bool farPoint() const;
+  bool behindCamera() const;
+class TriangulationParameters {
+  double rankTolerance;
+  bool enableEPI;
+  double landmarkDistanceThreshold;
+  double dynamicOutlierRejectionThreshold;
+  SharedNoiseModel noiseModel;
+  TriangulationParameters(const double rankTolerance = 1.0,
+                          const bool enableEPI = false,
+                          double landmarkDistanceThreshold = -1,
+                          double dynamicOutlierRejectionThreshold = -1,
+                          const gtsam::SharedNoiseModel& noiseModel = nullptr);
 // Templates appear not yet supported for free functions - issue raised at
 // borglab/wrap#14 to add support
+// Cal3_S2 versions
 gtsam::Point3 triangulatePoint3(const gtsam::Pose3Vector& poses,
                                 gtsam::Cal3_S2* sharedCal,
                                 const gtsam::Point2Vector& measurements,
                                 double rank_tol, bool optimize,
                                 const gtsam::SharedNoiseModel& model = nullptr);
+gtsam::Point3 triangulatePoint3(const gtsam::CameraSetCal3_S2& cameras,
+                                const gtsam::Point2Vector& measurements,
+                                double rank_tol, bool optimize,
+                                const gtsam::SharedNoiseModel& model = nullptr);
+gtsam::Point3 triangulateNonlinear(const gtsam::Pose3Vector& poses,
+                                   gtsam::Cal3_S2* sharedCal,
+                                   const gtsam::Point2Vector& measurements,
+                                   const gtsam::Point3& initialEstimate);
+gtsam::Point3 triangulateNonlinear(const gtsam::CameraSetCal3_S2& cameras,
+                                   const gtsam::Point2Vector& measurements,
+                                   const gtsam::Point3& initialEstimate);
+gtsam::TriangulationResult triangulateSafe(
+    const gtsam::CameraSetCal3_S2& cameras,
+    const gtsam::Point2Vector& measurements,
+    const gtsam::TriangulationParameters& params);
+// Cal3DS2 versions
 gtsam::Point3 triangulatePoint3(const gtsam::Pose3Vector& poses,
                                 gtsam::Cal3DS2* sharedCal,
                                 const gtsam::Point2Vector& measurements,
                                 double rank_tol, bool optimize,
                                 const gtsam::SharedNoiseModel& model = nullptr);
-gtsam::Point3 triangulatePoint3(const gtsam::Pose3Vector& poses,
-                                gtsam::Cal3Bundler* sharedCal,
+gtsam::Point3 triangulatePoint3(const gtsam::CameraSetCal3DS2& cameras,
                                 const gtsam::Point2Vector& measurements,
                                 double rank_tol, bool optimize,
                                 const gtsam::SharedNoiseModel& model = nullptr);
-gtsam::Point3 triangulatePoint3(const gtsam::CameraSetCal3_S2& cameras,
+gtsam::Point3 triangulateNonlinear(const gtsam::Pose3Vector& poses,
+                                   gtsam::Cal3DS2* sharedCal,
+                                   const gtsam::Point2Vector& measurements,
+                                   const gtsam::Point3& initialEstimate);
+gtsam::Point3 triangulateNonlinear(const gtsam::CameraSetCal3DS2& cameras,
+                                   const gtsam::Point2Vector& measurements,
+                                   const gtsam::Point3& initialEstimate);
+gtsam::TriangulationResult triangulateSafe(
+    const gtsam::CameraSetCal3DS2& cameras,
+    const gtsam::Point2Vector& measurements,
+    const gtsam::TriangulationParameters& params);
+// Cal3Bundler versions
+gtsam::Point3 triangulatePoint3(const gtsam::Pose3Vector& poses,
+                                gtsam::Cal3Bundler* sharedCal,
                                 const gtsam::Point2Vector& measurements,
                                 double rank_tol, bool optimize,
                                 const gtsam::SharedNoiseModel& model = nullptr);
@@ -954,32 +1174,63 @@ gtsam::Point3 triangulatePoint3(const gtsam::CameraSetCal3Bundler& cameras,
                                 const gtsam::Point2Vector& measurements,
                                 double rank_tol, bool optimize,
                                 const gtsam::SharedNoiseModel& model = nullptr);
-gtsam::Point3 triangulatePoint3(const gtsam::CameraSetCal3Fisheye& cameras,
+gtsam::Point3 triangulateNonlinear(const gtsam::Pose3Vector& poses,
+                                   gtsam::Cal3Bundler* sharedCal,
+                                   const gtsam::Point2Vector& measurements,
+                                   const gtsam::Point3& initialEstimate);
+gtsam::Point3 triangulateNonlinear(const gtsam::CameraSetCal3Bundler& cameras,
+                                   const gtsam::Point2Vector& measurements,
+                                   const gtsam::Point3& initialEstimate);
+gtsam::TriangulationResult triangulateSafe(
+    const gtsam::CameraSetCal3Bundler& cameras,
+    const gtsam::Point2Vector& measurements,
+    const gtsam::TriangulationParameters& params);
+// Cal3Fisheye versions
+gtsam::Point3 triangulatePoint3(const gtsam::Pose3Vector& poses,
+                                gtsam::Cal3Fisheye* sharedCal,
                                 const gtsam::Point2Vector& measurements,
                                 double rank_tol, bool optimize,
                                 const gtsam::SharedNoiseModel& model = nullptr);
-gtsam::Point3 triangulatePoint3(const gtsam::CameraSetCal3Unified& cameras,
+gtsam::Point3 triangulatePoint3(const gtsam::CameraSetCal3Fisheye& cameras,
                                 const gtsam::Point2Vector& measurements,
                                 double rank_tol, bool optimize,
                                 const gtsam::SharedNoiseModel& model = nullptr);
 gtsam::Point3 triangulateNonlinear(const gtsam::Pose3Vector& poses,
-                                   gtsam::Cal3_S2* sharedCal,
+                                   gtsam::Cal3Fisheye* sharedCal,
                                    const gtsam::Point2Vector& measurements,
                                    const gtsam::Point3& initialEstimate);
-gtsam::Point3 triangulateNonlinear(const gtsam::Pose3Vector& poses,
-                                   gtsam::Cal3DS2* sharedCal,
+gtsam::Point3 triangulateNonlinear(const gtsam::CameraSetCal3Fisheye& cameras,
                                    const gtsam::Point2Vector& measurements,
                                    const gtsam::Point3& initialEstimate);
+gtsam::TriangulationResult triangulateSafe(
+    const gtsam::CameraSetCal3Fisheye& cameras,
+    const gtsam::Point2Vector& measurements,
+    const gtsam::TriangulationParameters& params);
+// Cal3Unified versions                                
+gtsam::Point3 triangulatePoint3(const gtsam::Pose3Vector& poses,
+                                gtsam::Cal3Unified* sharedCal,
+                                const gtsam::Point2Vector& measurements,
+                                double rank_tol, bool optimize,
+                                const gtsam::SharedNoiseModel& model = nullptr);
+gtsam::Point3 triangulatePoint3(const gtsam::CameraSetCal3Unified& cameras,
+                                const gtsam::Point2Vector& measurements,
+                                double rank_tol, bool optimize,
+                                const gtsam::SharedNoiseModel& model = nullptr);
 gtsam::Point3 triangulateNonlinear(const gtsam::Pose3Vector& poses,
-                                   gtsam::Cal3Bundler* sharedCal,
-                                   const gtsam::Point2Vector& measurements,
-                                   const gtsam::Point3& initialEstimate);
-gtsam::Point3 triangulateNonlinear(const gtsam::CameraSetCal3_S2& cameras,
+                                   gtsam::Cal3Unified* sharedCal,
                                    const gtsam::Point2Vector& measurements,
                                    const gtsam::Point3& initialEstimate);
-gtsam::Point3 triangulateNonlinear(const gtsam::CameraSetCal3Bundler& cameras,
+gtsam::Point3 triangulateNonlinear(const gtsam::CameraSetCal3Unified& cameras,
                                    const gtsam::Point2Vector& measurements,
                                    const gtsam::Point3& initialEstimate);
+gtsam::TriangulationResult triangulateSafe(
+    const gtsam::CameraSetCal3Unified& cameras,
+    const gtsam::Point2Vector& measurements,
+    const gtsam::TriangulationParameters& params);
 #include <gtsam/geometry/BearingRange.h>
diff --git a/gtsam/geometry/tests/testPose2.cpp b/gtsam/geometry/tests/testPose2.cpp
index 0df858aa8b..de779cc750 100644
--- a/gtsam/geometry/tests/testPose2.cpp
+++ b/gtsam/geometry/tests/testPose2.cpp
@@ -717,81 +717,75 @@ TEST( Pose2, range_pose )
 /* ************************************************************************* */
 TEST(Pose2, align_1) {
-  Pose2 expected(Rot2::fromAngle(0), Point2(10,10));
-  vector<Point2Pair> correspondences;
-  Point2Pair pq1(make_pair(Point2(0,0), Point2(10,10)));
-  Point2Pair pq2(make_pair(Point2(20,10), Point2(30,20)));
-  correspondences += pq1, pq2;
-  boost::optional<Pose2> actual = align(correspondences);
-  EXPECT(assert_equal(expected, *actual));
+  Pose2 expected(Rot2::fromAngle(0), Point2(10, 10));
+  Point2Pairs ab_pairs {{Point2(10, 10), Point2(0, 0)},
+                        {Point2(30, 20), Point2(20, 10)}};
+  boost::optional<Pose2> aTb = Pose2::Align(ab_pairs);
+  EXPECT(assert_equal(expected, *aTb));
 TEST(Pose2, align_2) {
-  Point2 t(20,10);
+  Point2 t(20, 10);
   Rot2 R = Rot2::fromAngle(M_PI/2.0);
   Pose2 expected(R, t);
-  vector<Point2Pair> correspondences;
-  Point2 p1(0,0), p2(10,0);
-  Point2 q1 = expected.transformFrom(p1), q2 = expected.transformFrom(p2);
-  EXPECT(assert_equal(Point2(20,10),q1));
-  EXPECT(assert_equal(Point2(20,20),q2));
-  Point2Pair pq1(make_pair(p1, q1));
-  Point2Pair pq2(make_pair(p2, q2));
-  correspondences += pq1, pq2;
+  Point2 b1(0, 0), b2(10, 0);
+  Point2Pairs ab_pairs {{expected.transformFrom(b1), b1},
+                        {expected.transformFrom(b2), b2}};
-  boost::optional<Pose2> actual = align(correspondences);
-  EXPECT(assert_equal(expected, *actual));
+  boost::optional<Pose2> aTb = Pose2::Align(ab_pairs);
+  EXPECT(assert_equal(expected, *aTb));
 namespace align_3 {
-  Point2 t(10,10);
+  Point2 t(10, 10);
   Pose2 expected(Rot2::fromAngle(2*M_PI/3), t);
-  Point2 p1(0,0), p2(10,0), p3(10,10);
-  Point2 q1 = expected.transformFrom(p1), q2 = expected.transformFrom(p2), q3 = expected.transformFrom(p3);
+  Point2 b1(0, 0), b2(10, 0), b3(10, 10);
+  Point2 a1 = expected.transformFrom(b1),
+         a2 = expected.transformFrom(b2),
+         a3 = expected.transformFrom(b3);
 TEST(Pose2, align_3) {
   using namespace align_3;
-  vector<Point2Pair> correspondences;
-  Point2Pair pq1(make_pair(p1, q1));
-  Point2Pair pq2(make_pair(p2, q2));
-  Point2Pair pq3(make_pair(p3, q3));
-  correspondences += pq1, pq2, pq3;
+  Point2Pairs ab_pairs;
+  Point2Pair ab1(make_pair(a1, b1));
+  Point2Pair ab2(make_pair(a2, b2));
+  Point2Pair ab3(make_pair(a3, b3));
+  ab_pairs += ab1, ab2, ab3;
-  boost::optional<Pose2> actual = align(correspondences);
-  EXPECT(assert_equal(expected, *actual));
+  boost::optional<Pose2> aTb = Pose2::Align(ab_pairs);
+  EXPECT(assert_equal(expected, *aTb));
 namespace {
   /* ************************************************************************* */
   // Prototype code to align two triangles using a rigid transform
   /* ************************************************************************* */
-  struct Triangle { size_t i_,j_,k_;};
+  struct Triangle { size_t i_, j_, k_;};
-  boost::optional<Pose2> align2(const Point2Vector& ps, const Point2Vector& qs,
+  boost::optional<Pose2> align2(const Point2Vector& as, const Point2Vector& bs,
     const pair<Triangle, Triangle>& trianglePair) {
       const Triangle& t1 = trianglePair.first, t2 = trianglePair.second;
-      vector<Point2Pair> correspondences;
-      correspondences += make_pair(ps[t1.i_],qs[t2.i_]), make_pair(ps[t1.j_],qs[t2.j_]), make_pair(ps[t1.k_],qs[t2.k_]);
-      return align(correspondences);
+      Point2Pairs ab_pairs = {{as[t1.i_], bs[t2.i_]},
+                              {as[t1.j_], bs[t2.j_]},
+                              {as[t1.k_], bs[t2.k_]}};
+      return Pose2::Align(ab_pairs);
 TEST(Pose2, align_4) {
   using namespace align_3;
-  Point2Vector ps,qs;
-  ps += p1, p2, p3;
-  qs += q3, q1, q2; // note in 3,1,2 order !
+  Point2Vector as, bs;
+  as += a1, a2, a3;
+  bs += b3, b1, b2;  // note in 3,1,2 order !
   Triangle t1; t1.i_=0; t1.j_=1; t1.k_=2;
   Triangle t2; t2.i_=1; t2.j_=2; t2.k_=0;
-  boost::optional<Pose2> actual = align2(ps, qs, make_pair(t1,t2));
+  boost::optional<Pose2> actual = align2(as, bs, {t1, t2});
   EXPECT(assert_equal(expected, *actual));
diff --git a/gtsam/geometry/tests/testSimilarity2.cpp b/gtsam/geometry/tests/testSimilarity2.cpp
new file mode 100644
index 0000000000..dd4fd0efd1
--- /dev/null
+++ b/gtsam/geometry/tests/testSimilarity2.cpp
@@ -0,0 +1,66 @@
+/* ----------------------------------------------------------------------------
+ * GTSAM Copyright 2010, Georgia Tech Research Corporation,
+ * Atlanta, Georgia 30332-0415
+ * All Rights Reserved
+ * Authors: Frank Dellaert, et al. (see THANKS for the full author list)
+ * See LICENSE for the license information
+ * -------------------------------------------------------------------------- */
+ * @file   testSimilarity2.cpp
+ * @brief  Unit tests for Similarity2 class
+ * @author Varun Agrawal
+ */
+#include <CppUnitLite/TestHarness.h>
+#include <gtsam/base/Testable.h>
+#include <gtsam/base/numericalDerivative.h>
+#include <gtsam/base/testLie.h>
+#include <gtsam/geometry/Similarity2.h>
+#include <functional>
+using namespace std::placeholders;
+using namespace gtsam;
+using namespace std;
+static const Point2 P(0.2, 0.7);
+static const Rot2 R = Rot2::fromAngle(0.3);
+static const double s = 4;
+const double degree = M_PI / 180;
+TEST(Similarity2, Concepts) {
+  BOOST_CONCEPT_ASSERT((IsGroup<Similarity2>));
+  BOOST_CONCEPT_ASSERT((IsManifold<Similarity2>));
+  BOOST_CONCEPT_ASSERT((IsLieGroup<Similarity2>));
+TEST(Similarity2, Constructors) {
+  Similarity2 sim2_Construct1;
+  Similarity2 sim2_Construct2(s);
+  Similarity2 sim2_Construct3(R, P, s);
+  Similarity2 sim2_Construct4(R.matrix(), P, s);
+TEST(Similarity2, Getters) {
+  Similarity2 sim2_default;
+  EXPECT(assert_equal(Rot2(), sim2_default.rotation()));
+  EXPECT(assert_equal(Point2(0, 0), sim2_default.translation()));
+  EXPECT_DOUBLES_EQUAL(1.0, sim2_default.scale(), 1e-9);
+int main() {
+  TestResult tr;
+  return TestRegistry::runAllTests(tr);
diff --git a/gtsam/geometry/tests/testSimilarity3.cpp b/gtsam/geometry/tests/testSimilarity3.cpp
index 428422072f..7a134f6efd 100644
--- a/gtsam/geometry/tests/testSimilarity3.cpp
+++ b/gtsam/geometry/tests/testSimilarity3.cpp
@@ -458,18 +458,18 @@ TEST(Similarity3, Optimization2) {
   Values result;
   result = LevenbergMarquardtOptimizer(graph, initial).optimize();
   //result.print("Optimized Estimate\n");
-  Pose3 p1, p2, p3, p4, p5;
-  p1 = Pose3(result.at<Similarity3>(X(1)));
-  p2 = Pose3(result.at<Similarity3>(X(2)));
-  p3 = Pose3(result.at<Similarity3>(X(3)));
-  p4 = Pose3(result.at<Similarity3>(X(4)));
-  p5 = Pose3(result.at<Similarity3>(X(5)));
-  //p1.print("Pose1");
-  //p2.print("Pose2");
-  //p3.print("Pose3");
-  //p4.print("Pose4");
-  //p5.print("Pose5");
+  Similarity3 p1, p2, p3, p4, p5;
+  p1 = result.at<Similarity3>(X(1));
+  p2 = result.at<Similarity3>(X(2));
+  p3 = result.at<Similarity3>(X(3));
+  p4 = result.at<Similarity3>(X(4));
+  p5 = result.at<Similarity3>(X(5));
+  //p1.print("Similarity1");
+  //p2.print("Similarity2");
+  //p3.print("Similarity3");
+  //p4.print("Similarity4");
+  //p5.print("Similarity5");
   Similarity3 expected(0.7);
   EXPECT(assert_equal(expected, result.at<Similarity3>(X(5)), 0.4));
diff --git a/gtsam/geometry/triangulation.h b/gtsam/geometry/triangulation.h
index 49b5aef04b..401fd2d0bb 100644
--- a/gtsam/geometry/triangulation.h
+++ b/gtsam/geometry/triangulation.h
@@ -23,6 +23,7 @@
 #include <gtsam/geometry/Cal3Fisheye.h>
 #include <gtsam/geometry/Cal3Unified.h>
 #include <gtsam/geometry/Cal3_S2.h>
+#include <gtsam/geometry/Cal3DS2.h>
 #include <gtsam/geometry/CameraSet.h>
 #include <gtsam/geometry/PinholeCamera.h>
 #include <gtsam/geometry/SphericalCamera.h>
@@ -510,18 +511,18 @@ struct GTSAM_EXPORT TriangulationParameters {
- * TriangulationResult is an optional point, along with the reasons why it is invalid.
+ * TriangulationResult is an optional point, along with the reasons why it is
+ * invalid.
-class TriangulationResult: public boost::optional<Point3> {
-  enum Status {
-  };
-  Status status_;
-  TriangulationResult(Status s) :
-      status_(s) {
-  }
+class TriangulationResult : public boost::optional<Point3> {
+ public:
+  Status status;
+ private:
+  TriangulationResult(Status s) : status(s) {}
+ public:
    * Default constructor, only for serialization
@@ -530,54 +531,38 @@ class TriangulationResult: public boost::optional<Point3> {
    * Constructor
-  TriangulationResult(const Point3& p) :
-      status_(VALID) {
-    reset(p);
-  }
+  TriangulationResult(const Point3& p) : status(VALID) { reset(p); }
   static TriangulationResult Degenerate() {
     return TriangulationResult(DEGENERATE);
-  static TriangulationResult Outlier() {
-    return TriangulationResult(OUTLIER);
-  }
+  static TriangulationResult Outlier() { return TriangulationResult(OUTLIER); }
   static TriangulationResult FarPoint() {
     return TriangulationResult(FAR_POINT);
   static TriangulationResult BehindCamera() {
     return TriangulationResult(BEHIND_CAMERA);
-  bool valid() const {
-    return status_ == VALID;
-  }
-  bool degenerate() const {
-    return status_ == DEGENERATE;
-  }
-  bool outlier() const {
-    return status_ == OUTLIER;
-  }
-  bool farPoint() const {
-    return status_ == FAR_POINT;
-  }
-  bool behindCamera() const {
-    return status_ == BEHIND_CAMERA;
-  }
+  bool valid() const { return status == VALID; }
+  bool degenerate() const { return status == DEGENERATE; }
+  bool outlier() const { return status == OUTLIER; }
+  bool farPoint() const { return status == FAR_POINT; }
+  bool behindCamera() const { return status == BEHIND_CAMERA; }
   // stream to output
-  friend std::ostream &operator<<(std::ostream &os,
-      const TriangulationResult& result) {
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const TriangulationResult& result) {
     if (result)
       os << "point = " << *result << std::endl;
-      os << "no point, status = " << result.status_ << std::endl;
+      os << "no point, status = " << result.status << std::endl;
     return os;
+ private:
   /// Serialization function
   friend class boost::serialization::access;
-  template<class ARCHIVE>
-  void serialize(ARCHIVE & ar, const unsigned int version) {
-    ar & BOOST_SERIALIZATION_NVP(status_);
+  template <class ARCHIVE>
+  void serialize(ARCHIVE& ar, const unsigned int version) {
@@ -644,6 +629,7 @@ TriangulationResult triangulateSafe(const CameraSet<CAMERA>& cameras,
 // Vector of Cameras - used by the Python/MATLAB wrapper
 using CameraSetCal3Bundler = CameraSet<PinholeCamera<Cal3Bundler>>;
 using CameraSetCal3_S2 = CameraSet<PinholeCamera<Cal3_S2>>;
+using CameraSetCal3DS2 = CameraSet<PinholeCamera<Cal3DS2>>;
 using CameraSetCal3Fisheye = CameraSet<PinholeCamera<Cal3Fisheye>>;
 using CameraSetCal3Unified = CameraSet<PinholeCamera<Cal3Unified>>;
 using CameraSetSpherical = CameraSet<SphericalCamera>;
diff --git a/gtsam/inference/BayesNet-inst.h b/gtsam/inference/BayesNet-inst.h
index afde5498dc..e792b5c032 100644
--- a/gtsam/inference/BayesNet-inst.h
+++ b/gtsam/inference/BayesNet-inst.h
@@ -53,8 +53,9 @@ void BayesNet<CONDITIONAL>::dot(std::ostream& os,
     auto frontals = conditional->frontals();
     const Key me = frontals.front();
     auto parents = conditional->parents();
-    for (const Key& p : parents)
-      os << "  var" << keyFormatter(p) << "->var" << keyFormatter(me) << "\n";
+    for (const Key& p : parents) {
+      os << "  var" << p << "->var" << me << "\n";
+    }
   os << "}";
diff --git a/gtsam/inference/ClusterTree-inst.h b/gtsam/inference/ClusterTree-inst.h
index b042c0c8e3..9bc1419558 100644
--- a/gtsam/inference/ClusterTree-inst.h
+++ b/gtsam/inference/ClusterTree-inst.h
@@ -15,6 +15,10 @@
 #include <gtsam/base/timing.h>
 #include <gtsam/base/treeTraversal-inst.h>
+#include <mutex>
 namespace gtsam {
 /* ************************************************************************* */
@@ -120,12 +124,25 @@ struct EliminationData {
   size_t myIndexInParent;
   FastVector<sharedFactor> childFactors;
   boost::shared_ptr<BTNode> bayesTreeNode;
+  boost::shared_ptr<std::mutex> writeLock;
   EliminationData(EliminationData* _parentData, size_t nChildren) :
-      parentData(_parentData), bayesTreeNode(boost::make_shared<BTNode>()) {
+      parentData(_parentData), bayesTreeNode(boost::make_shared<BTNode>())
+      , writeLock(boost::make_shared<std::mutex>())
+    {
     if (parentData) {
+      parentData->writeLock->lock();
       myIndexInParent = parentData->childFactors.size();
+      parentData->writeLock->unlock();
     } else {
       myIndexInParent = 0;
@@ -196,8 +213,15 @@ struct EliminationData {
         nodesIndex_.insert(std::make_pair(j, myData.bayesTreeNode));
       // Store remaining factor in parent's gathered factors
-      if (!eliminationResult.second->empty())
+      if (!eliminationResult.second->empty()) {
+        myData.parentData->writeLock->lock();
         myData.parentData->childFactors[myData.myIndexInParent] = eliminationResult.second;
+        myData.parentData->writeLock->unlock();
+      }
diff --git a/gtsam/inference/DotWriter.cpp b/gtsam/inference/DotWriter.cpp
index ad53305757..eac0c90f93 100644
--- a/gtsam/inference/DotWriter.cpp
+++ b/gtsam/inference/DotWriter.cpp
@@ -43,7 +43,7 @@ void DotWriter::drawVariable(Key key, const KeyFormatter& keyFormatter,
                              const boost::optional<Vector2>& position,
                              ostream* os) const {
   // Label the node with the label from the KeyFormatter
-  *os << "  var" << keyFormatter(key) << "[label=\"" << keyFormatter(key)
+  *os << "  var" << key << "[label=\"" << keyFormatter(key)
       << "\"";
   if (position) {
     *os << ", pos=\"" << position->x() << "," << position->y() << "!\"";
@@ -65,13 +65,13 @@ void DotWriter::DrawFactor(size_t i, const boost::optional<Vector2>& position,
 static void ConnectVariables(Key key1, Key key2,
                              const KeyFormatter& keyFormatter, ostream* os) {
-  *os << "  var" << keyFormatter(key1) << "--"
-      << "var" << keyFormatter(key2) << ";\n";
+  *os << "  var" << key1 << "--"
+      << "var" << key2 << ";\n";
 static void ConnectVariableFactor(Key key, const KeyFormatter& keyFormatter,
                                   size_t i, ostream* os) {
-  *os << "  var" << keyFormatter(key) << "--"
+  *os << "  var" << key << "--"
       << "factor" << i << ";\n";
diff --git a/gtsam/linear/GaussianConditional.cpp b/gtsam/linear/GaussianConditional.cpp
index c44ab246b1..6199f91a75 100644
--- a/gtsam/linear/GaussianConditional.cpp
+++ b/gtsam/linear/GaussianConditional.cpp
@@ -91,13 +91,18 @@ namespace gtsam {
   void GaussianConditional::print(const string &s, const KeyFormatter& formatter) const {
     cout << s << " p(";
     for (const_iterator it = beginFrontals(); it != endFrontals(); ++it) {
-      cout << (boost::format("%1%")%(formatter(*it))).str() << " ";
+      cout << (boost::format("%1%") % (formatter(*it))).str()
+           << (nrFrontals() > 1 ? " " : "");
-    cout << "|";
-    for (const_iterator it = beginParents(); it != endParents(); ++it) {
-      cout << " " << (boost::format("%1%")%(formatter(*it))).str();
+    if (nrParents()) {
+      cout << " |";
+      for (const_iterator it = beginParents(); it != endParents(); ++it) {
+        cout << " " << (boost::format("%1%") % (formatter(*it))).str();
+      }
     cout << ")" << endl;
     cout << formatMatrixIndented("  R = ", R()) << endl;
     for (const_iterator it = beginParents() ; it != endParents() ; ++it) {
       cout << formatMatrixIndented((boost::format("  S[%1%] = ")%(formatter(*it))).str(), getA(it))
diff --git a/gtsam/linear/GaussianConditional.h b/gtsam/linear/GaussianConditional.h
index 6dd278536a..b2b616dab7 100644
--- a/gtsam/linear/GaussianConditional.h
+++ b/gtsam/linear/GaussianConditional.h
@@ -109,8 +109,9 @@ namespace gtsam {
     /// @{
     /** print */
-    void print(const std::string& = "GaussianConditional",
-      const KeyFormatter& formatter = DefaultKeyFormatter) const override;
+    void print(
+        const std::string& = "GaussianConditional",
+        const KeyFormatter& formatter = DefaultKeyFormatter) const override;
     /** equals function */
     bool equals(const GaussianFactor&cg, double tol = 1e-9) const override;
diff --git a/gtsam/linear/LossFunctions.h b/gtsam/linear/LossFunctions.h
index c3d7d64dbd..d9cfc1f3c3 100644
--- a/gtsam/linear/LossFunctions.h
+++ b/gtsam/linear/LossFunctions.h
@@ -54,23 +54,31 @@ namespace noiseModel {
 // clang-format on
 namespace mEstimator {
+ * Pure virtual class for all robust error function classes.
+ *
+ * It provides the machinery for block vs scalar reweighting strategies, in
+ * addition to defining the interface of derived classes.
+ */
 class GTSAM_EXPORT Base {
+  /** the rows can be weighted independently according to the error
+   * or uniformly with the norm of the right hand side */
   enum ReweightScheme { Scalar, Block };
   typedef boost::shared_ptr<Base> shared_ptr;
-  /** the rows can be weighted independently according to the error
-   * or uniformly with the norm of the right hand side */
+  /// Strategy for reweighting \sa ReweightScheme
   ReweightScheme reweight_;
   Base(const ReweightScheme reweight = Block) : reweight_(reweight) {}
   virtual ~Base() {}
-  /*
+  /// Returns the reweight scheme, as explained in ReweightScheme
+  ReweightScheme reweightScheme() const { return reweight_; }
+  /**
    * This method is responsible for returning the total penalty for a given
    * amount of error. For example, this method is responsible for implementing
    * the quadratic function for an L2 penalty, the absolute value function for
@@ -80,16 +88,20 @@ class GTSAM_EXPORT Base {
    * error vector, then it prevents implementations of asymmeric loss
    * functions. It would be better for this function to accept the vector and
    * internally call the norm if necessary.
+   *
+   * This returns \rho(x) in \ref mEstimator
-  virtual double loss(double distance) const { return 0; };
+  virtual double loss(double distance) const { return 0; }
-  /*
+  /**
    * This method is responsible for returning the weight function for a given
    * amount of error. The weight function is related to the analytic derivative
    * of the loss function. See
    *  https://members.loria.fr/MOBerger/Enseignement/Master2/Documents/ZhangIVC-97-01.pdf
    * for details. This method is required when optimizing cost functions with
    * robust penalties using iteratively re-weighted least squares.
+   *
+   * This returns w(x) in \ref mEstimator
   virtual double weight(double distance) const = 0;
@@ -124,7 +136,15 @@ class GTSAM_EXPORT Base {
-/// Null class should behave as Gaussian
+/** "Null" robust loss function, equivalent to a Gaussian pdf noise model, or
+ *  plain least-squares (non-robust).
+ *
+ *  This model has no additional parameters.
+ *
+ * - Loss       \rho(x)          = 0.5 x²
+ * - Derivative \phi(x)          = x
+ * - Weight     w(x) = \phi(x)/x = 1
+ */
 class GTSAM_EXPORT Null : public Base {
   typedef boost::shared_ptr<Null> shared_ptr;
@@ -146,7 +166,14 @@ class GTSAM_EXPORT Null : public Base {
-/// Fair implements the "Fair" robust error model (Zhang97ivc)
+/** Implementation of the "Fair" robust error model (Zhang97ivc)
+ *
+ *  This model has a scalar parameter "c".
+ *
+ * - Loss       \rho(x) = c² (|x|/c - log(1+|x|/c))
+ * - Derivative \phi(x) = x/(1+|x|/c)
+ * - Weight     w(x) = \phi(x)/x = 1/(1+|x|/c)
+ */
 class GTSAM_EXPORT Fair : public Base {
   double c_;
@@ -160,6 +187,7 @@ class GTSAM_EXPORT Fair : public Base {
   void print(const std::string &s) const override;
   bool equals(const Base &expected, double tol = 1e-8) const override;
   static shared_ptr Create(double c, const ReweightScheme reweight = Block);
+  double modelParameter() const { return c_; }
   /** Serialization function */
@@ -171,7 +199,14 @@ class GTSAM_EXPORT Fair : public Base {
-/// Huber implements the "Huber" robust error model (Zhang97ivc)
+/** The "Huber" robust error model (Zhang97ivc).
+ *
+ *  This model has a scalar parameter "k".
+ *
+ * - Loss       \rho(x)          = 0.5 x²  if |x|<k, 0.5 k² + k|x-k|  otherwise
+ * - Derivative \phi(x)          = x       if |x|<k, k sgn(x)         otherwise
+ * - Weight     w(x) = \phi(x)/x = 1       if |x|<k, k/|x|            otherwise
+ */
 class GTSAM_EXPORT Huber : public Base {
   double k_;
@@ -185,6 +220,7 @@ class GTSAM_EXPORT Huber : public Base {
   void print(const std::string &s) const override;
   bool equals(const Base &expected, double tol = 1e-8) const override;
   static shared_ptr Create(double k, const ReweightScheme reweight = Block);
+  double modelParameter() const { return k_; }
   /** Serialization function */
@@ -196,12 +232,19 @@ class GTSAM_EXPORT Huber : public Base {
-/// Cauchy implements the "Cauchy" robust error model (Lee2013IROS). Contributed
-/// by:
-///   Dipl.-Inform. Jan Oberlaender (M.Sc.), FZI Research Center for
-///   Information Technology, Karlsruhe, Germany.
-///   oberlaender@fzi.de
-/// Thanks Jan!
+/** Implementation of the "Cauchy" robust error model (Lee2013IROS).
+ * Contributed by:
+ *  Dipl.-Inform. Jan Oberlaender (M.Sc.), FZI Research Center for
+ *  Information Technology, Karlsruhe, Germany.
+ *  oberlaender@fzi.de
+ *  Thanks Jan!
+ *
+ *  This model has a scalar parameter "k".
+ *
+ * - Loss       \rho(x) = 0.5 k² log(1+x²/k²)
+ * - Derivative \phi(x) = (k²x)/(x²+k²)
+ * - Weight     w(x) = \phi(x)/x = k²/(x²+k²)
+ */
 class GTSAM_EXPORT Cauchy : public Base {
   double k_, ksquared_;
@@ -215,6 +258,7 @@ class GTSAM_EXPORT Cauchy : public Base {
   void print(const std::string &s) const override;
   bool equals(const Base &expected, double tol = 1e-8) const override;
   static shared_ptr Create(double k, const ReweightScheme reweight = Block);
+  double modelParameter() const { return k_; }
   /** Serialization function */
@@ -223,10 +267,18 @@ class GTSAM_EXPORT Cauchy : public Base {
   void serialize(ARCHIVE &ar, const unsigned int /*version*/) {
+    ar &BOOST_SERIALIZATION_NVP(ksquared_);
-/// Tukey implements the "Tukey" robust error model (Zhang97ivc)
+/** Implementation of the "Tukey" robust error model (Zhang97ivc).
+ *
+ *  This model has a scalar parameter "c".
+ *
+ * - Loss       \rho(x) = c² (1 - (1-x²/c²)³)/6  if |x|<c,  c²/6   otherwise
+ * - Derivative \phi(x) = x(1-x²/c²)² if |x|<c,  0   otherwise
+ * - Weight     w(x) = \phi(x)/x = (1-x²/c²)² if |x|<c,  0   otherwise
+ */
 class GTSAM_EXPORT Tukey : public Base {
   double c_, csquared_;
@@ -240,6 +292,7 @@ class GTSAM_EXPORT Tukey : public Base {
   void print(const std::string &s) const override;
   bool equals(const Base &expected, double tol = 1e-8) const override;
   static shared_ptr Create(double k, const ReweightScheme reweight = Block);
+  double modelParameter() const { return c_; }
   /** Serialization function */
@@ -251,7 +304,14 @@ class GTSAM_EXPORT Tukey : public Base {
-/// Welsch implements the "Welsch" robust error model (Zhang97ivc)
+/** Implementation of the "Welsch" robust error model (Zhang97ivc).
+ *
+ *  This model has a scalar parameter "c".
+ *
+ * - Loss       \rho(x) = -0.5 c² (exp(-x²/c²) - 1)
+ * - Derivative \phi(x) = x exp(-x²/c²)
+ * - Weight     w(x) = \phi(x)/x = exp(-x²/c²)
+ */
 class GTSAM_EXPORT Welsch : public Base {
   double c_, csquared_;
@@ -265,6 +325,7 @@ class GTSAM_EXPORT Welsch : public Base {
   void print(const std::string &s) const override;
   bool equals(const Base &expected, double tol = 1e-8) const override;
   static shared_ptr Create(double k, const ReweightScheme reweight = Block);
+  double modelParameter() const { return c_; }
   /** Serialization function */
@@ -273,15 +334,20 @@ class GTSAM_EXPORT Welsch : public Base {
   void serialize(ARCHIVE &ar, const unsigned int /*version*/) {
+    ar &BOOST_SERIALIZATION_NVP(csquared_);
-/// GemanMcClure implements the "Geman-McClure" robust error model
-/// (Zhang97ivc).
-/// Note that Geman-McClure weight function uses the parameter c == 1.0,
-/// but here it's allowed to use different values, so we actually have
-/// the generalized Geman-McClure from (Agarwal15phd).
+/** Implementation of the "Geman-McClure" robust error model (Zhang97ivc).
+ *
+ * Note that Geman-McClure weight function uses the parameter c == 1.0,
+ * but here it's allowed to use different values, so we actually have
+ * the generalized Geman-McClure from (Agarwal15phd).
+ *
+ * - Loss       \rho(x) = 0.5 (c²x²)/(c²+x²)
+ * - Derivative \phi(x) = xc⁴/(c²+x²)²
+ * - Weight     w(x) = \phi(x)/x = c⁴/(c²+x²)²
+ */
 class GTSAM_EXPORT GemanMcClure : public Base {
   typedef boost::shared_ptr<GemanMcClure> shared_ptr;
@@ -293,6 +359,7 @@ class GTSAM_EXPORT GemanMcClure : public Base {
   void print(const std::string &s) const override;
   bool equals(const Base &expected, double tol = 1e-8) const override;
   static shared_ptr Create(double k, const ReweightScheme reweight = Block);
+  double modelParameter() const { return c_; }
   double c_;
@@ -307,11 +374,18 @@ class GTSAM_EXPORT GemanMcClure : public Base {
-/// DCS implements the Dynamic Covariance Scaling robust error model
-/// from the paper Robust Map Optimization (Agarwal13icra).
-/// Under the special condition of the parameter c == 1.0 and not
-/// forcing the output weight s <= 1.0, DCS is similar to Geman-McClure.
+/** DCS implements the Dynamic Covariance Scaling robust error model
+ *  from the paper Robust Map Optimization (Agarwal13icra).
+ *
+ *  Under the special condition of the parameter c == 1.0 and not
+ *  forcing the output weight s <= 1.0, DCS is similar to Geman-McClure.
+ *
+ *  This model has a scalar parameter "c" (with "units" of squared error).
+ *
+ * - Loss       \rho(x) = (c²x² + cx⁴)/(x²+c)²   (for any "x")
+ * - Derivative \phi(x) = 2c²x/(x²+c)²
+ * - Weight     w(x) = \phi(x)/x = 2c²/(x²+c)²  if x²>c,   1  otherwise
+ */
 class GTSAM_EXPORT DCS : public Base {
   typedef boost::shared_ptr<DCS> shared_ptr;
@@ -323,6 +397,7 @@ class GTSAM_EXPORT DCS : public Base {
   void print(const std::string &s) const override;
   bool equals(const Base &expected, double tol = 1e-8) const override;
   static shared_ptr Create(double k, const ReweightScheme reweight = Block);
+  double modelParameter() const { return c_; }
   double c_;
@@ -337,12 +412,19 @@ class GTSAM_EXPORT DCS : public Base {
-/// L2WithDeadZone implements a standard L2 penalty, but with a dead zone of
-/// width 2*k, centered at the origin. The resulting penalty within the dead
-/// zone is always zero, and grows quadratically outside the dead zone. In this
-/// sense, the L2WithDeadZone penalty is "robust to inliers", rather than being
-/// robust to outliers. This penalty can be used to create barrier functions in
-/// a general way.
+/** L2WithDeadZone implements a standard L2 penalty, but with a dead zone of
+ *  width 2*k, centered at the origin. The resulting penalty within the dead
+ *  zone is always zero, and grows quadratically outside the dead zone. In this
+ *  sense, the L2WithDeadZone penalty is "robust to inliers", rather than being
+ *  robust to outliers. This penalty can be used to create barrier functions in
+ *  a general way.
+ *
+ *  This model has a scalar parameter "k".
+ *
+ * - Loss       \rho(x) = 0 if |x|<k,    0.5(k-|x|)² otherwise
+ * - Derivative \phi(x) = 0 if |x|<k, (-k+x) if x>k,  (k+x) if x<-k
+ * - Weight     w(x) = \phi(x)/x = 0 if |x|<k, (-k+x)/x if x>k,  (k+x)/x if x<-k
+ */
 class GTSAM_EXPORT L2WithDeadZone : public Base {
   double k_;
@@ -356,6 +438,7 @@ class GTSAM_EXPORT L2WithDeadZone : public Base {
   void print(const std::string &s) const override;
   bool equals(const Base &expected, double tol = 1e-8) const override;
   static shared_ptr Create(double k, const ReweightScheme reweight = Block);
+  double modelParameter() const { return k_; }
   /** Serialization function */
diff --git a/gtsam/linear/SubgraphBuilder.cpp b/gtsam/linear/SubgraphBuilder.cpp
index 18e19cd20d..de7ae7060b 100644
--- a/gtsam/linear/SubgraphBuilder.cpp
+++ b/gtsam/linear/SubgraphBuilder.cpp
@@ -337,7 +337,6 @@ vector<size_t> SubgraphBuilder::kruskal(const GaussianFactorGraph &gfg,
   DSFVector dsf(n);
   size_t count = 0;
-  double sum = 0.0;
   for (const size_t index : sortedIndices) {
     const GaussianFactor &gf = *gfg[index];
     const auto keys = gf.keys();
@@ -347,7 +346,6 @@ vector<size_t> SubgraphBuilder::kruskal(const GaussianFactorGraph &gfg,
     if (dsf.find(u) != dsf.find(v)) {
       dsf.merge(u, v);
-      sum += weights[index];
       if (++count == n - 1) break;
diff --git a/gtsam/linear/linear.i b/gtsam/linear/linear.i
index f1bc92f69c..943b661d88 100644
--- a/gtsam/linear/linear.i
+++ b/gtsam/linear/linear.i
@@ -671,6 +671,10 @@ virtual class DummyPreconditionerParameters : gtsam::PreconditionerParameters {
+virtual class BlockJacobiPreconditionerParameters : gtsam::PreconditionerParameters {
+  BlockJacobiPreconditionerParameters();
 #include <gtsam/linear/PCGSolver.h>
 virtual class PCGSolverParameters : gtsam::ConjugateGradientParameters {
diff --git a/gtsam/linear/tests/testGaussianConditional.cpp b/gtsam/linear/tests/testGaussianConditional.cpp
index 4a95152079..6ec06a0ceb 100644
--- a/gtsam/linear/tests/testGaussianConditional.cpp
+++ b/gtsam/linear/tests/testGaussianConditional.cpp
@@ -404,13 +404,23 @@ TEST(GaussianConditional, Print) {
   const Vector2 b(20, 40);
   const double sigma = 3;
-  std::string s = "GaussianConditional";
+  GaussianConditional conditional(X(0), b, Matrix2::Identity(),
+                                  noiseModel::Isotropic::Sigma(2, sigma));
-  auto conditional =
+  // Test printing for no parents.
+  std::string expected =
+    "GaussianConditional p(x0)\n"
+    "  R = [ 1 0 ]\n"
+    "      [ 0 1 ]\n"
+    "  d = [ 20 40 ]\n"
+    "isotropic dim=2 sigma=3\n";
+  EXPECT(assert_print_equal(expected, conditional, "GaussianConditional"));
+  auto conditional1 =
       GaussianConditional::FromMeanAndStddev(X(0), A1, X(1), b, sigma);
   // Test printing for single parent.
-  std::string expected =
+  std::string expected1 =
     "GaussianConditional p(x0 | x1)\n"
     "  R = [ 1 0 ]\n"
     "      [ 0 1 ]\n"
@@ -418,7 +428,7 @@ TEST(GaussianConditional, Print) {
     "          [ -3 -4 ]\n"
     "  d = [ 20 40 ]\n"
     "isotropic dim=2 sigma=3\n";
-  EXPECT(assert_print_equal(expected, conditional, s));
+  EXPECT(assert_print_equal(expected1, conditional1, "GaussianConditional"));
   // Test printing for multiple parents.
   auto conditional2 = GaussianConditional::FromMeanAndStddev(X(0), A1, Y(0), A2,
@@ -433,7 +443,7 @@ TEST(GaussianConditional, Print) {
     "          [ -7 -8 ]\n"
     "  d = [ 20 40 ]\n"
     "isotropic dim=2 sigma=3\n";
-  EXPECT(assert_print_equal(expected2, conditional2, s));
+  EXPECT(assert_print_equal(expected2, conditional2, "GaussianConditional"));
 /* ************************************************************************* */
diff --git a/gtsam/navigation/AHRSFactor.h b/gtsam/navigation/AHRSFactor.h
index 10c33d101d..c7d82975a5 100644
--- a/gtsam/navigation/AHRSFactor.h
+++ b/gtsam/navigation/AHRSFactor.h
@@ -90,7 +90,11 @@ class GTSAM_EXPORT PreintegratedAhrsMeasurements : public PreintegratedRotation
    * Add a single Gyroscope measurement to the preintegration.
-   * @param measureOmedga Measured angular velocity (in body frame)
+   * Measurements are taken to be in the sensor
+   * frame and conversion to the body frame is handled by `body_P_sensor` in
+   * `PreintegratedRotationParams` (if provided).
+   *
+   * @param measuredOmega Measured angular velocity (as given by the sensor)
    * @param deltaT Time step
   void integrateMeasurement(const Vector3& measuredOmega, double deltaT);
diff --git a/gtsam/navigation/CombinedImuFactor.h b/gtsam/navigation/CombinedImuFactor.h
index 068a17ca48..213f5f223f 100644
--- a/gtsam/navigation/CombinedImuFactor.h
+++ b/gtsam/navigation/CombinedImuFactor.h
@@ -208,8 +208,11 @@ class GTSAM_EXPORT PreintegratedCombinedMeasurements : public PreintegrationType
    * Add a single IMU measurement to the preintegration.
-   * @param measuredAcc Measured acceleration (in body frame, as given by the
-   * sensor)
+   * Both accelerometer and gyroscope measurements are taken to be in the sensor
+   * frame and conversion to the body frame is handled by `body_P_sensor` in
+   * `PreintegrationParams`.
+   *
+   * @param measuredAcc Measured acceleration (as given by the sensor)
    * @param measuredOmega Measured angular velocity (as given by the sensor)
    * @param dt Time interval between two consecutive IMU measurements
diff --git a/gtsam/navigation/ImuFactor.h b/gtsam/navigation/ImuFactor.h
index 35207e452b..7408271627 100644
--- a/gtsam/navigation/ImuFactor.h
+++ b/gtsam/navigation/ImuFactor.h
@@ -121,7 +121,11 @@ class GTSAM_EXPORT PreintegratedImuMeasurements: public PreintegrationType {
    * Add a single IMU measurement to the preintegration.
-   * @param measuredAcc Measured acceleration (in body frame, as given by the sensor)
+   * Both accelerometer and gyroscope measurements are taken to be in the sensor
+   * frame and conversion to the body frame is handled by `body_P_sensor` in
+   * `PreintegrationParams`.
+   *
+   * @param measuredAcc Measured acceleration (as given by the sensor)
    * @param measuredOmega Measured angular velocity (as given by the sensor)
    * @param dt Time interval between this and the last IMU measurement
diff --git a/gtsam/navigation/tests/testGPSFactor.cpp b/gtsam/navigation/tests/testGPSFactor.cpp
index c94e1d3d54..5607add16e 100644
--- a/gtsam/navigation/tests/testGPSFactor.cpp
+++ b/gtsam/navigation/tests/testGPSFactor.cpp
@@ -27,7 +27,6 @@
 #include <GeographicLib/Config.h>
 #include <GeographicLib/LocalCartesian.hpp>
-using namespace std::placeholders;
 using namespace std;
 using namespace gtsam;
 using namespace GeographicLib;
@@ -71,7 +70,7 @@ TEST( GPSFactor, Constructor ) {
   // Calculate numerical derivatives
-  Matrix expectedH = numericalDerivative11<Vector,Pose3>(
+  Matrix expectedH = numericalDerivative11<Vector, Pose3>(
       std::bind(&GPSFactor::evaluateError, &factor, std::placeholders::_1, boost::none), T);
   // Use the factor to calculate the derivative
@@ -100,7 +99,7 @@ TEST( GPSFactor2, Constructor ) {
   // Calculate numerical derivatives
-  Matrix expectedH = numericalDerivative11<Vector,NavState>(
+  Matrix expectedH = numericalDerivative11<Vector, NavState>(
       std::bind(&GPSFactor2::evaluateError, &factor, std::placeholders::_1, boost::none), T);
   // Use the factor to calculate the derivative
diff --git a/gtsam/navigation/tests/testMagFactor.cpp b/gtsam/navigation/tests/testMagFactor.cpp
index e2a623710a..971803dbf6 100644
--- a/gtsam/navigation/tests/testMagFactor.cpp
+++ b/gtsam/navigation/tests/testMagFactor.cpp
@@ -26,7 +26,6 @@
 #include <GeographicLib/LocalCartesian.hpp>
-using namespace std::placeholders;
 using namespace std;
 using namespace gtsam;
 using namespace GeographicLib;
@@ -64,7 +63,7 @@ TEST( MagFactor, unrotate ) {
   Matrix H;
   Point3 expected(22735.5, 314.502, 44202.5);
   EXPECT( assert_equal(expected, MagFactor::unrotate(theta,nM,H),1e-1));
-  EXPECT( assert_equal(numericalDerivative11<Point3,Rot2> //
+  EXPECT(assert_equal(numericalDerivative11<Point3, Rot2> //
       (std::bind(&MagFactor::unrotate, std::placeholders::_1, nM, none), theta), H, 1e-6));
@@ -75,27 +74,27 @@ TEST( MagFactor, Factors ) {
   // MagFactor
   MagFactor f(1, measured, s, dir, bias, model);
-  EXPECT( assert_equal(Z_3x1,f.evaluateError(theta,H1),1e-5));
-  EXPECT( assert_equal((Matrix)numericalDerivative11<Vector,Rot2> //
+  EXPECT(assert_equal(Z_3x1,f.evaluateError(theta,H1),1e-5));
+  EXPECT(assert_equal((Matrix)numericalDerivative11<Vector, Rot2> //
       (std::bind(&MagFactor::evaluateError, &f, std::placeholders::_1, none), theta), H1, 1e-7));
-// MagFactor1
+  // MagFactor1
   MagFactor1 f1(1, measured, s, dir, bias, model);
-  EXPECT( assert_equal(Z_3x1,f1.evaluateError(nRb,H1),1e-5));
-  EXPECT( assert_equal(numericalDerivative11<Vector,Rot3> //
+  EXPECT(assert_equal(Z_3x1,f1.evaluateError(nRb,H1),1e-5));
+  EXPECT(assert_equal(numericalDerivative11<Vector, Rot3> //
       (std::bind(&MagFactor1::evaluateError, &f1, std::placeholders::_1, none), nRb), H1, 1e-7));
-// MagFactor2
+  // MagFactor2
   MagFactor2 f2(1, 2, measured, nRb, model);
-  EXPECT( assert_equal(Z_3x1,f2.evaluateError(scaled,bias,H1,H2),1e-5));
-  EXPECT( assert_equal(numericalDerivative11<Vector,Point3> //
+  EXPECT(assert_equal(Z_3x1,f2.evaluateError(scaled,bias,H1,H2),1e-5));
+  EXPECT(assert_equal(numericalDerivative11<Vector, Point3> //
       (std::bind(&MagFactor2::evaluateError, &f2, std::placeholders::_1, bias, none, none), scaled),//
       H1, 1e-7));
-  EXPECT( assert_equal(numericalDerivative11<Vector,Point3> //
+  EXPECT(assert_equal(numericalDerivative11<Vector, Point3> //
       (std::bind(&MagFactor2::evaluateError, &f2, scaled, std::placeholders::_1, none, none), bias),//
       H2, 1e-7));
-// MagFactor2
+  // MagFactor3
   MagFactor3 f3(1, 2, 3, measured, nRb, model);
   EXPECT(assert_equal((Matrix)numericalDerivative11<Vector,double> //
diff --git a/gtsam/nonlinear/Marginals.h b/gtsam/nonlinear/Marginals.h
index 028545d019..3c5aa9cabc 100644
--- a/gtsam/nonlinear/Marginals.h
+++ b/gtsam/nonlinear/Marginals.h
@@ -121,7 +121,7 @@ class GTSAM_EXPORT Marginals {
   /** Optimize the bayes tree */
   VectorValues optimize() const;
   /** Compute the Bayes Tree as a helper function to the constructor */
diff --git a/gtsam/nonlinear/PriorFactor.h b/gtsam/nonlinear/PriorFactor.h
index c745f7bd91..a490162ac3 100644
--- a/gtsam/nonlinear/PriorFactor.h
+++ b/gtsam/nonlinear/PriorFactor.h
@@ -94,7 +94,6 @@ namespace gtsam {
     Vector evaluateError(const T& x, boost::optional<Matrix&> H = boost::none) const override {
       if (H) (*H) = Matrix::Identity(traits<T>::GetDimension(x),traits<T>::GetDimension(x));
       // manifold equivalent of z-x -> Local(x,z)
-      // TODO(ASL) Add Jacobians.
       return -traits<T>::Local(x, prior_);
diff --git a/gtsam/nonlinear/Values-inl.h b/gtsam/nonlinear/Values-inl.h
index dfcb7e174c..0370c5ceea 100644
--- a/gtsam/nonlinear/Values-inl.h
+++ b/gtsam/nonlinear/Values-inl.h
@@ -279,10 +279,11 @@ namespace gtsam {
    template <typename ValueType>
    struct handle {
      ValueType operator()(Key j, const Value* const pointer) {
-       try {
+       auto ptr = dynamic_cast<const GenericValue<ValueType>*>(pointer);
+       if (ptr) {
          // value returns a const ValueType&, and the return makes a copy !!!!!
-         return dynamic_cast<const GenericValue<ValueType>&>(*pointer).value();
-       } catch (std::bad_cast&) {
+         return ptr->value();
+       } else {
          throw ValuesIncorrectType(j, typeid(*pointer), typeid(ValueType));
@@ -294,11 +295,12 @@ namespace gtsam {
    // Handle dynamic matrices
    template <int M, int N>
    struct handle_matrix<Eigen::Matrix<double, M, N>, true> {
-     Eigen::Matrix<double, M, N> operator()(Key j, const Value* const pointer) {
-       try {
+     inline Eigen::Matrix<double, M, N> operator()(Key j, const Value* const pointer) {
+       auto ptr = dynamic_cast<const GenericValue<Eigen::Matrix<double, M, N>>*>(pointer);
+       if (ptr) {
          // value returns a const Matrix&, and the return makes a copy !!!!!
-         return dynamic_cast<const GenericValue<Eigen::Matrix<double, M, N>>&>(*pointer).value();
-       } catch (std::bad_cast&) {
+         return ptr->value();
+       } else {
          // If a fixed matrix was stored, we end up here as well.
          throw ValuesIncorrectType(j, typeid(*pointer), typeid(Eigen::Matrix<double, M, N>));
@@ -308,16 +310,18 @@ namespace gtsam {
    // Handle fixed matrices
    template <int M, int N>
    struct handle_matrix<Eigen::Matrix<double, M, N>, false> {
-     Eigen::Matrix<double, M, N> operator()(Key j, const Value* const pointer) {
-       try {
+     inline Eigen::Matrix<double, M, N> operator()(Key j, const Value* const pointer) {
+       auto ptr = dynamic_cast<const GenericValue<Eigen::Matrix<double, M, N>>*>(pointer);
+       if (ptr) {
          // value returns a const MatrixMN&, and the return makes a copy !!!!!
-         return dynamic_cast<const GenericValue<Eigen::Matrix<double, M, N>>&>(*pointer).value();
-       } catch (std::bad_cast&) {
+         return ptr->value();
+       } else {
          Matrix A;
-         try {
-           // Check if a dynamic matrix was stored
-           A = handle_matrix<Eigen::MatrixXd, true>()(j, pointer);  // will throw if not....
-         } catch (const ValuesIncorrectType&) {
+         // Check if a dynamic matrix was stored
+         auto ptr = dynamic_cast<const GenericValue<Eigen::MatrixXd>*>(pointer);
+         if (ptr) {
+           A = ptr->value();
+         } else {
            // Or a dynamic vector
            A = handle_matrix<Eigen::VectorXd, true>()(j, pointer);  // will throw if not....
@@ -364,10 +368,10 @@ namespace gtsam {
     if(item != values_.end()) {
       // dynamic cast the type and throw exception if incorrect
-      const Value& value = *item->second;
-      try {
-        return dynamic_cast<const GenericValue<ValueType>&>(value).value();
-      } catch (std::bad_cast &) {
+      auto ptr = dynamic_cast<const GenericValue<ValueType>*>(item->second);
+      if (ptr) {
+        return ptr->value();
+      } else {
         // NOTE(abe): clang warns about potential side effects if done in typeid
         const Value* value = item->second;
         throw ValuesIncorrectType(j, typeid(*value), typeid(ValueType));
diff --git a/gtsam/nonlinear/nonlinear.i b/gtsam/nonlinear/nonlinear.i
index 326b84d16c..033e5ced25 100644
--- a/gtsam/nonlinear/nonlinear.i
+++ b/gtsam/nonlinear/nonlinear.i
@@ -226,6 +226,10 @@ class Values {
   void insert(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Bundler>& camera);
   void insert(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Fisheye>& camera);
   void insert(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Unified>& camera);
+  void insert(size_t j, const gtsam::PinholePose<gtsam::Cal3_S2>& camera);
+  void insert(size_t j, const gtsam::PinholePose<gtsam::Cal3Bundler>& camera);
+  void insert(size_t j, const gtsam::PinholePose<gtsam::Cal3Fisheye>& camera);
+  void insert(size_t j, const gtsam::PinholePose<gtsam::Cal3Unified>& camera);
   void insert(size_t j, const gtsam::imuBias::ConstantBias& constant_bias);
   void insert(size_t j, const gtsam::NavState& nav_state);
   void insert(size_t j, double c);
@@ -245,6 +249,10 @@ class Values {
   void insert(size_t j, const gtsam::ParameterMatrix<14>& X);
   void insert(size_t j, const gtsam::ParameterMatrix<15>& X);
+  template <T = {gtsam::Point2,
+                 gtsam::Point3}>
+  void insert(size_t j, const T& val);
   void update(size_t j, const gtsam::Point2& point2);
   void update(size_t j, const gtsam::Point3& point3);
   void update(size_t j, const gtsam::Rot2& rot2);
@@ -265,6 +273,10 @@ class Values {
   void update(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Bundler>& camera);
   void update(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Fisheye>& camera);
   void update(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Unified>& camera);
+  void update(size_t j, const gtsam::PinholePose<gtsam::Cal3_S2>& camera);
+  void update(size_t j, const gtsam::PinholePose<gtsam::Cal3Bundler>& camera);
+  void update(size_t j, const gtsam::PinholePose<gtsam::Cal3Fisheye>& camera);
+  void update(size_t j, const gtsam::PinholePose<gtsam::Cal3Unified>& camera);
   void update(size_t j, const gtsam::imuBias::ConstantBias& constant_bias);
   void update(size_t j, const gtsam::NavState& nav_state);
   void update(size_t j, Vector vector);
@@ -306,6 +318,10 @@ class Values {
   void insert_or_assign(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Bundler>& camera);
   void insert_or_assign(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Fisheye>& camera);
   void insert_or_assign(size_t j, const gtsam::PinholeCamera<gtsam::Cal3Unified>& camera);
+  void insert_or_assign(size_t j, const gtsam::PinholePose<gtsam::Cal3_S2>& camera);
+  void insert_or_assign(size_t j, const gtsam::PinholePose<gtsam::Cal3Bundler>& camera);
+  void insert_or_assign(size_t j, const gtsam::PinholePose<gtsam::Cal3Fisheye>& camera);
+  void insert_or_assign(size_t j, const gtsam::PinholePose<gtsam::Cal3Unified>& camera);
   void insert_or_assign(size_t j, const gtsam::imuBias::ConstantBias& constant_bias);
   void insert_or_assign(size_t j, const gtsam::NavState& nav_state);
   void insert_or_assign(size_t j, Vector vector);
@@ -347,6 +363,10 @@ class Values {
+                 gtsam::PinholePose<gtsam::Cal3_S2>,
+                 gtsam::PinholePose<gtsam::Cal3Bundler>,
+                 gtsam::PinholePose<gtsam::Cal3Fisheye>,
+                 gtsam::PinholePose<gtsam::Cal3Unified>,
@@ -464,6 +484,9 @@ virtual class NonlinearOptimizerParams {
   bool isSequential() const;
   bool isCholmod() const;
   bool isIterative() const;
+  // This only applies to python since matlab does not have lambda machinery.
+  gtsam::NonlinearOptimizerParams::IterationHook iterationHook;
 bool checkConvergence(double relativeErrorTreshold,
diff --git a/gtsam/sfm/SfmData.h b/gtsam/sfm/SfmData.h
index afce122051..430e107ad8 100644
--- a/gtsam/sfm/SfmData.h
+++ b/gtsam/sfm/SfmData.h
@@ -77,10 +77,14 @@ struct GTSAM_EXPORT SfmData {
   size_t numberCameras() const { return cameras.size(); }
   /// The track formed by series of landmark measurements
-  SfmTrack track(size_t idx) const { return tracks[idx]; }
+  const SfmTrack& track(size_t idx) const { return tracks[idx]; }
   /// The camera pose at frame index `idx`
-  SfmCamera camera(size_t idx) const { return cameras[idx]; }
+  const SfmCamera& camera(size_t idx) const { return cameras[idx]; }
+  /// Getters
+  const std::vector<SfmCamera>& cameraList() const { return cameras; }
+  const std::vector<SfmTrack>& trackList() const { return tracks; }
    * @brief Create projection factors using keys i and P(j)
diff --git a/gtsam/sfm/TranslationRecovery.cpp b/gtsam/sfm/TranslationRecovery.cpp
index 2e81c2d561..810fe7de98 100644
--- a/gtsam/sfm/TranslationRecovery.cpp
+++ b/gtsam/sfm/TranslationRecovery.cpp
@@ -21,13 +21,16 @@
 #include <gtsam/geometry/Pose3.h>
 #include <gtsam/geometry/Unit3.h>
 #include <gtsam/linear/NoiseModel.h>
+#include <gtsam/nonlinear/ExpressionFactor.h>
 #include <gtsam/nonlinear/LevenbergMarquardtOptimizer.h>
 #include <gtsam/nonlinear/NonlinearFactor.h>
 #include <gtsam/nonlinear/NonlinearFactorGraph.h>
 #include <gtsam/nonlinear/Values.h>
 #include <gtsam/sfm/TranslationFactor.h>
 #include <gtsam/sfm/TranslationRecovery.h>
+#include <gtsam/slam/BetweenFactor.h>
 #include <gtsam/slam/PriorFactor.h>
+#include <gtsam/slam/expressions.h>
 #include <set>
 #include <utility>
@@ -38,16 +41,13 @@ using namespace std;
 // In Wrappers we have no access to this so have a default ready.
 static std::mt19937 kRandomNumberGenerator(42);
-    const TranslationRecovery::TranslationEdges &relativeTranslations,
-    const LevenbergMarquardtParams &lmParams)
-    : params_(lmParams) {
-  // Some relative translations may be zero. We treat nodes that have a zero
-  // relativeTranslation as a single node.
-  // A DSFMap is used to find sets of nodes that have a zero relative
-  // translation. Add the nodes in each edge to the DSFMap, and merge nodes that
-  // are connected by a zero relative translation.
+// Some relative translations may be zero. We treat nodes that have a zero
+// relativeTranslation as a single node.
+// A DSFMap is used to find sets of nodes that have a zero relative
+// translation. Add the nodes in each edge to the DSFMap, and merge nodes that
+// are connected by a zero relative translation.
+DSFMap<Key> getSameTranslationDSFMap(
+    const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations) {
   DSFMap<Key> sameTranslationDSF;
   for (const auto &edge : relativeTranslations) {
     Key key1 = sameTranslationDSF.find(edge.key1());
@@ -56,94 +56,152 @@ TranslationRecovery::TranslationRecovery(
       sameTranslationDSF.merge(key1, key2);
-  // Use only those edges for which two keys have a distinct root in the DSFMap.
-  for (const auto &edge : relativeTranslations) {
-    Key key1 = sameTranslationDSF.find(edge.key1());
-    Key key2 = sameTranslationDSF.find(edge.key2());
+  return sameTranslationDSF;
+// Removes zero-translation edges from measurements, and combines the nodes in
+// these edges into a single node.
+template <typename T>
+std::vector<BinaryMeasurement<T>> removeSameTranslationNodes(
+    const std::vector<BinaryMeasurement<T>> &edges,
+    const DSFMap<Key> &sameTranslationDSFMap) {
+  std::vector<BinaryMeasurement<T>> newEdges;
+  for (const auto &edge : edges) {
+    Key key1 = sameTranslationDSFMap.find(edge.key1());
+    Key key2 = sameTranslationDSFMap.find(edge.key2());
     if (key1 == key2) continue;
-    relativeTranslations_.emplace_back(key1, key2, edge.measured(),
-                                       edge.noiseModel());
+    newEdges.emplace_back(key1, key2, edge.measured(), edge.noiseModel());
+  }
+  return newEdges;
+// Adds nodes that were not optimized for because they were connected
+// to another node with a zero-translation edge in the input.
+Values addSameTranslationNodes(const Values &result,
+                               const DSFMap<Key> &sameTranslationDSFMap) {
+  Values final_result = result;
+  // Nodes that were not optimized are stored in sameTranslationNodes_ as a map
+  // from a key that was optimized to keys that were not optimized. Iterate over
+  // map and add results for keys not optimized.
+  for (const auto &optimizedAndDuplicateKeys : sameTranslationDSFMap.sets()) {
+    Key optimizedKey = optimizedAndDuplicateKeys.first;
+    std::set<Key> duplicateKeys = optimizedAndDuplicateKeys.second;
+    // Add the result for the duplicate key if it does not already exist.
+    for (const Key duplicateKey : duplicateKeys) {
+      if (final_result.exists(duplicateKey)) continue;
+      final_result.insert<Point3>(duplicateKey,
+                                  final_result.at<Point3>(optimizedKey));
+    }
-  // Store the DSF map for post-processing results.
-  sameTranslationNodes_ = sameTranslationDSF.sets();
+  return final_result;
-NonlinearFactorGraph TranslationRecovery::buildGraph() const {
+NonlinearFactorGraph TranslationRecovery::buildGraph(
+    const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations) const {
   NonlinearFactorGraph graph;
-  // Add all relative translation edges
-  for (auto edge : relativeTranslations_) {
+  // Add translation factors for input translation directions.
+  for (auto edge : relativeTranslations) {
     graph.emplace_shared<TranslationFactor>(edge.key1(), edge.key2(),
                                             edge.measured(), edge.noiseModel());
   return graph;
 void TranslationRecovery::addPrior(
-    const double scale, NonlinearFactorGraph *graph,
+    const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations,
+    const double scale,
+    const std::vector<BinaryMeasurement<Point3>> &betweenTranslations,
+    NonlinearFactorGraph *graph,
     const SharedNoiseModel &priorNoiseModel) const {
-  auto edge = relativeTranslations_.begin();
-  if (edge == relativeTranslations_.end()) return;
-  graph->emplace_shared<PriorFactor<Point3> >(edge->key1(), Point3(0, 0, 0),
-                                              priorNoiseModel);
-  graph->emplace_shared<PriorFactor<Point3> >(
-      edge->key2(), scale * edge->measured().point3(), edge->noiseModel());
+  auto edge = relativeTranslations.begin();
+  if (edge == relativeTranslations.end()) return;
+  graph->emplace_shared<PriorFactor<Point3>>(edge->key1(), Point3(0, 0, 0),
+                                             priorNoiseModel);
+  // Add between factors for optional relative translations.
+  for (auto edge : betweenTranslations) {
+    graph->emplace_shared<BetweenFactor<Point3>>(
+        edge.key1(), edge.key2(), edge.measured(), edge.noiseModel());
+  }
+  // Add a scale prior only if no other between factors were added.
+  if (betweenTranslations.empty()) {
+    graph->emplace_shared<PriorFactor<Point3>>(
+        edge->key2(), scale * edge->measured().point3(), edge->noiseModel());
+  }
-Values TranslationRecovery::initializeRandomly(std::mt19937 *rng) const {
+Values TranslationRecovery::initializeRandomly(
+    const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations,
+    std::mt19937 *rng, const Values &initialValues) const {
   uniform_real_distribution<double> randomVal(-1, 1);
   // Create a lambda expression that checks whether value exists and randomly
   // initializes if not.
   Values initial;
   auto insert = [&](Key j) {
-    if (!initial.exists(j)) {
+    if (initial.exists(j)) return;
+    if (initialValues.exists(j)) {
+      initial.insert<Point3>(j, initialValues.at<Point3>(j));
+    } else {
           j, Point3(randomVal(*rng), randomVal(*rng), randomVal(*rng)));
+    // Assumes all nodes connected by zero-edges have the same initialization.
   // Loop over measurements and add a random translation
-  for (auto edge : relativeTranslations_) {
+  for (auto edge : relativeTranslations) {
+  return initial;
+Values TranslationRecovery::initializeRandomly(
+    const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations,
+    const Values &initialValues) const {
+  return initializeRandomly(relativeTranslations, &kRandomNumberGenerator,
+                            initialValues);
+Values TranslationRecovery::run(
+    const TranslationEdges &relativeTranslations, const double scale,
+    const std::vector<BinaryMeasurement<Point3>> &betweenTranslations,
+    const Values &initialValues) const {
+  // Find edges that have a zero-translation, and recompute relativeTranslations
+  // and betweenTranslations by retaining only one node for every zero-edge.
+  DSFMap<Key> sameTranslationDSFMap =
+      getSameTranslationDSFMap(relativeTranslations);
+  const TranslationEdges nonzeroRelativeTranslations =
+      removeSameTranslationNodes(relativeTranslations, sameTranslationDSFMap);
+  const std::vector<BinaryMeasurement<Point3>> nonzeroBetweenTranslations =
+      removeSameTranslationNodes(betweenTranslations, sameTranslationDSFMap);
+  // Create graph of translation factors.
+  NonlinearFactorGraph graph = buildGraph(nonzeroRelativeTranslations);
+  // Add global frame prior and scale (either from betweenTranslations or
+  // scale).
+  addPrior(nonzeroRelativeTranslations, scale, nonzeroBetweenTranslations,
+           &graph);
+  // Uses initial values from params if provided.
+  Values initial =
+      initializeRandomly(nonzeroRelativeTranslations, initialValues);
   // If there are no valid edges, but zero-distance edges exist, initialize one
   // of the nodes in a connected component of zero-distance edges.
-  if (initial.empty() && !sameTranslationNodes_.empty()) {
-    for (const auto &optimizedAndDuplicateKeys : sameTranslationNodes_) {
+  if (initial.empty() && !sameTranslationDSFMap.sets().empty()) {
+    for (const auto &optimizedAndDuplicateKeys : sameTranslationDSFMap.sets()) {
       Key optimizedKey = optimizedAndDuplicateKeys.first;
       initial.insert<Point3>(optimizedKey, Point3(0, 0, 0));
-  return initial;
-Values TranslationRecovery::initializeRandomly() const {
-  return initializeRandomly(&kRandomNumberGenerator);
-Values TranslationRecovery::run(const double scale) const {
-  auto graph = buildGraph();
-  addPrior(scale, &graph);
-  const Values initial = initializeRandomly();
-  LevenbergMarquardtOptimizer lm(graph, initial, params_);
+  LevenbergMarquardtOptimizer lm(graph, initial, lmParams_);
   Values result = lm.optimize();
-  // Nodes that were not optimized are stored in sameTranslationNodes_ as a map
-  // from a key that was optimized to keys that were not optimized. Iterate over
-  // map and add results for keys not optimized.
-  for (const auto &optimizedAndDuplicateKeys : sameTranslationNodes_) {
-    Key optimizedKey = optimizedAndDuplicateKeys.first;
-    std::set<Key> duplicateKeys = optimizedAndDuplicateKeys.second;
-    // Add the result for the duplicate key if it does not already exist.
-    for (const Key duplicateKey : duplicateKeys) {
-      if (result.exists(duplicateKey)) continue;
-      result.insert<Point3>(duplicateKey, result.at<Point3>(optimizedKey));
-    }
-  }
-  return result;
+  return addSameTranslationNodes(result, sameTranslationDSFMap);
 TranslationRecovery::TranslationEdges TranslationRecovery::SimulateMeasurements(
diff --git a/gtsam/sfm/TranslationRecovery.h b/gtsam/sfm/TranslationRecovery.h
index 30c9a14e39..7863f51339 100644
--- a/gtsam/sfm/TranslationRecovery.h
+++ b/gtsam/sfm/TranslationRecovery.h
@@ -11,7 +11,7 @@
  * @file TranslationRecovery.h
- * @author Frank Dellaert
+ * @author Frank Dellaert, Akshay Krishnan
  * @date March 2020
  * @brief Recovering translations in an epipolar graph when rotations are given.
@@ -57,68 +57,99 @@ class TranslationRecovery {
   // Translation directions between camera pairs.
   TranslationEdges relativeTranslations_;
-  // Parameters used by the LM Optimizer.
-  LevenbergMarquardtParams params_;
-  // Map from a key in the graph to a set of keys that share the same
-  // translation.
-  std::map<Key, std::set<Key>> sameTranslationNodes_;
+  // Parameters.
+  LevenbergMarquardtParams lmParams_;
    * @brief Construct a new Translation Recovery object
-   * @param relativeTranslations the relative translations, in world coordinate
-   * frames, vector of BinaryMeasurements of Unit3, where each key of a
-   * measurement is a point in 3D.
-   * @param lmParams (optional) gtsam::LavenbergMarquardtParams that can be
-   * used to modify the parameters for the LM optimizer. By default, uses the
-   * default LM parameters.
+   * @param lmParams parameters for optimization.
+   */
+  TranslationRecovery(const LevenbergMarquardtParams &lmParams)
+      : lmParams_(lmParams) {}
+  /**
+   * @brief Default constructor.
-  TranslationRecovery(
-      const TranslationEdges &relativeTranslations,
-      const LevenbergMarquardtParams &lmParams = LevenbergMarquardtParams());
+  TranslationRecovery() = default;
    * @brief Build the factor graph to do the optimization.
+   * @param relativeTranslations unit translation directions between
+   * translations to be estimated
    * @return NonlinearFactorGraph
-  NonlinearFactorGraph buildGraph() const;
+  NonlinearFactorGraph buildGraph(
+      const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations) const;
-   * @brief Add priors on ednpoints of first measurement edge.
+   * @brief Add 3 factors to the graph:
+   *    - A prior on the first point to lie at (0, 0, 0)
+   *    - If betweenTranslations is non-empty, between factors provided by it.
+   *    - If betweenTranslations is empty, a prior on scale of the first
+   * relativeTranslations edge.
+   * @param relativeTranslations unit translation directions between
+   * translations to be estimated
    * @param scale scale for first relative translation which fixes gauge.
    * @param graph factor graph to which prior is added.
+   * @param betweenTranslations relative translations (with scale) between 2
+   * points in world coordinate frame known a priori.
    * @param priorNoiseModel the noise model to use with the prior.
-  void addPrior(const double scale, NonlinearFactorGraph *graph,
-                const SharedNoiseModel &priorNoiseModel =
-                    noiseModel::Isotropic::Sigma(3, 0.01)) const;
+  void addPrior(
+      const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations,
+      const double scale,
+      const std::vector<BinaryMeasurement<Point3>> &betweenTranslations,
+      NonlinearFactorGraph *graph,
+      const SharedNoiseModel &priorNoiseModel =
+          noiseModel::Isotropic::Sigma(3, 0.01)) const;
    * @brief Create random initial translations.
+   * @param relativeTranslations unit translation directions between
+   * translations to be estimated
    * @param rng random number generator
+   * @param intialValues (optional) initial values from a prior
    * @return Values
-  Values initializeRandomly(std::mt19937 *rng) const;
+  Values initializeRandomly(
+      const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations,
+      std::mt19937 *rng, const Values &initialValues = Values()) const;
    * @brief Version of initializeRandomly with a fixed seed.
+   * @param relativeTranslations unit translation directions between
+   * translations to be estimated
+   * @param initialValues (optional) initial values from a prior
    * @return Values
-  Values initializeRandomly() const;
+  Values initializeRandomly(
+      const std::vector<BinaryMeasurement<Unit3>> &relativeTranslations,
+      const Values &initialValues = Values()) const;
    * @brief Build and optimize factor graph.
+   * @param relativeTranslations the relative translations, in world coordinate
+   * frames, vector of BinaryMeasurements of Unit3, where each key of a
+   * measurement is a point in 3D.
    * @param scale scale for first relative translation which fixes gauge.
+   * The scale is only used if betweenTranslations is empty.
+   * @param betweenTranslations relative translations (with scale) between 2
+   * points in world coordinate frame known a priori.
+   * @param initialValues intial values for optimization. Initializes randomly
+   * if not provided.
    * @return Values
-  Values run(const double scale = 1.0) const;
+  Values run(
+      const TranslationEdges &relativeTranslations, const double scale = 1.0,
+      const std::vector<BinaryMeasurement<Point3>> &betweenTranslations = {},
+      const Values &initialValues = Values()) const;
    * @brief Simulate translation direction measurements
diff --git a/gtsam/sfm/sfm.i b/gtsam/sfm/sfm.i
index bf9a73ac53..83bd07b13f 100644
--- a/gtsam/sfm/sfm.i
+++ b/gtsam/sfm/sfm.i
@@ -4,11 +4,15 @@
 namespace gtsam {
+#include <gtsam/nonlinear/NonlinearFactorGraph.h>
+#include <gtsam/nonlinear/Values.h>
 #include <gtsam/sfm/SfmTrack.h>
 class SfmTrack {
   SfmTrack(const gtsam::Point3& pt);
   const Point3& point3() const;
+  Point3 p;
   double r;
   double g;
@@ -34,12 +38,15 @@ class SfmData {
   static gtsam::SfmData FromBundlerFile(string filename);
   static gtsam::SfmData FromBalFile(string filename);
+  std::vector<gtsam::SfmTrack>& trackList() const;
+  std::vector<gtsam::PinholeCamera<gtsam::Cal3Bundler>>& cameraList() const;
   void addTrack(const gtsam::SfmTrack& t);
   void addCamera(const gtsam::SfmCamera& cam);
   size_t numberTracks() const;
   size_t numberCameras() const;
-  gtsam::SfmTrack track(size_t idx) const;
-  gtsam::PinholeCamera<gtsam::Cal3Bundler> camera(size_t idx) const;
+  gtsam::SfmTrack& track(size_t idx) const;
+  gtsam::PinholeCamera<gtsam::Cal3Bundler>& camera(size_t idx) const;
   gtsam::NonlinearFactorGraph generalSfmFactors(
       const gtsam::SharedNoiseModel& model =
@@ -83,6 +90,7 @@ class BinaryMeasurement {
 typedef gtsam::BinaryMeasurement<gtsam::Unit3> BinaryMeasurementUnit3;
 typedef gtsam::BinaryMeasurement<gtsam::Rot3> BinaryMeasurementRot3;
+typedef gtsam::BinaryMeasurement<gtsam::Point3> BinaryMeasurementPoint3;
 class BinaryMeasurementsUnit3 {
@@ -91,6 +99,20 @@ class BinaryMeasurementsUnit3 {
   void push_back(const gtsam::BinaryMeasurement<gtsam::Unit3>& measurement);
+class BinaryMeasurementsPoint3 {
+  BinaryMeasurementsPoint3();
+  size_t size() const;
+  gtsam::BinaryMeasurement<gtsam::Point3> at(size_t idx) const;
+  void push_back(const gtsam::BinaryMeasurement<gtsam::Point3>& measurement);
+class BinaryMeasurementsRot3 {
+  BinaryMeasurementsRot3();
+  size_t size() const;
+  gtsam::BinaryMeasurement<gtsam::Rot3> at(size_t idx) const;
+  void push_back(const gtsam::BinaryMeasurement<gtsam::Rot3>& measurement);
 #include <gtsam/sfm/ShonanAveraging.h>
 // TODO(frank): copy/pasta below until we have integer template paremeters in
@@ -142,8 +164,8 @@ class ShonanAveraging2 {
   ShonanAveraging2(string g2oFile);
   ShonanAveraging2(string g2oFile,
                    const gtsam::ShonanAveragingParameters2& parameters);
-  ShonanAveraging2(const gtsam::BetweenFactorPose2s &factors,
-                   const gtsam::ShonanAveragingParameters2 &parameters);
+  ShonanAveraging2(const gtsam::BetweenFactorPose2s& factors,
+                   const gtsam::ShonanAveragingParameters2& parameters);
   // Query properties
   size_t nrUnknowns() const;
@@ -184,6 +206,10 @@ class ShonanAveraging2 {
 class ShonanAveraging3 {
+  ShonanAveraging3(
+      const std::vector<gtsam::BinaryMeasurement<gtsam::Rot3>>& measurements,
+      const gtsam::ShonanAveragingParameters3& parameters =
+          gtsam::ShonanAveragingParameters3());
   ShonanAveraging3(string g2oFile);
   ShonanAveraging3(string g2oFile,
                    const gtsam::ShonanAveragingParameters3& parameters);
@@ -252,15 +278,36 @@ class MFAS {
 #include <gtsam/sfm/TranslationRecovery.h>
 class TranslationRecovery {
-  TranslationRecovery(
+  TranslationRecovery(const gtsam::LevenbergMarquardtParams& lmParams);
+  TranslationRecovery();  // default params.
+  void addPrior(const gtsam::BinaryMeasurementsUnit3& relativeTranslations,
+                const double scale,
+                const gtsam::BinaryMeasurementsPoint3& betweenTranslations,
+                gtsam::NonlinearFactorGraph @graph,
+                const gtsam::SharedNoiseModel& priorNoiseModel) const;
+  void addPrior(const gtsam::BinaryMeasurementsUnit3& relativeTranslations,
+                const double scale,
+                const gtsam::BinaryMeasurementsPoint3& betweenTranslations,
+                gtsam::NonlinearFactorGraph @graph) const;
+  gtsam::NonlinearFactorGraph buildGraph(
+      const gtsam::BinaryMeasurementsUnit3& relativeTranslations) const;
+  gtsam::Values run(const gtsam::BinaryMeasurementsUnit3& relativeTranslations,
+                    const double scale,
+                    const gtsam::BinaryMeasurementsPoint3& betweenTranslations,
+                    const gtsam::Values& initialValues) const;
+  // default random initial values
+  gtsam::Values run(
       const gtsam::BinaryMeasurementsUnit3& relativeTranslations,
-      const gtsam::LevenbergMarquardtParams& lmParams);
-  TranslationRecovery(
-      const gtsam::BinaryMeasurementsUnit3&
-          relativeTranslations);  // default LevenbergMarquardtParams
-  gtsam::Values run(const double scale) const;
-  gtsam::Values run() const;  // default scale = 1.0
+      const double scale,
+      const gtsam::BinaryMeasurementsPoint3& betweenTranslations) const;
+  // default empty betweenTranslations
+  gtsam::Values run(const gtsam::BinaryMeasurementsUnit3& relativeTranslations,
+                    const double scale) const;
+  // default scale = 1.0, empty betweenTranslations
+  gtsam::Values run(
+      const gtsam::BinaryMeasurementsUnit3& relativeTranslations) const;
 }  // namespace gtsam
diff --git a/gtsam/slam/KarcherMeanFactor-inl.h b/gtsam/slam/KarcherMeanFactor-inl.h
index c81a9adc5d..00f7417056 100644
--- a/gtsam/slam/KarcherMeanFactor-inl.h
+++ b/gtsam/slam/KarcherMeanFactor-inl.h
@@ -40,8 +40,7 @@ T FindKarcherMeanImpl(const vector<T, ALLOC>& rotations) {
   return result.at<T>(kKey);
-template <class T,
-        typename = typename std::enable_if< std::is_same<gtsam::Rot3, T>::value >::type >
+template <class T>
 T FindKarcherMean(const std::vector<T>& rotations) {
   return FindKarcherMeanImpl(rotations);
diff --git a/gtsam/slam/dataset.h b/gtsam/slam/dataset.h
index dc450a9f77..95e750674d 100644
--- a/gtsam/slam/dataset.h
+++ b/gtsam/slam/dataset.h
@@ -223,6 +223,8 @@ parse3DFactors(const std::string &filename,
                size_t maxIndex = 0);
 using BinaryMeasurementsUnit3 = std::vector<BinaryMeasurement<Unit3>>;
+using BinaryMeasurementsPoint3 = std::vector<BinaryMeasurement<Point3>>;
+using BinaryMeasurementsRot3 = std::vector<BinaryMeasurement<Rot3>>;
 inline boost::optional<IndexedPose> GTSAM_DEPRECATED
diff --git a/gtsam/slam/slam.i b/gtsam/slam/slam.i
index 4e943253ea..8e1e06d5b4 100644
--- a/gtsam/slam/slam.i
+++ b/gtsam/slam/slam.i
@@ -90,6 +90,22 @@ typedef gtsam::GeneralSFMFactor<gtsam::PinholeCamera<gtsam::Cal3Unified>,
+typedef gtsam::GeneralSFMFactor<gtsam::PinholePose<gtsam::Cal3_S2>,
+                                gtsam::Point3>
+    GeneralSFMFactorPoseCal3_S2;
+typedef gtsam::GeneralSFMFactor<gtsam::PinholePose<gtsam::Cal3DS2>,
+                                gtsam::Point3>
+    GeneralSFMFactorPoseCal3DS2;
+typedef gtsam::GeneralSFMFactor<gtsam::PinholePose<gtsam::Cal3Bundler>,
+                                gtsam::Point3>
+    GeneralSFMFactorPoseCal3Bundler;
+typedef gtsam::GeneralSFMFactor<gtsam::PinholePose<gtsam::Cal3Fisheye>,
+                                gtsam::Point3>
+    GeneralSFMFactorPoseCal3Fisheye;
+typedef gtsam::GeneralSFMFactor<gtsam::PinholePose<gtsam::Cal3Unified>,
+                                gtsam::Point3>
+    GeneralSFMFactorPoseCal3Unified;
 template <CALIBRATION = {gtsam::Cal3_S2, gtsam::Cal3DS2, gtsam::Cal3Bundler,
                          gtsam::Cal3Fisheye, gtsam::Cal3Unified}>
 virtual class GeneralSFMFactor2 : gtsam::NoiseModelFactor {
diff --git a/gtsam/slam/tests/testPriorFactor.cpp b/gtsam/slam/tests/testPriorFactor.cpp
index 2dc083cb23..d1a60e3461 100644
--- a/gtsam/slam/tests/testPriorFactor.cpp
+++ b/gtsam/slam/tests/testPriorFactor.cpp
@@ -5,12 +5,16 @@
  * @date   Nov 4, 2014
+#include <CppUnitLite/TestHarness.h>
 #include <gtsam/base/Vector.h>
+#include <gtsam/navigation/ImuBias.h>
 #include <gtsam/nonlinear/PriorFactor.h>
-#include <CppUnitLite/TestHarness.h>
+#include <gtsam/nonlinear/factorTesting.h>
 using namespace std;
+using namespace std::placeholders;
 using namespace gtsam;
+using namespace imuBias;
 /* ************************************************************************* */
@@ -23,16 +27,44 @@ TEST(PriorFactor, ConstructorScalar) {
 // Constructor vector3
 TEST(PriorFactor, ConstructorVector3) {
   SharedNoiseModel model = noiseModel::Isotropic::Sigma(3, 1.0);
-  PriorFactor<Vector3> factor(1, Vector3(1,2,3), model);
+  PriorFactor<Vector3> factor(1, Vector3(1, 2, 3), model);
 // Constructor dynamic sized vector
 TEST(PriorFactor, ConstructorDynamicSizeVector) {
-  Vector v(5); v << 1, 2, 3, 4, 5;
+  Vector v(5);
+  v << 1, 2, 3, 4, 5;
   SharedNoiseModel model = noiseModel::Isotropic::Sigma(5, 1.0);
   PriorFactor<Vector> factor(1, v, model);
+Vector callEvaluateError(const PriorFactor<ConstantBias>& factor,
+                         const ConstantBias& bias) {
+  return factor.evaluateError(bias);
+// Test for imuBias::ConstantBias
+TEST(PriorFactor, ConstantBias) {
+  Vector3 biasAcc(1, 2, 3);
+  Vector3 biasGyro(0.1, 0.2, 0.3);
+  ConstantBias bias(biasAcc, biasGyro);
+  PriorFactor<ConstantBias> factor(1, bias,
+                                   noiseModel::Isotropic::Sigma(6, 0.1));
+  Values values;
+  values.insert(1, bias);
+  EXPECT_DOUBLES_EQUAL(0.0, factor.error(values), 1e-8);
+  EXPECT_CORRECT_FACTOR_JACOBIANS(factor, values, 1e-7, 1e-5);
+  ConstantBias incorrectBias(
+      (Vector6() << 1.1, 2.1, 3.1, 0.2, 0.3, 0.4).finished());
+  values.clear();
+  values.insert(1, incorrectBias);
+  EXPECT_DOUBLES_EQUAL(3.0, factor.error(values), 1e-8);
+  EXPECT_CORRECT_FACTOR_JACOBIANS(factor, values, 1e-7, 1e-5);
 /* ************************************************************************* */
 int main() {
   TestResult tr;
diff --git a/gtsam/symbolic/tests/testSymbolicBayesNet.cpp b/gtsam/symbolic/tests/testSymbolicBayesNet.cpp
index 2e13be10eb..7795d5b89b 100644
--- a/gtsam/symbolic/tests/testSymbolicBayesNet.cpp
+++ b/gtsam/symbolic/tests/testSymbolicBayesNet.cpp
@@ -104,16 +104,16 @@ TEST(SymbolicBayesNet, Dot) {
          "digraph {\n"
          "  size=\"5,5\";\n"
-         "  vara1[label=\"a1\", pos=\"1,2!\", shape=box];\n"
-         "  vara2[label=\"a2\", pos=\"2,2!\", shape=box];\n"
-         "  varx1[label=\"x1\", pos=\"1,1!\"];\n"
-         "  varx2[label=\"x2\", pos=\"2,1!\"];\n"
-         "  varx3[label=\"x3\", pos=\"3,1!\"];\n"
+         "  var6989586621679009793[label=\"a1\", pos=\"1,2!\", shape=box];\n"
+         "  var6989586621679009794[label=\"a2\", pos=\"2,2!\", shape=box];\n"
+         "  var8646911284551352321[label=\"x1\", pos=\"1,1!\"];\n"
+         "  var8646911284551352322[label=\"x2\", pos=\"2,1!\"];\n"
+         "  var8646911284551352323[label=\"x3\", pos=\"3,1!\"];\n"
-         "  varx1->varx2\n"
-         "  vara1->varx2\n"
-         "  varx2->varx3\n"
-         "  vara2->varx3\n"
+         "  var8646911284551352321->var8646911284551352322\n"
+         "  var6989586621679009793->var8646911284551352322\n"
+         "  var8646911284551352322->var8646911284551352323\n"
+         "  var6989586621679009794->var8646911284551352323\n"
diff --git a/gtsam_unstable/gtsam_unstable.i b/gtsam_unstable/gtsam_unstable.i
index dd66e7a730..08cd45e186 100644
--- a/gtsam_unstable/gtsam_unstable.i
+++ b/gtsam_unstable/gtsam_unstable.i
@@ -797,4 +797,30 @@ virtual class ProjectionFactorPPPC : gtsam::NoiseModelFactor {
 typedef gtsam::ProjectionFactorPPPC<gtsam::Pose3, gtsam::Point3, gtsam::Cal3_S2> ProjectionFactorPPPCCal3_S2;
 typedef gtsam::ProjectionFactorPPPC<gtsam::Pose3, gtsam::Point3, gtsam::Cal3DS2> ProjectionFactorPPPCCal3DS2;
+#include <gtsam_unstable/slam/ProjectionFactorRollingShutter.h>
+virtual class ProjectionFactorRollingShutter : gtsam::NoiseModelFactor {
+  ProjectionFactorRollingShutter(const gtsam::Point2& measured, double alpha, const gtsam::noiseModel::Base* noiseModel,
+      size_t poseKey_a, size_t poseKey_b, size_t pointKey, const gtsam::Cal3_S2* K);
+  ProjectionFactorRollingShutter(const gtsam::Point2& measured, double alpha, const gtsam::noiseModel::Base* noiseModel,
+    size_t poseKey_a, size_t poseKey_b, size_t pointKey, const gtsam::Cal3_S2* K, gtsam::Pose3& body_P_sensor);
+  ProjectionFactorRollingShutter(const gtsam::Point2& measured, double alpha, const gtsam::noiseModel::Base* noiseModel,
+        size_t poseKey_a, size_t poseKey_b, size_t pointKey, const gtsam::Cal3_S2* K, bool throwCheirality,
+        bool verboseCheirality);
+  ProjectionFactorRollingShutter(const gtsam::Point2& measured, double alpha, const gtsam::noiseModel::Base* noiseModel,
+      size_t poseKey_a, size_t poseKey_b, size_t pointKey, const gtsam::Cal3_S2* K, bool throwCheirality,
+      bool verboseCheirality, gtsam::Pose3& body_P_sensor);
+  gtsam::Point2 measured() const;
+  double alpha() const;
+  gtsam::Cal3_S2* calibration() const;
+  bool verboseCheirality() const;
+  bool throwCheirality() const;
+  // enabling serialization functionality
+  void serialize() const;
 } //\namespace gtsam
diff --git a/gtsam_unstable/slam/InvDepthFactor3.h b/gtsam_unstable/slam/InvDepthFactor3.h
index 3fd86f271e..44d3b8fd04 100644
--- a/gtsam_unstable/slam/InvDepthFactor3.h
+++ b/gtsam_unstable/slam/InvDepthFactor3.h
@@ -92,7 +92,7 @@ class InvDepthFactor3: public NoiseModelFactor3<POSE, LANDMARK, INVDEPTH> {
     } catch( CheiralityException& e) {
       if (H1) *H1 = Matrix::Zero(2,6);
       if (H2) *H2 = Matrix::Zero(2,5);
-      if (H3) *H2 = Matrix::Zero(2,1);
+      if (H3) *H3 = Matrix::Zero(2,1);
       std::cout << e.what() << ": Landmark "<< DefaultKeyFormatter(this->key2()) <<
           " moved behind camera " << DefaultKeyFormatter(this->key1()) << std::endl;
       return Vector::Ones(2) * 2.0 * K_->fx();
diff --git a/gtsam_unstable/slam/tests/testInvDepthFactor3.cpp b/gtsam_unstable/slam/tests/testInvDepthFactor3.cpp
index 14ad43ae26..8a81c1f245 100644
--- a/gtsam_unstable/slam/tests/testInvDepthFactor3.cpp
+++ b/gtsam_unstable/slam/tests/testInvDepthFactor3.cpp
@@ -1,8 +1,18 @@
- * testInvDepthFactor.cpp
+/* ----------------------------------------------------------------------------
+ * GTSAM Copyright 2010, Georgia Tech Research Corporation,
+ * Atlanta, Georgia 30332-0415
+ * All Rights Reserved
+ * Authors: Frank Dellaert, et al. (see THANKS for the full author list)
+ * See LICENSE for the license information
+ * -------------------------------------------------------------------------- */
+ *  @file  testInvDepthFactor3.cpp
+ *  @brief Unit tests inverse depth parametrization
- *  Created on: Apr 13, 2012
- *      Author: cbeall3
+ *  @author cbeall3
+ *  @author Dominik Van Opdenbosch
+ *  @date   Apr 13, 2012
 #include <CppUnitLite/TestHarness.h>
@@ -12,6 +22,7 @@
 #include <gtsam/nonlinear/NonlinearFactorGraph.h>
 #include <gtsam/nonlinear/LevenbergMarquardtOptimizer.h>
 #include <gtsam/inference/Symbol.h>
+#include <gtsam/base/numericalDerivative.h>
 #include <gtsam_unstable/slam/InvDepthFactor3.h>
@@ -28,6 +39,11 @@ PinholeCamera<Cal3_S2> level_camera(level_pose, *K);
 typedef InvDepthFactor3<Pose3, Vector5, double> InverseDepthFactor;
 typedef NonlinearEquality<Pose3> PoseConstraint;
+Matrix factorError(const Pose3& pose, const Vector5& point, double invDepth,
+                     const InverseDepthFactor& factor) {
+  return factor.evaluateError(pose, point, invDepth);
 /* ************************************************************************* */
 TEST( InvDepthFactor, optimize) {
@@ -92,6 +108,55 @@ TEST( InvDepthFactor, optimize) {
+/* ************************************************************************* */
+TEST( InvDepthFactor, Jacobian3D ) {
+  // landmark 5 meters infront of camera (camera center at (0,0,1))
+  Point3 landmark(5, 0, 1);
+  // get expected projection using pinhole camera
+  Point2 expected_uv = level_camera.project(landmark);
+  // get expected landmark representation using backprojection
+  double inv_depth;
+  Vector5 inv_landmark;
+  InvDepthCamera3<Cal3_S2> inv_camera(level_pose, K);
+  std::tie(inv_landmark, inv_depth) = inv_camera.backproject(expected_uv, 5);
+  Vector5 expected_inv_landmark((Vector(5) << 0., 0., 1., 0., 0.).finished());
+  CHECK(assert_equal(expected_inv_landmark, inv_landmark, 1e-6));
+  CHECK(assert_equal(inv_depth, 1./5, 1e-6));
+  Symbol poseKey('x',1);
+  Symbol pointKey('l',1);
+  Symbol invDepthKey('d',1);
+  InverseDepthFactor factor(expected_uv, sigma, poseKey, pointKey, invDepthKey, K);
+  std::vector<Matrix> actualHs(3);
+  factor.unwhitenedError({{poseKey, genericValue(level_pose)},
+                          {pointKey, genericValue(inv_landmark)},
+                          {invDepthKey,genericValue(inv_depth)}},
+                         actualHs);
+  const Matrix& H1Actual = actualHs.at(0);
+  const Matrix& H2Actual = actualHs.at(1);
+  const Matrix& H3Actual = actualHs.at(2);
+  // Use numerical derivatives to verify the Jacobians
+  Matrix H1Expected, H2Expected, H3Expected;
+  std::function<Matrix(const Pose3 &, const Vector5 &, const double &)>
+      func = std::bind(&factorError, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, factor);
+  H1Expected = numericalDerivative31(func, level_pose, inv_landmark, inv_depth);
+  H2Expected = numericalDerivative32(func, level_pose, inv_landmark, inv_depth);
+  H3Expected = numericalDerivative33(func, level_pose, inv_landmark, inv_depth);
+  // Verify the Jacobians
+  CHECK(assert_equal(H1Expected, H1Actual, 1e-6))
+  CHECK(assert_equal(H2Expected, H2Actual, 1e-6))
+  CHECK(assert_equal(H3Expected, H3Actual, 1e-6))
 /* ************************************************************************* */
 int main() { TestResult tr; return TestRegistry::runAllTests(tr);}
 /* ************************************************************************* */
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index f5869b1450..c14f02ddab 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -47,7 +47,9 @@ set(ignore
+    gtsam::BinaryMeasurementsPoint3
+    gtsam::BinaryMeasurementsRot3
@@ -98,11 +100,23 @@ set(GTSAM_MODULE_PATH ${GTSAM_PYTHON_BUILD_DIRECTORY}/gtsam)
-# Hack to get python test files copied every time they are modified
+# Hack to get python test and util files copied every time they are modified
 foreach(test_file ${GTSAM_PYTHON_TEST_FILES})
         configure_file(${test_file} "${GTSAM_MODULE_PATH}/tests/${test_file}" COPYONLY)
+foreach(util_file ${GTSAM_PYTHON_UTIL_FILES})
+        configure_file(${util_file} "${GTSAM_MODULE_PATH}/utils/${test_file}" COPYONLY)
+foreach(util_file ${GTSAM_PYTHON_PREAMBLE_FILES})
+        configure_file(${util_file} "${GTSAM_MODULE_PATH}/preamble/${test_file}" COPYONLY)
+        configure_file(${util_file} "${GTSAM_MODULE_PATH}/specializations/${test_file}" COPYONLY)
 # Common directory for data/datasets stored with the package.
 # This will store the data in the Python site package directly.
@@ -124,7 +138,9 @@ if(GTSAM_UNSTABLE_BUILD_PYTHON)
+            gtsam::BinaryMeasurementsPoint3
+            gtsam::BinaryMeasurementsRot3
@@ -160,7 +176,7 @@ if(GTSAM_UNSTABLE_BUILD_PYTHON)
     # Hack to get python test files copied every time they are modified
-    foreach(test_file ${GTSAM_PYTHON_TEST_FILES})
+    foreach(test_file ${GTSAM_UNSTABLE_PYTHON_TEST_FILES})
         configure_file(${test_file} "${GTSAM_UNSTABLE_MODULE_PATH}/tests/${test_file}" COPYONLY)
@@ -172,7 +188,7 @@ endif()
 # Add custom target so we can install with `make python-install`
 set(GTSAM_PYTHON_INSTALL_TARGET python-install)
-        COMMAND ${PYTHON_EXECUTABLE} -m pip install --user .
+        COMMAND ${PYTHON_EXECUTABLE} -m pip install .
diff --git a/python/README.md b/python/README.md
index 54436df93b..278d620948 100644
--- a/python/README.md
+++ b/python/README.md
@@ -8,6 +8,7 @@ For instructions on updating the version of the [wrap library](https://github.co
 ## Requirements
+- Cmake >= 3.15
 - If you want to build the GTSAM python library for a specific python version (eg 3.6),
   use the `-DGTSAM_PYTHON_VERSION=3.6` option when running `cmake` otherwise the default interpreter will be used.
 - If the interpreter is inside an environment (such as an anaconda environment or virtualenv environment),
diff --git a/python/gtsam/examples/TranslationAveragingExample.py b/python/gtsam/examples/TranslationAveragingExample.py
index 054b61126c..92a7d04e35 100644
--- a/python/gtsam/examples/TranslationAveragingExample.py
+++ b/python/gtsam/examples/TranslationAveragingExample.py
@@ -123,7 +123,7 @@ def estimate_poses(i_iZj_list: gtsam.BinaryMeasurementsUnit3,
     w_iZj_inliers = filter_outliers(w_iZj_list)
     # Run the optimizer to obtain translations for normalized directions.
-    wtc_values = gtsam.TranslationRecovery(w_iZj_inliers).run()
+    wtc_values = gtsam.TranslationRecovery().run(w_iZj_inliers)
     wTc_values = gtsam.Values()
     for key in wRc_values.keys():
diff --git a/python/gtsam/notebooks/ellipses.ipynb b/python/gtsam/notebooks/ellipses.ipynb
new file mode 100644
index 0000000000..06938f6968
--- /dev/null
+++ b/python/gtsam/notebooks/ellipses.ipynb
@@ -0,0 +1,133 @@
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Ellipse Scaling\n",
+    "\n",
+    "The code to calculate the percentages included in ellipses with various values of \"k\" in `plot.py`.\n",
+    "\n",
+    "Thanks to @senselessDev, January 26, for providing the code in [PR #1067](https://github.com/borglab/gtsam/pull/1067)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import scipy\n",
+    "import scipy.stats\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pct_to_sigma(pct, dof):\n",
+    "    return np.sqrt(scipy.stats.chi2.ppf(pct / 100., df=dof))\n",
+    "\n",
+    "def sigma_to_pct(sigma, dof):\n",
+    "    return scipy.stats.chi2.cdf(sigma**2, df=dof) * 100."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0D\t    1    \t    2    \t    3    \t    4    \t    5    \n",
+      "1D\t68.26895%\t95.44997%\t99.73002%\t99.99367%\t99.99994%\n",
+      "2D\t39.34693%\t86.46647%\t98.88910%\t99.96645%\t99.99963%\n",
+      "3D\t19.87480%\t73.85359%\t97.07091%\t99.88660%\t99.99846%\n"
+     ]
+    }
+   ],
+   "source": [
+    "for dof in range(0, 4):\n",
+    "    print(\"{}D\".format(dof), end=\"\")\n",
+    "    for sigma in range(1, 6):\n",
+    "        if dof == 0: print(\"\\t    {}    \".format(sigma), end=\"\")\n",
+    "        else: print(\"\\t{:.5f}%\".format(sigma_to_pct(sigma, dof)), end=\"\")\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1D\n",
+      "\n",
+      "pct=50.0 -> sigma=0.674489750196\n",
+      "pct=95.0 -> sigma=1.959963984540\n",
+      "pct=99.0 -> sigma=2.575829303549\n",
+      "\n",
+      "2D\n",
+      "\n",
+      "pct=50.0 -> sigma=1.177410022515\n",
+      "pct=95.0 -> sigma=2.447746830681\n",
+      "pct=99.0 -> sigma=3.034854258770\n",
+      "\n",
+      "3D\n",
+      "\n",
+      "pct=50.0 -> sigma=1.538172254455\n",
+      "pct=95.0 -> sigma=2.795483482915\n",
+      "pct=99.0 -> sigma=3.368214175219\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "for dof in range(1, 4):\n",
+    "    print(\"{}D\\n\".format(dof))\n",
+    "    for pct in [50, 95, 99]:\n",
+    "        print(\"pct={:.1f} -> sigma={:.12f}\".format(pct, pct_to_sigma(pct, dof)))\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "4d608302ba82e7596903db5446e6fa05f049271852e8cc6e1cafaafe5fbd9fed"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.13 ('gtsfm-v1')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
diff --git a/python/gtsam/preamble/geometry.h b/python/gtsam/preamble/geometry.h
index 35fe2a577a..bd0441d067 100644
--- a/python/gtsam/preamble/geometry.h
+++ b/python/gtsam/preamble/geometry.h
@@ -23,8 +23,8 @@ PYBIND11_MAKE_OPAQUE(
     std::vector<gtsam::Point2, Eigen::aligned_allocator<gtsam::Point2>>);
-    gtsam::CameraSet<gtsam::PinholeCamera<gtsam::Cal3Bundler>>);
diff --git a/python/gtsam/preamble/sfm.h b/python/gtsam/preamble/sfm.h
index a34e730580..8ff0ea82ee 100644
--- a/python/gtsam/preamble/sfm.h
+++ b/python/gtsam/preamble/sfm.h
@@ -9,4 +9,18 @@
  * automatic STL binding, such that the raw objects can be accessed in Python.
  * Without this they will be automatically converted to a Python object, and all
  * mutations on Python side will not be reflected on C++.
- */
\ No newline at end of file
+ */
+// Including <stl.h> can cause some serious hard-to-debug bugs!!!
+// #include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+    std::vector<gtsam::SfmTrack>);
+    std::vector<gtsam::SfmCamera>);
+    std::vector<gtsam::BinaryMeasurement<gtsam::Unit3>>);
+    std::vector<gtsam::BinaryMeasurement<gtsam::Rot3>>);
diff --git a/python/gtsam/specializations/geometry.h b/python/gtsam/specializations/geometry.h
index a492ce8eb2..99f40253fb 100644
--- a/python/gtsam/specializations/geometry.h
+++ b/python/gtsam/specializations/geometry.h
@@ -16,10 +16,13 @@ py::bind_vector<
     m_, "Point2Vector");
 py::bind_vector<std::vector<gtsam::Point2Pair>>(m_, "Point2Pairs");
 py::bind_vector<std::vector<gtsam::Point3Pair>>(m_, "Point3Pairs");
+py::bind_vector<std::vector<gtsam::Pose2Pair>>(m_, "Pose2Pairs");
 py::bind_vector<std::vector<gtsam::Pose3Pair>>(m_, "Pose3Pairs");
 py::bind_vector<std::vector<gtsam::Pose3>>(m_, "Pose3Vector");
     m_, "CameraSetCal3_S2");
+    m_, "CameraSetCal3DS2");
     m_, "CameraSetCal3Bundler");
diff --git a/python/gtsam/specializations/sfm.h b/python/gtsam/specializations/sfm.h
index 6de15217fb..311b2c59b4 100644
--- a/python/gtsam/specializations/sfm.h
+++ b/python/gtsam/specializations/sfm.h
@@ -11,6 +11,23 @@
  * and saves one copy operation.
+py::bind_vector<std::vector<gtsam::BinaryMeasurement<gtsam::Point3> > >(
+    m_, "BinaryMeasurementsPoint3");
 py::bind_vector<std::vector<gtsam::BinaryMeasurement<gtsam::Unit3> > >(
     m_, "BinaryMeasurementsUnit3");
+py::bind_vector<std::vector<gtsam::BinaryMeasurement<gtsam::Rot3> > >(
+    m_, "BinaryMeasurementsRot3");
 py::bind_map<gtsam::KeyPairDoubleMap>(m_, "KeyPairDoubleMap");
+    std::vector<gtsam::SfmTrack> >(
+    m_, "SfmTracks");
+    std::vector<gtsam::SfmCamera> >(
+    m_, "SfmCameras");
+    std::vector<std::pair<size_t, gtsam::Point2>>>(
+        m_, "SfmMeasurementVector"
+    );
diff --git a/python/gtsam/tests/test_Cal3Unified.py b/python/gtsam/tests/test_Cal3Unified.py
index bafbacfa40..630109d667 100644
--- a/python/gtsam/tests/test_Cal3Unified.py
+++ b/python/gtsam/tests/test_Cal3Unified.py
@@ -139,6 +139,17 @@ def test_jacobian(self):
         self.gtsamAssertEquals(z, np.zeros(2))
         self.gtsamAssertEquals(H @ H.T, 4*np.eye(2))
+        Dcal = np.zeros((2, 10), order='F')
+        Dp = np.zeros((2, 2), order='F')
+        camera.calibrate(img_point, Dcal, Dp)
+        self.gtsamAssertEquals(Dcal, np.array(
+            [[ 0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0.],
+            [ 0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.]]))
+        self.gtsamAssertEquals(Dp, np.array(
+            [[ 1., -0.],
+            [-0.,  1.]]))
     @unittest.skip("triangulatePoint3 currently seems to require perspective projections.")
     def test_triangulation(self):
         """Estimate spatial point from image measurements"""
diff --git a/python/gtsam/tests/test_DiscreteBayesNet.py b/python/gtsam/tests/test_DiscreteBayesNet.py
index 74191dcc7a..10c5db612a 100644
--- a/python/gtsam/tests/test_DiscreteBayesNet.py
+++ b/python/gtsam/tests/test_DiscreteBayesNet.py
@@ -11,12 +11,12 @@
 # pylint: disable=no-name-in-module, invalid-name
-import unittest
 import textwrap
+import unittest
 import gtsam
-from gtsam import (DiscreteBayesNet, DiscreteConditional, DiscreteFactorGraph,
-                   DiscreteKeys, DiscreteDistribution, DiscreteValues, Ordering)
+from gtsam import (DiscreteBayesNet, DiscreteConditional, DiscreteDistribution,
+                   DiscreteFactorGraph, DiscreteKeys, DiscreteValues, Ordering)
 from gtsam.utils.test_case import GtsamTestCase
 # Some keys:
@@ -152,10 +152,10 @@ def test_dot(self):
-              vara0[label="a0", pos="0,2!"];
+              var6989586621679009792[label="a0", pos="0,2!"];
-              vara0->var3
+              var6989586621679009792->var3
diff --git a/python/gtsam/tests/test_NonlinearOptimizer.py b/python/gtsam/tests/test_NonlinearOptimizer.py
index e2561ae52a..37afd9e1c0 100644
--- a/python/gtsam/tests/test_NonlinearOptimizer.py
+++ b/python/gtsam/tests/test_NonlinearOptimizer.py
@@ -15,12 +15,10 @@
 import unittest
 import gtsam
-from gtsam import (DoglegOptimizer, DoglegParams,
-                   DummyPreconditionerParameters, GaussNewtonOptimizer,
-                   GaussNewtonParams, GncLMParams, GncLMOptimizer,
-                   LevenbergMarquardtOptimizer, LevenbergMarquardtParams,
-                   NonlinearFactorGraph, Ordering,
-                   PCGSolverParameters, Point2, PriorFactorPoint2, Values)
+from gtsam import (DoglegOptimizer, DoglegParams, DummyPreconditionerParameters,
+                   GaussNewtonOptimizer, GaussNewtonParams, GncLMParams, GncLMOptimizer,
+                   LevenbergMarquardtOptimizer, LevenbergMarquardtParams, NonlinearFactorGraph,
+                   Ordering, PCGSolverParameters, Point2, PriorFactorPoint2, Values)
 from gtsam.utils.test_case import GtsamTestCase
 KEY1 = 1
@@ -28,63 +26,83 @@
 class TestScenario(GtsamTestCase):
-    def test_optimize(self):
-        """Do trivial test with three optimizer variants."""
-        fg = NonlinearFactorGraph()
+    """Do trivial test with three optimizer variants."""
+    def setUp(self):
+        """Set up the optimization problem and ordering"""
+        # create graph
+        self.fg = NonlinearFactorGraph()
         model = gtsam.noiseModel.Unit.Create(2)
-        fg.add(PriorFactorPoint2(KEY1, Point2(0, 0), model))
+        self.fg.add(PriorFactorPoint2(KEY1, Point2(0, 0), model))
         # test error at minimum
         xstar = Point2(0, 0)
-        optimal_values = Values()
-        optimal_values.insert(KEY1, xstar)
-        self.assertEqual(0.0, fg.error(optimal_values), 0.0)
+        self.optimal_values = Values()
+        self.optimal_values.insert(KEY1, xstar)
+        self.assertEqual(0.0, self.fg.error(self.optimal_values), 0.0)
         # test error at initial = [(1-cos(3))^2 + (sin(3))^2]*50 =
         x0 = Point2(3, 3)
-        initial_values = Values()
-        initial_values.insert(KEY1, x0)
-        self.assertEqual(9.0, fg.error(initial_values), 1e-3)
+        self.initial_values = Values()
+        self.initial_values.insert(KEY1, x0)
+        self.assertEqual(9.0, self.fg.error(self.initial_values), 1e-3)
         # optimize parameters
-        ordering = Ordering()
-        ordering.push_back(KEY1)
+        self.ordering = Ordering()
+        self.ordering.push_back(KEY1)
-        # Gauss-Newton
+    def test_gauss_newton(self):
         gnParams = GaussNewtonParams()
-        gnParams.setOrdering(ordering)
-        actual1 = GaussNewtonOptimizer(fg, initial_values, gnParams).optimize()
-        self.assertAlmostEqual(0, fg.error(actual1))
+        gnParams.setOrdering(self.ordering)
+        actual = GaussNewtonOptimizer(self.fg, self.initial_values, gnParams).optimize()
+        self.assertAlmostEqual(0, self.fg.error(actual))
-        # Levenberg-Marquardt
+    def test_levenberg_marquardt(self):
         lmParams = LevenbergMarquardtParams.CeresDefaults()
-        lmParams.setOrdering(ordering)
-        actual2 = LevenbergMarquardtOptimizer(
-            fg, initial_values, lmParams).optimize()
-        self.assertAlmostEqual(0, fg.error(actual2))
+        lmParams.setOrdering(self.ordering)
+        actual = LevenbergMarquardtOptimizer(self.fg, self.initial_values, lmParams).optimize()
+        self.assertAlmostEqual(0, self.fg.error(actual))
-        # Levenberg-Marquardt
+    def test_levenberg_marquardt_pcg(self):
         lmParams = LevenbergMarquardtParams.CeresDefaults()
         cgParams = PCGSolverParameters()
-        actual2 = LevenbergMarquardtOptimizer(
-            fg, initial_values, lmParams).optimize()
-        self.assertAlmostEqual(0, fg.error(actual2))
+        actual = LevenbergMarquardtOptimizer(self.fg, self.initial_values, lmParams).optimize()
+        self.assertAlmostEqual(0, self.fg.error(actual))
-        # Dogleg
+    def test_dogleg(self):
         dlParams = DoglegParams()
-        dlParams.setOrdering(ordering)
-        actual3 = DoglegOptimizer(fg, initial_values, dlParams).optimize()
-        self.assertAlmostEqual(0, fg.error(actual3))
-        # Graduated Non-Convexity (GNC)
-        gncParams = GncLMParams()
-        actual4 = GncLMOptimizer(fg, initial_values, gncParams).optimize()
-        self.assertAlmostEqual(0, fg.error(actual4))
+        dlParams.setOrdering(self.ordering)
+        actual = DoglegOptimizer(self.fg, self.initial_values, dlParams).optimize()
+        self.assertAlmostEqual(0, self.fg.error(actual))
+    def test_graduated_non_convexity(self):
+        gncParams = GncLMParams()
+        actual = GncLMOptimizer(self.fg, self.initial_values, gncParams).optimize()
+        self.assertAlmostEqual(0, self.fg.error(actual))
+    def test_iteration_hook(self):
+        # set up iteration hook to track some testable values
+        iteration_count = 0
+        final_error = 0
+        final_values = None
+        def iteration_hook(iter, error_before, error_after):
+            nonlocal iteration_count, final_error, final_values
+            iteration_count = iter
+            final_error = error_after
+            final_values = optimizer.values()
+        # optimize
+        params = LevenbergMarquardtParams.CeresDefaults()
+        params.setOrdering(self.ordering)
+        params.iterationHook = iteration_hook
+        optimizer = LevenbergMarquardtOptimizer(self.fg, self.initial_values, params)
+        actual = optimizer.optimize()
+        self.assertAlmostEqual(0, self.fg.error(actual))
+        self.gtsamAssertEquals(final_values, actual)
+        self.assertEqual(self.fg.error(actual), final_error)
+        self.assertEqual(optimizer.iterations(), iteration_count)
 if __name__ == "__main__":
diff --git a/python/gtsam/tests/test_PinholeCamera.py b/python/gtsam/tests/test_PinholeCamera.py
new file mode 100644
index 0000000000..392d48d3fb
--- /dev/null
+++ b/python/gtsam/tests/test_PinholeCamera.py
@@ -0,0 +1,46 @@
+GTSAM Copyright 2010-2019, Georgia Tech Research Corporation,
+Atlanta, Georgia 30332-0415
+All Rights Reserved
+See LICENSE for the license information
+PinholeCamera unit tests.
+Author: Fan Jiang
+import unittest
+from math import pi
+import numpy as np
+import gtsam
+from gtsam.utils.test_case import GtsamTestCase
+class TestPinholeCamera(GtsamTestCase):
+    """
+    Tests if we can correctly get the camera Jacobians in Python
+    """
+    def test_jacobian(self):
+        cam1 = gtsam.PinholeCameraCal3Bundler()
+        # order is important because Eigen is column major!
+        Dpose = np.zeros((2, 6), order='F')
+        Dpoint = np.zeros((2, 3), order='F')
+        Dcal = np.zeros((2, 3), order='F')
+        cam1.project(np.array([1, 1, 1]), Dpose, Dpoint, Dcal)
+        self.gtsamAssertEquals(Dpoint, np.array([[1, 0, -1], [0, 1, -1]]))
+        self.gtsamAssertEquals(
+            Dpose,
+            np.array([
+                [1., -2., 1., -1., 0., 1.],  #
+                [2., -1., -1., 0., -1., 1.]
+            ]))
+        self.gtsamAssertEquals(Dcal, np.array([[1., 2., 4.], [1., 2., 4.]]))
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/gtsam/tests/test_Pose2.py b/python/gtsam/tests/test_Pose2.py
index 860487db2e..d3a51d6382 100644
--- a/python/gtsam/tests/test_Pose2.py
+++ b/python/gtsam/tests/test_Pose2.py
@@ -70,27 +70,36 @@ def test_align(self) -> None:
         pts_a = [
-            Point2(3, 1),
-            Point2(1, 1),
-            Point2(1, 3),
-            Point2(3, 3),
-        ]
-        pts_b = [
             Point2(1, -3),
             Point2(1, -5),
             Point2(-1, -5),
             Point2(-1, -3),
+        pts_b = [
+            Point2(3, 1),
+            Point2(1, 1),
+            Point2(1, 3),
+            Point2(3, 3),
+        ]
         # fmt: on
         ab_pairs = Point2Pairs(list(zip(pts_a, pts_b)))
-        bTa = gtsam.align(ab_pairs)
-        aTb = bTa.inverse()
-        assert aTb is not None
+        aTb = Pose2.Align(ab_pairs)
+        self.assertIsNotNone(aTb)
+        for pt_a, pt_b in zip(pts_a, pts_b):
+            pt_a_ = aTb.transformFrom(pt_b)
+            np.testing.assert_allclose(pt_a, pt_a_)
+        # Matrix version
+        A = np.array(pts_a).T
+        B = np.array(pts_b).T
+        aTb = Pose2.Align(A, B)
+        self.assertIsNotNone(aTb)
         for pt_a, pt_b in zip(pts_a, pts_b):
             pt_a_ = aTb.transformFrom(pt_b)
-            assert np.allclose(pt_a, pt_a_)
+            np.testing.assert_allclose(pt_a, pt_a_)
 if __name__ == "__main__":
diff --git a/python/gtsam/tests/test_Sim2.py b/python/gtsam/tests/test_Sim2.py
new file mode 100644
index 0000000000..ea809b9656
--- /dev/null
+++ b/python/gtsam/tests/test_Sim2.py
@@ -0,0 +1,194 @@
+GTSAM Copyright 2010-2019, Georgia Tech Research Corporation,
+Atlanta, Georgia 30332-0415
+All Rights Reserved
+See LICENSE for the license information
+Sim3 unit tests.
+Author: John Lambert
+# pylint: disable=no-name-in-module
+import unittest
+import numpy as np
+from gtsam import Pose2, Pose2Pairs, Rot2, Similarity2
+from gtsam.utils.test_case import GtsamTestCase
+class TestSim2(GtsamTestCase):
+    """Test selected Sim2 methods."""
+    def test_align_poses_along_straight_line(self) -> None:
+        """Test Align Pose2Pairs method.
+        Scenario:
+           3 object poses
+           same scale (no gauge ambiguity)
+           world frame has poses rotated about 180 degrees.
+           world and egovehicle frame translated by 15 meters w.r.t. each other
+        """
+        R180 = Rot2.fromDegrees(180)
+        # Create source poses (three objects o1, o2, o3 living in the egovehicle "e" frame)
+        # Suppose they are 3d cuboids detected by an onboard sensor in the egovehicle frame
+        eTo0 = Pose2(Rot2(), np.array([5, 0]))
+        eTo1 = Pose2(Rot2(), np.array([10, 0]))
+        eTo2 = Pose2(Rot2(), np.array([15, 0]))
+        eToi_list = [eTo0, eTo1, eTo2]
+        # Create destination poses
+        # (same three objects, but instead living in the world "w" frame)
+        wTo0 = Pose2(R180, np.array([-10, 0]))
+        wTo1 = Pose2(R180, np.array([-5, 0]))
+        wTo2 = Pose2(R180, np.array([0, 0]))
+        wToi_list = [wTo0, wTo1, wTo2]
+        we_pairs = Pose2Pairs(list(zip(wToi_list, eToi_list)))
+        # Recover the transformation wSe (i.e. world_S_egovehicle)
+        wSe = Similarity2.Align(we_pairs)
+        for wToi, eToi in zip(wToi_list, eToi_list):
+            self.gtsamAssertEquals(wToi, wSe.transformFrom(eToi))
+    def test_align_poses_along_straight_line_gauge(self):
+        """Test if Align Pose3Pairs method can account for gauge ambiguity.
+        Scenario:
+           3 object poses
+           with gauge ambiguity (2x scale)
+           world frame has poses rotated by 90 degrees.
+           world and egovehicle frame translated by 11 meters w.r.t. each other
+        """
+        R90 = Rot2.fromDegrees(90)
+        # Create source poses (three objects o1, o2, o3 living in the egovehicle "e" frame)
+        # Suppose they are 3d cuboids detected by an onboard sensor in the egovehicle frame
+        eTo0 = Pose2(Rot2(), np.array([1, 0]))
+        eTo1 = Pose2(Rot2(), np.array([2, 0]))
+        eTo2 = Pose2(Rot2(), np.array([4, 0]))
+        eToi_list = [eTo0, eTo1, eTo2]
+        # Create destination poses
+        # (same three objects, but instead living in the world/city "w" frame)
+        wTo0 = Pose2(R90, np.array([0, 12]))
+        wTo1 = Pose2(R90, np.array([0, 14]))
+        wTo2 = Pose2(R90, np.array([0, 18]))
+        wToi_list = [wTo0, wTo1, wTo2]
+        we_pairs = Pose2Pairs(list(zip(wToi_list, eToi_list)))
+        # Recover the transformation wSe (i.e. world_S_egovehicle)
+        wSe = Similarity2.Align(we_pairs)
+        for wToi, eToi in zip(wToi_list, eToi_list):
+            self.gtsamAssertEquals(wToi, wSe.transformFrom(eToi))
+    def test_align_poses_scaled_squares(self):
+        """Test if Align Pose2Pairs method can account for gauge ambiguity.
+        Make sure a big and small square can be aligned.
+        The u's represent a big square (10x10), and v's represents a small square (4x4).
+        Scenario:
+           4 object poses
+           with gauge ambiguity (2.5x scale)
+        """
+        # 0, 90, 180, and 270 degrees yaw
+        R0 = Rot2.fromDegrees(0)
+        R90 = Rot2.fromDegrees(90)
+        R180 = Rot2.fromDegrees(180)
+        R270 = Rot2.fromDegrees(270)
+        aTi0 = Pose2(R0, np.array([2, 3]))
+        aTi1 = Pose2(R90, np.array([12, 3]))
+        aTi2 = Pose2(R180, np.array([12, 13]))
+        aTi3 = Pose2(R270, np.array([2, 13]))
+        aTi_list = [aTi0, aTi1, aTi2, aTi3]
+        bTi0 = Pose2(R0, np.array([4, 3]))
+        bTi1 = Pose2(R90, np.array([8, 3]))
+        bTi2 = Pose2(R180, np.array([8, 7]))
+        bTi3 = Pose2(R270, np.array([4, 7]))
+        bTi_list = [bTi0, bTi1, bTi2, bTi3]
+        ab_pairs = Pose2Pairs(list(zip(aTi_list, bTi_list)))
+        # Recover the transformation wSe (i.e. world_S_egovehicle)
+        aSb = Similarity2.Align(ab_pairs)
+        for aTi, bTi in zip(aTi_list, bTi_list):
+            self.gtsamAssertEquals(aTi, aSb.transformFrom(bTi))
+    def test_constructor(self) -> None:
+        """Sim(2) to perform p_b = bSa * p_a"""
+        bRa = Rot2()
+        bta = np.array([1, 2])
+        bsa = 3.0
+        bSa = Similarity2(R=bRa, t=bta, s=bsa)
+        self.assertIsInstance(bSa, Similarity2)
+        np.testing.assert_allclose(bSa.rotation().matrix(), bRa.matrix())
+        np.testing.assert_allclose(bSa.translation(), bta)
+        np.testing.assert_allclose(bSa.scale(), bsa)
+    def test_is_eq(self) -> None:
+        """Ensure object equality works properly (are equal)."""
+        bSa = Similarity2(R=Rot2(), t=np.array([1, 2]), s=3.0)
+        bSa_ = Similarity2(R=Rot2(), t=np.array([1.0, 2.0]), s=3)
+        self.gtsamAssertEquals(bSa, bSa_)
+    def test_not_eq_translation(self) -> None:
+        """Ensure object equality works properly (not equal translation)."""
+        bSa = Similarity2(R=Rot2(), t=np.array([2, 1]), s=3.0)
+        bSa_ = Similarity2(R=Rot2(), t=np.array([1.0, 2.0]), s=3)
+        self.assertNotEqual(bSa, bSa_)
+    def test_not_eq_rotation(self) -> None:
+        """Ensure object equality works properly (not equal rotation)."""
+        bSa = Similarity2(R=Rot2(), t=np.array([2, 1]), s=3.0)
+        bSa_ = Similarity2(R=Rot2.fromDegrees(180), t=np.array([2.0, 1.0]), s=3)
+        self.assertNotEqual(bSa, bSa_)
+    def test_not_eq_scale(self) -> None:
+        """Ensure object equality works properly (not equal scale)."""
+        bSa = Similarity2(R=Rot2(), t=np.array([2, 1]), s=3.0)
+        bSa_ = Similarity2(R=Rot2(), t=np.array([2.0, 1.0]), s=1.0)
+        self.assertNotEqual(bSa, bSa_)
+    def test_rotation(self) -> None:
+        """Ensure rotation component is returned properly."""
+        R = Rot2.fromDegrees(90)
+        t = np.array([1, 2])
+        bSa = Similarity2(R=R, t=t, s=3.0)
+        # evaluates to [[0, -1], [1, 0]]
+        expected_R = Rot2.fromDegrees(90)
+        np.testing.assert_allclose(expected_R.matrix(), bSa.rotation().matrix())
+    def test_translation(self) -> None:
+        """Ensure translation component is returned properly."""
+        R = Rot2.fromDegrees(90)
+        t = np.array([1, 2])
+        bSa = Similarity2(R=R, t=t, s=3.0)
+        expected_t = np.array([1, 2])
+        np.testing.assert_allclose(expected_t, bSa.translation())
+    def test_scale(self) -> None:
+        """Ensure the scale factor is returned properly."""
+        bRa = Rot2()
+        bta = np.array([1, 2])
+        bsa = 3.0
+        bSa = Similarity2(R=bRa, t=bta, s=bsa)
+        self.assertEqual(bSa.scale(), 3.0)
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/gtsam/tests/test_TranslationRecovery.py b/python/gtsam/tests/test_TranslationRecovery.py
index 0fb0518b60..99fbce89e2 100644
--- a/python/gtsam/tests/test_TranslationRecovery.py
+++ b/python/gtsam/tests/test_TranslationRecovery.py
@@ -34,8 +34,10 @@ class TestTranslationRecovery(unittest.TestCase):
     def test_constructor(self):
         """Construct from binary measurements."""
-        algorithm = gtsam.TranslationRecovery(gtsam.BinaryMeasurementsUnit3())
+        algorithm = gtsam.TranslationRecovery()
         self.assertIsInstance(algorithm, gtsam.TranslationRecovery)
+        algorithm_params = gtsam.TranslationRecovery(gtsam.LevenbergMarquardtParams())
+        self.assertIsInstance(algorithm_params, gtsam.TranslationRecovery)
     def test_run(self):
         gt_poses = ExampleValues()
@@ -45,9 +47,9 @@ def test_run(self):
         lmParams = gtsam.LevenbergMarquardtParams()
-        algorithm = gtsam.TranslationRecovery(measurements, lmParams)
+        algorithm = gtsam.TranslationRecovery(lmParams)
         scale = 2.0
-        result = algorithm.run(scale)
+        result = algorithm.run(measurements, scale)
         w_aTc = gt_poses.atPose3(2).translation() - gt_poses.atPose3(0).translation()
         w_aTb = gt_poses.atPose3(1).translation() - gt_poses.atPose3(0).translation()
diff --git a/python/gtsam/tests/test_Triangulation.py b/python/gtsam/tests/test_Triangulation.py
index 0a258a0afc..8630e1da75 100644
--- a/python/gtsam/tests/test_Triangulation.py
+++ b/python/gtsam/tests/test_Triangulation.py
@@ -8,26 +8,17 @@
 Test Triangulation
 Authors: Frank Dellaert & Fan Jiang (Python) & Sushmita Warrier & John Lambert
+# pylint: disable=no-name-in-module, invalid-name, no-member
 import unittest
 from typing import Iterable, List, Optional, Tuple, Union
 import numpy as np
 import gtsam
-from gtsam import (
-    Cal3_S2,
-    Cal3Bundler,
-    CameraSetCal3_S2,
-    CameraSetCal3Bundler,
-    PinholeCameraCal3_S2,
-    PinholeCameraCal3Bundler,
-    Point2,
-    Point2Vector,
-    Point3,
-    Pose3,
-    Pose3Vector,
-    Rot3,
+from gtsam import (Cal3_S2, Cal3Bundler, CameraSetCal3_S2,
+                   CameraSetCal3Bundler, PinholeCameraCal3_S2,
+                   PinholeCameraCal3Bundler, Point2, Point2Vector, Point3,
+                   Pose3, Pose3Vector, Rot3, TriangulationParameters,
+                   TriangulationResult)
 from gtsam.utils.test_case import GtsamTestCase
 UPRIGHT = Rot3.Ypr(-np.pi / 2, 0.0, -np.pi / 2)
@@ -218,6 +209,68 @@ def test_triangulation_robust_three_poses(self) -> None:
         # using the Huber loss we now have a quite small error!! nice!
         self.assertTrue(np.allclose(landmark, actual4, atol=0.05))
+    def test_outliers_and_far_landmarks(self) -> None:
+        """Check safe triangulation function."""
+        pose1, pose2 = self.poses
+        K1 = Cal3_S2(1500, 1200, 0, 640, 480)
+        # create first camera. Looking along X-axis, 1 meter above ground plane (x-y)
+        camera1 = PinholeCameraCal3_S2(pose1, K1)
+        # create second camera 1 meter to the right of first camera
+        K2 = Cal3_S2(1600, 1300, 0, 650, 440)
+        camera2 = PinholeCameraCal3_S2(pose2, K2)
+        # 1. Project two landmarks into two cameras and triangulate
+        z1 = camera1.project(self.landmark)
+        z2 = camera2.project(self.landmark)
+        cameras = CameraSetCal3_S2()
+        measurements = Point2Vector()
+        cameras.append(camera1)
+        cameras.append(camera2)
+        measurements.append(z1)
+        measurements.append(z2)
+        landmarkDistanceThreshold = 10  # landmark is closer than that
+        # all default except landmarkDistanceThreshold: 
+        params = TriangulationParameters(1.0, False, landmarkDistanceThreshold)
+        actual: TriangulationResult = gtsam.triangulateSafe(
+            cameras, measurements, params)
+        self.gtsamAssertEquals(actual.get(), self.landmark, 1e-2)
+        self.assertTrue(actual.valid())
+        landmarkDistanceThreshold = 4  # landmark is farther than that
+        params2 = TriangulationParameters(
+            1.0, False, landmarkDistanceThreshold)
+        actual = gtsam.triangulateSafe(cameras, measurements, params2)
+        self.assertTrue(actual.farPoint())
+        # 3. Add a slightly rotated third camera above with a wrong measurement
+        # (OUTLIER)
+        pose3 = pose1 * Pose3(Rot3.Ypr(0.1, 0.2, 0.1), Point3(0.1, -2, -.1))
+        K3 = Cal3_S2(700, 500, 0, 640, 480)
+        camera3 = PinholeCameraCal3_S2(pose3, K3)
+        z3 = camera3.project(self.landmark)
+        cameras.append(camera3)
+        measurements.append(z3 + Point2(10, -10))
+        landmarkDistanceThreshold = 10  # landmark is closer than that
+        outlierThreshold = 100   # loose, the outlier is going to pass
+        params3 = TriangulationParameters(1.0, False, landmarkDistanceThreshold,
+                                          outlierThreshold)
+        actual = gtsam.triangulateSafe(cameras, measurements, params3)
+        self.assertTrue(actual.valid())
+        # now set stricter threshold for outlier rejection
+        outlierThreshold = 5  # tighter, the outlier is not going to pass
+        params4 = TriangulationParameters(1.0, False, landmarkDistanceThreshold,
+                                          outlierThreshold)
+        actual = gtsam.triangulateSafe(cameras, measurements, params4)
+        self.assertTrue(actual.outlier())
 if __name__ == "__main__":
diff --git a/python/gtsam/tests/test_logging_optimizer.py b/python/gtsam/tests/test_logging_optimizer.py
index 47eb32e7b1..602aeffc94 100644
--- a/python/gtsam/tests/test_logging_optimizer.py
+++ b/python/gtsam/tests/test_logging_optimizer.py
@@ -18,7 +18,7 @@
 from gtsam import Rot3
 from gtsam.utils.test_case import GtsamTestCase
-from gtsam.utils.logging_optimizer import gtsam_optimize
+from gtsam.utils.logging_optimizer import gtsam_optimize, optimize_using
 KEY = 0
 MODEL = gtsam.noiseModel.Unit.Create(3)
@@ -34,19 +34,20 @@ def setUp(self):
         rotations = {R, R.inverse()}  # mean is the identity
         self.expected = Rot3()
-        graph = gtsam.NonlinearFactorGraph()
+        def check(actual):
+            # Check that optimizing yields the identity
+            self.gtsamAssertEquals(actual.atRot3(KEY), self.expected, tol=1e-6)
+            # Check that logging output prints out 3 lines (exact intermediate values differ by OS)
+            self.assertEqual(self.capturedOutput.getvalue().count('\n'), 3)
+            # reset stdout catcher
+            self.capturedOutput.truncate(0)
+        self.check = check
+        self.graph = gtsam.NonlinearFactorGraph()
         for R in rotations:
-            graph.add(gtsam.PriorFactorRot3(KEY, R, MODEL))
-        initial = gtsam.Values()
-        initial.insert(KEY, R)
-        self.params = gtsam.GaussNewtonParams()
-        self.optimizer = gtsam.GaussNewtonOptimizer(
-            graph, initial, self.params)
-        self.lmparams = gtsam.LevenbergMarquardtParams()
-        self.lmoptimizer = gtsam.LevenbergMarquardtOptimizer(
-            graph, initial, self.lmparams
-        )
+            self.graph.add(gtsam.PriorFactorRot3(KEY, R, MODEL))
+        self.initial = gtsam.Values()
+        self.initial.insert(KEY, R)
         # setup output capture
         self.capturedOutput = StringIO()
@@ -63,22 +64,29 @@ def test_simple_printing(self):
         def hook(_, error):
-        # Only thing we require from optimizer is an iterate method
-        gtsam_optimize(self.optimizer, self.params, hook)
-        # Check that optimizing yields the identity.
-        actual = self.optimizer.values()
-        self.gtsamAssertEquals(actual.atRot3(KEY), self.expected, tol=1e-6)
+        # Wrapper function sets the hook and calls optimizer.optimize() for us.
+        params = gtsam.GaussNewtonParams()
+        actual = optimize_using(gtsam.GaussNewtonOptimizer, hook, self.graph, self.initial)
+        self.check(actual)
+        actual = optimize_using(gtsam.GaussNewtonOptimizer, hook, self.graph, self.initial, params)
+        self.check(actual)
+        actual = gtsam_optimize(gtsam.GaussNewtonOptimizer(self.graph, self.initial, params),
+                                params, hook)
+        self.check(actual)
     def test_lm_simple_printing(self):
         """Make sure we are properly terminating LM"""
         def hook(_, error):
-        gtsam_optimize(self.lmoptimizer, self.lmparams, hook)
-        actual = self.lmoptimizer.values()
-        self.gtsamAssertEquals(actual.atRot3(KEY), self.expected, tol=1e-6)
+        params = gtsam.LevenbergMarquardtParams()
+        actual = optimize_using(gtsam.LevenbergMarquardtOptimizer, hook, self.graph, self.initial)
+        self.check(actual)
+        actual = optimize_using(gtsam.LevenbergMarquardtOptimizer, hook, self.graph, self.initial,
+                                params)
+        self.check(actual)
+        actual = gtsam_optimize(gtsam.LevenbergMarquardtOptimizer(self.graph, self.initial, params),
+                                params, hook)
     @unittest.skip("Not a test we want run every time, as needs comet.ml account")
     def test_comet(self):
diff --git a/python/gtsam/utils/logging_optimizer.py b/python/gtsam/utils/logging_optimizer.py
index 3d9175951a..fe2f717d8b 100644
--- a/python/gtsam/utils/logging_optimizer.py
+++ b/python/gtsam/utils/logging_optimizer.py
@@ -6,6 +6,53 @@
 from gtsam import NonlinearOptimizer, NonlinearOptimizerParams
 import gtsam
+from typing import Any, Callable
+    gtsam.GaussNewtonOptimizer: gtsam.GaussNewtonParams,
+    gtsam.LevenbergMarquardtOptimizer: gtsam.LevenbergMarquardtParams,
+    gtsam.DoglegOptimizer: gtsam.DoglegParams,
+    gtsam.GncGaussNewtonOptimizer: gtsam.GaussNewtonParams,
+    gtsam.GncLMOptimizer: gtsam.LevenbergMarquardtParams
+def optimize_using(OptimizerClass, hook, *args) -> gtsam.Values:
+    """ Wraps the constructor and "optimize()" call for an Optimizer together and adds an iteration
+        hook.
+        Example usage:
+            ```python
+            def hook(optimizer, error):
+                print("iteration {:}, error = {:}".format(optimizer.iterations(), error))
+            solution = optimize_using(gtsam.GaussNewtonOptimizer, hook, graph, init, params)
+            ```
+        Iteration hook's args are (optimizer, error) and return type should be None
+    Args:
+        OptimizerClass (T): A NonlinearOptimizer class (e.g. GaussNewtonOptimizer,
+            LevenbergMarquardtOptimizer)
+        hook ([T, double] -> None): Function to callback after each iteration.  Args are (optimizer,
+            error) and return should be None.
+        *args: Arguments that would be passed into the OptimizerClass constructor, usually:
+            graph, init, [params]
+    Returns:
+        (gtsam.Values): A Values object representing the optimization solution.
+    """
+    # Add the iteration hook to the NonlinearOptimizerParams
+    for arg in args:
+        if isinstance(arg, gtsam.NonlinearOptimizerParams):
+            arg.iterationHook = lambda iteration, error_before, error_after: hook(
+                optimizer, error_after)
+            break
+    else:
+        params = OPTIMIZER_PARAMS_MAP[OptimizerClass]()
+        params.iterationHook = lambda iteration, error_before, error_after: hook(
+            optimizer, error_after)
+        args = (*args, params)
+    # Construct Optimizer and optimize
+    optimizer = OptimizerClass(*args)
+    hook(optimizer, optimizer.error())  # Call hook once with init values to match behavior below
+    return optimizer.optimize()
 def optimize(optimizer, check_convergence, hook):
@@ -21,7 +68,8 @@ def optimize(optimizer, check_convergence, hook):
     current_error = optimizer.error()
     hook(optimizer, current_error)
-    # Iterative loop
+    # Iterative loop.  Cannot use `params.iterationHook` because we don't have access to params
+    # (backwards compatibility issue).
     while True:
         # Do next iteration
@@ -36,6 +84,7 @@ def gtsam_optimize(optimizer,
     """ Given an optimizer and params, iterate until convergence.
+        Recommend using optimize_using instead.
         After each iteration, hook(optimizer) is called.
         After the function, use values and errors to get the result.
diff --git a/python/gtsam/utils/plot.py b/python/gtsam/utils/plot.py
index 5ff7fd7aa7..a4d19f72be 100644
--- a/python/gtsam/utils/plot.py
+++ b/python/gtsam/utils/plot.py
@@ -12,13 +12,26 @@
 import gtsam
 from gtsam import Marginals, Point2, Point3, Pose2, Pose3, Values
-# For future reference: following
-# https://www.xarg.org/2018/04/how-to-plot-a-covariance-error-ellipse/
-# we have, in 2D:
-# def kk(p): return math.sqrt(-2*math.log(1-p)) # k to get p probability mass
-# def pp(k): return 1-math.exp(-float(k**2)/2.0) # p as a function of k
-# Some values:
-# k = 5 => p = 99.9996 %
+# For translation between a scaling of the uncertainty ellipse and the 
+# percentage of inliers see discussion in 
+#   [PR 1067](https://github.com/borglab/gtsam/pull/1067)
+# and the notebook python/gtsam/notebooks/ellipses.ipynb (needs scipy).
+# In the following, the default scaling is chosen for 95% inliers, which
+# translates to the following sigma values:
+# 1D: 1.959963984540
+# 2D: 2.447746830681
+# 3D: 2.795483482915
+# Further references are Stochastic Models, Estimation, and Control Vol 1 by Maybeck,
+# page 366 and https://www.xarg.org/2018/04/how-to-plot-a-covariance-error-ellipse/
+# For reference, here are the inlier percentages for some sigma values:
+#   	    1    	    2    	    3    	    4    	    5
+# 1D	68.26895	95.44997	99.73002	99.99367	99.99994
+# 2D	39.34693	86.46647	98.88910	99.96645	99.99963
+# 3D	19.87480	73.85359	97.07091	99.88660	99.99846
 def set_axes_equal(fignum: int) -> None:
@@ -81,9 +94,8 @@ def plot_covariance_ellipse_3d(axes,
     Plots a Gaussian as an uncertainty ellipse
-    Based on Maybeck Vol 1, page 366
-    k=2.296 corresponds to 1 std, 68.26% of all probability
-    k=11.82 corresponds to 3 std, 99.74% of all probability
+    The ellipse is scaled in such a way that 95% of drawn samples are inliers.
+    Derivation of the scaling factor is explained at the beginning of this file.
         axes (matplotlib.axes.Axes): Matplotlib axes.
@@ -94,7 +106,8 @@ def plot_covariance_ellipse_3d(axes,
         n: Defines the granularity of the ellipse. Higher values indicate finer ellipses.
         alpha: Transparency value for the plotted surface in the range [0, 1].
-    k = 11.82
+    # this corresponds to 95%, see note above
+    k = 2.795483482915
     U, S, _ = np.linalg.svd(P)
     radii = k * np.sqrt(S)
@@ -115,12 +128,48 @@ def plot_covariance_ellipse_3d(axes,
     axes.plot_surface(x, y, z, alpha=alpha, cmap='hot')
+def plot_covariance_ellipse_2d(axes,
+                               origin: Point2,
+                               covariance: np.ndarray) -> None:
+    """
+    Plots a Gaussian as an uncertainty ellipse
+    The ellipse is scaled in such a way that 95% of drawn samples are inliers.
+    Derivation of the scaling factor is explained at the beginning of this file.
+    Args:
+        axes (matplotlib.axes.Axes): Matplotlib axes.
+        origin: The origin in the world frame.
+        covariance: The marginal covariance matrix of the 2D point
+                    which will be represented as an ellipse.
+    """
+    w, v = np.linalg.eigh(covariance)
+    # this corresponds to 95%, see note above
+    k = 2.447746830681
+    angle = np.arctan2(v[1, 0], v[0, 0])
+    # We multiply k by 2 since k corresponds to the radius but Ellipse uses
+    # the diameter.
+    e1 = patches.Ellipse(origin,
+                         np.sqrt(w[0]) * 2 * k,
+                         np.sqrt(w[1]) * 2 * k,
+                         np.rad2deg(angle),
+                         fill=False)
+    axes.add_patch(e1)
 def plot_point2_on_axes(axes,
                         point: Point2,
                         linespec: str,
                         P: Optional[np.ndarray] = None) -> None:
-    Plot a 2D point on given axis `axes` with given `linespec`.
+    Plot a 2D point and its corresponding uncertainty ellipse on given axis
+    `axes` with given `linespec`.
+    The uncertainty ellipse (if covariance is given) is scaled in such a way
+    that 95% of drawn samples are inliers, see `plot_covariance_ellipse_2d`.
         axes (matplotlib.axes.Axes): Matplotlib axes.
@@ -130,19 +179,7 @@ def plot_point2_on_axes(axes,
     axes.plot([point[0]], [point[1]], linespec, marker='.', markersize=10)
     if P is not None:
-        w, v = np.linalg.eig(P)
-        # 5 sigma corresponds to 99.9996%, see note above
-        k = 5.0
-        angle = np.arctan2(v[1, 0], v[0, 0])
-        e1 = patches.Ellipse(point,
-                             np.sqrt(w[0] * k),
-                             np.sqrt(w[1] * k),
-                             np.rad2deg(angle),
-                             fill=False)
-        axes.add_patch(e1)
+        plot_covariance_ellipse_2d(axes, point, P)
 def plot_point2(
     fignum: int,
@@ -154,6 +191,9 @@ def plot_point2(
     Plot a 2D point on given figure with given `linespec`.
+    The uncertainty ellipse (if covariance is given) is scaled in such a way
+    that 95% of drawn samples are inliers, see `plot_covariance_ellipse_2d`.
         fignum: Integer representing the figure number to use for plotting.
         point: The point to be plotted.
@@ -182,6 +222,9 @@ def plot_pose2_on_axes(axes,
     Plot a 2D pose on given axis `axes` with given `axis_length`.
+    The ellipse is scaled in such a way that 95% of drawn samples are inliers,
+    see `plot_covariance_ellipse_2d`.
         axes (matplotlib.axes.Axes): Matplotlib axes.
         pose: The pose to be plotted.
@@ -206,19 +249,7 @@ def plot_pose2_on_axes(axes,
     if covariance is not None:
         pPp = covariance[0:2, 0:2]
         gPp = np.matmul(np.matmul(gRp, pPp), gRp.T)
-        w, v = np.linalg.eig(gPp)
-        # 5 sigma corresponds to 99.9996%, see note above
-        k = 5.0
-        angle = np.arctan2(v[1, 0], v[0, 0])
-        e1 = patches.Ellipse(origin,
-                             np.sqrt(w[0] * k),
-                             np.sqrt(w[1] * k),
-                             np.rad2deg(angle),
-                             fill=False)
-        axes.add_patch(e1)
+        plot_covariance_ellipse_2d(axes, origin, gPp)
 def plot_pose2(
@@ -231,6 +262,9 @@ def plot_pose2(
     Plot a 2D pose on given figure with given `axis_length`.
+    The uncertainty ellipse (if covariance is given) is scaled in such a way
+    that 95% of drawn samples are inliers, see `plot_covariance_ellipse_2d`.
         fignum: Integer representing the figure number to use for plotting.
         pose: The pose to be plotted.
@@ -260,6 +294,9 @@ def plot_point3_on_axes(axes,
     Plot a 3D point on given axis `axes` with given `linespec`.
+    The uncertainty ellipse (if covariance is given) is scaled in such a way
+    that 95% of drawn samples are inliers, see `plot_covariance_ellipse_3d`.
         axes (matplotlib.axes.Axes): Matplotlib axes.
         point: The point to be plotted.
@@ -281,6 +318,9 @@ def plot_point3(
     Plot a 3D point on given figure with given `linespec`.
+    The uncertainty ellipse (if covariance is given) is scaled in such a way
+    that 95% of drawn samples are inliers, see `plot_covariance_ellipse_3d`.
         fignum: Integer representing the figure number to use for plotting.
         point: The point to be plotted.
@@ -355,6 +395,9 @@ def plot_pose3_on_axes(axes, pose, axis_length=0.1, P=None, scale=1):
     Plot a 3D pose on given axis `axes` with given `axis_length`.
+    The uncertainty ellipse (if covariance is given) is scaled in such a way
+    that 95% of drawn samples are inliers, see `plot_covariance_ellipse_3d`.
         axes (matplotlib.axes.Axes): Matplotlib axes.
         point (gtsam.Point3): The point to be plotted.
@@ -397,6 +440,9 @@ def plot_pose3(
     Plot a 3D pose on given figure with given `axis_length`.
+    The uncertainty ellipse (if covariance is given) is scaled in such a way
+    that 95% of drawn samples are inliers, see `plot_covariance_ellipse_3d`.
         fignum: Integer representing the figure number to use for plotting.
         pose (gtsam.Pose3): 3D pose to be plotted.
diff --git a/python/gtsam_unstable/tests/test_ProjectionFactorRollingShutter.py b/python/gtsam_unstable/tests/test_ProjectionFactorRollingShutter.py
new file mode 100644
index 0000000000..0e4db3faff
--- /dev/null
+++ b/python/gtsam_unstable/tests/test_ProjectionFactorRollingShutter.py
@@ -0,0 +1,59 @@
+GTSAM Copyright 2010-2019, Georgia Tech Research Corporation,
+Atlanta, Georgia 30332-0415
+All Rights Reserved
+See LICENSE for the license information
+ProjectionFactorRollingShutter unit tests.
+Author: Yotam Stern
+import unittest
+import numpy as np
+import gtsam
+import gtsam_unstable
+from gtsam.utils.test_case import GtsamTestCase
+pose1 = gtsam.Pose3()
+pose2 = gtsam.Pose3(np.array([[ 0.9999375 ,  0.00502487,  0.00998725,  0.1       ],
+                              [-0.00497488,  0.999975  , -0.00502487,  0.02      ],
+                              [-0.01001225,  0.00497488,  0.9999375 ,  1.        ],
+                              [ 0.        ,  0.        ,  0.        ,  1.        ]]))
+point = np.array([2, 0, 15])
+point_noise = gtsam.noiseModel.Diagonal.Sigmas(np.ones(2))
+cal = gtsam.Cal3_S2()
+body_p_sensor = gtsam.Pose3()
+alpha = 0.1
+measured = np.array([0.13257015, 0.0004157])
+class TestProjectionFactorRollingShutter(GtsamTestCase):
+    def test_constructor(self):
+        '''
+        test constructor for the ProjectionFactorRollingShutter
+        '''
+        factor = gtsam_unstable.ProjectionFactorRollingShutter(measured, alpha, point_noise, 0, 1, 2, cal)
+        factor = gtsam_unstable.ProjectionFactorRollingShutter(measured, alpha, point_noise, 0, 1, 2, cal,
+                                                               body_p_sensor)
+        factor = gtsam_unstable.ProjectionFactorRollingShutter(measured, alpha, point_noise, 0, 1, 2, cal, True, False)
+        factor = gtsam_unstable.ProjectionFactorRollingShutter(measured, alpha, point_noise, 0, 1, 2, cal, True, False,
+                                                               body_p_sensor)
+    def test_error(self):
+        '''
+        test the factor error for a specific example
+        '''
+        values = gtsam.Values()
+        values.insert(0, pose1)
+        values.insert(1, pose2)
+        values.insert(2, point)
+        factor = gtsam_unstable.ProjectionFactorRollingShutter(measured, alpha, point_noise, 0, 1, 2, cal)
+        self.gtsamAssertEquals(factor.error(values), np.array(0), tol=1e-9)
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/testNonlinearFactorGraph.cpp b/tests/testNonlinearFactorGraph.cpp
index 05a6e7f45e..e1a88d6169 100644
--- a/tests/testNonlinearFactorGraph.cpp
+++ b/tests/testNonlinearFactorGraph.cpp
@@ -335,21 +335,21 @@ TEST(NonlinearFactorGraph, dot) {
       "graph {\n"
       "  size=\"5,5\";\n"
-      "  varl1[label=\"l1\"];\n"
-      "  varx1[label=\"x1\"];\n"
-      "  varx2[label=\"x2\"];\n"
+      "  var7782220156096217089[label=\"l1\"];\n"
+      "  var8646911284551352321[label=\"x1\"];\n"
+      "  var8646911284551352322[label=\"x2\"];\n"
       "  factor0[label=\"\", shape=point];\n"
-      "  varx1--factor0;\n"
+      "  var8646911284551352321--factor0;\n"
       "  factor1[label=\"\", shape=point];\n"
-      "  varx1--factor1;\n"
-      "  varx2--factor1;\n"
+      "  var8646911284551352321--factor1;\n"
+      "  var8646911284551352322--factor1;\n"
       "  factor2[label=\"\", shape=point];\n"
-      "  varx1--factor2;\n"
-      "  varl1--factor2;\n"
+      "  var8646911284551352321--factor2;\n"
+      "  var7782220156096217089--factor2;\n"
       "  factor3[label=\"\", shape=point];\n"
-      "  varx2--factor3;\n"
-      "  varl1--factor3;\n"
+      "  var8646911284551352322--factor3;\n"
+      "  var7782220156096217089--factor3;\n"
   const NonlinearFactorGraph fg = createNonlinearFactorGraph();
@@ -363,21 +363,21 @@ TEST(NonlinearFactorGraph, dot_extra) {
       "graph {\n"
       "  size=\"5,5\";\n"
-      "  varl1[label=\"l1\", pos=\"0,0!\"];\n"
-      "  varx1[label=\"x1\", pos=\"1,0!\"];\n"
-      "  varx2[label=\"x2\", pos=\"1,1.5!\"];\n"
+      "  var7782220156096217089[label=\"l1\", pos=\"0,0!\"];\n"
+      "  var8646911284551352321[label=\"x1\", pos=\"1,0!\"];\n"
+      "  var8646911284551352322[label=\"x2\", pos=\"1,1.5!\"];\n"
       "  factor0[label=\"\", shape=point];\n"
-      "  varx1--factor0;\n"
+      "  var8646911284551352321--factor0;\n"
       "  factor1[label=\"\", shape=point];\n"
-      "  varx1--factor1;\n"
-      "  varx2--factor1;\n"
+      "  var8646911284551352321--factor1;\n"
+      "  var8646911284551352322--factor1;\n"
       "  factor2[label=\"\", shape=point];\n"
-      "  varx1--factor2;\n"
-      "  varl1--factor2;\n"
+      "  var8646911284551352321--factor2;\n"
+      "  var7782220156096217089--factor2;\n"
       "  factor3[label=\"\", shape=point];\n"
-      "  varx2--factor3;\n"
-      "  varl1--factor3;\n"
+      "  var8646911284551352322--factor3;\n"
+      "  var7782220156096217089--factor3;\n"
   const NonlinearFactorGraph fg = createNonlinearFactorGraph();
diff --git a/tests/testTranslationRecovery.cpp b/tests/testTranslationRecovery.cpp
index 833f11355d..5dd319d301 100644
--- a/tests/testTranslationRecovery.cpp
+++ b/tests/testTranslationRecovery.cpp
@@ -17,8 +17,8 @@
 #include <CppUnitLite/TestHarness.h>
-#include <gtsam/sfm/TranslationRecovery.h>
 #include <gtsam/sfm/SfmData.h>
+#include <gtsam/sfm/TranslationRecovery.h>
 #include <gtsam/slam/dataset.h>
 using namespace std;
@@ -62,13 +62,13 @@ TEST(TranslationRecovery, BAL) {
-  TranslationRecovery algorithm(relativeTranslations);
-  const auto graph = algorithm.buildGraph();
+  TranslationRecovery algorithm;
+  const auto graph = algorithm.buildGraph(relativeTranslations);
   EXPECT_LONGS_EQUAL(3, graph.size());
   // Run translation recovery
   const double scale = 2.0;
-  const auto result = algorithm.run(scale);
+  const auto result = algorithm.run(relativeTranslations, scale);
   // Check result for first two translations, determined by prior
   EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(0)));
@@ -107,12 +107,12 @@ TEST(TranslationRecovery, TwoPoseTest) {
-  TranslationRecovery algorithm(relativeTranslations);
-  const auto graph = algorithm.buildGraph();
+  TranslationRecovery algorithm;
+  const auto graph = algorithm.buildGraph(relativeTranslations);
   EXPECT_LONGS_EQUAL(1, graph.size());
   // Run translation recovery
-  const auto result = algorithm.run(/*scale=*/3.0);
+  const auto result = algorithm.run(relativeTranslations, /*scale=*/3.0);
   // Check result for first two translations, determined by prior
   EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(0), 1e-8));
@@ -145,11 +145,11 @@ TEST(TranslationRecovery, ThreePoseTest) {
-  TranslationRecovery algorithm(relativeTranslations);
-  const auto graph = algorithm.buildGraph();
+  TranslationRecovery algorithm;
+  const auto graph = algorithm.buildGraph(relativeTranslations);
   EXPECT_LONGS_EQUAL(3, graph.size());
-  const auto result = algorithm.run(/*scale=*/3.0);
+  const auto result = algorithm.run(relativeTranslations, /*scale=*/3.0);
   // Check result
   EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(0), 1e-8));
@@ -180,13 +180,9 @@ TEST(TranslationRecovery, ThreePosesIncludingZeroTranslation) {
-  TranslationRecovery algorithm(relativeTranslations);
-  const auto graph = algorithm.buildGraph();
-  // There is only 1 non-zero translation edge.
-  EXPECT_LONGS_EQUAL(1, graph.size());
+  TranslationRecovery algorithm;
   // Run translation recovery
-  const auto result = algorithm.run(/*scale=*/3.0);
+  const auto result = algorithm.run(relativeTranslations, /*scale=*/3.0);
   // Check result
   EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(0), 1e-8));
@@ -222,12 +218,10 @@ TEST(TranslationRecovery, FourPosesIncludingZeroTranslation) {
-  TranslationRecovery algorithm(relativeTranslations);
-  const auto graph = algorithm.buildGraph();
-  EXPECT_LONGS_EQUAL(3, graph.size());
+  TranslationRecovery algorithm;
   // Run translation recovery
-  const auto result = algorithm.run(/*scale=*/4.0);
+  const auto result = algorithm.run(relativeTranslations, /*scale=*/4.0);
   // Check result
   EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(0), 1e-8));
@@ -251,13 +245,10 @@ TEST(TranslationRecovery, ThreePosesWithZeroTranslation) {
-  TranslationRecovery algorithm(relativeTranslations);
-  const auto graph = algorithm.buildGraph();
-  // Graph size will be zero as there no 'non-zero distance' edges.
-  EXPECT_LONGS_EQUAL(0, graph.size());
+  TranslationRecovery algorithm;
   // Run translation recovery
-  const auto result = algorithm.run(/*scale=*/4.0);
+  const auto result = algorithm.run(relativeTranslations, /*scale=*/4.0);
   // Check result
   EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(0), 1e-8));
@@ -265,6 +256,73 @@ TEST(TranslationRecovery, ThreePosesWithZeroTranslation) {
   EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(2), 1e-8));
+TEST(TranslationRecovery, ThreePosesWithOneSoftConstraint) {
+  // Create a dataset with 3 poses.
+  // __      __
+  // \/      \/
+  //  0 _____ 1
+  //    \ __ /
+  //     \\//
+  //       3
+  //
+  // 0 and 1 face in the same direction but have a translation offset. 3 is in
+  // the same direction as 0 and 1, in between 0 and 1, with some Y axis offset.
+  Values poses;
+  poses.insert<Pose3>(0, Pose3(Rot3(), Point3(0, 0, 0)));
+  poses.insert<Pose3>(1, Pose3(Rot3(), Point3(2, 0, 0)));
+  poses.insert<Pose3>(3, Pose3(Rot3(), Point3(1, -1, 0)));
+  auto relativeTranslations = TranslationRecovery::SimulateMeasurements(
+      poses, {{0, 1}, {0, 3}, {1, 3}});
+  std::vector<BinaryMeasurement<Point3>> betweenTranslations;
+  betweenTranslations.emplace_back(0, 3, Point3(1, -1, 0),
+                                   noiseModel::Isotropic::Sigma(3, 1e-2));
+  TranslationRecovery algorithm;
+  auto result =
+      algorithm.run(relativeTranslations, /*scale=*/0.0, betweenTranslations);
+  // Check result
+  EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(0), 1e-4));
+  EXPECT(assert_equal(Point3(2, 0, 0), result.at<Point3>(1), 1e-4));
+  EXPECT(assert_equal(Point3(1, -1, 0), result.at<Point3>(3), 1e-4));
+TEST(TranslationRecovery, ThreePosesWithOneHardConstraint) {
+  // Create a dataset with 3 poses.
+  // __      __
+  // \/      \/
+  //  0 _____ 1
+  //    \ __ /
+  //     \\//
+  //       3
+  //
+  // 0 and 1 face in the same direction but have a translation offset. 3 is in
+  // the same direction as 0 and 1, in between 0 and 1, with some Y axis offset.
+  Values poses;
+  poses.insert<Pose3>(0, Pose3(Rot3(), Point3(0, 0, 0)));
+  poses.insert<Pose3>(1, Pose3(Rot3(), Point3(2, 0, 0)));
+  poses.insert<Pose3>(3, Pose3(Rot3(), Point3(1, -1, 0)));
+  auto relativeTranslations = TranslationRecovery::SimulateMeasurements(
+      poses, {{0, 1}, {0, 3}, {1, 3}});
+  std::vector<BinaryMeasurement<Point3>> betweenTranslations;
+  betweenTranslations.emplace_back(0, 1, Point3(2, 0, 0),
+                                   noiseModel::Constrained::All(3, 1e2));
+  TranslationRecovery algorithm;
+  auto result =
+      algorithm.run(relativeTranslations, /*scale=*/0.0, betweenTranslations);
+  // Check result
+  EXPECT(assert_equal(Point3(0, 0, 0), result.at<Point3>(0), 1e-4));
+  EXPECT(assert_equal(Point3(2, 0, 0), result.at<Point3>(1), 1e-4));
+  EXPECT(assert_equal(Point3(1, -1, 0), result.at<Point3>(3), 1e-4));
 /* ************************************************************************* */
 int main() {
   TestResult tr;
diff --git a/wrap/pybind11/.appveyor.yml b/wrap/pybind11/.appveyor.yml
index 149a8a3dc9..85445d41a2 100644
--- a/wrap/pybind11/.appveyor.yml
+++ b/wrap/pybind11/.appveyor.yml
@@ -19,7 +19,7 @@ install:
     if ($env:PLATFORM -eq "x64") { $env:PYTHON = "$env:PYTHON-x64" }
     $env:PATH = "C:\Python$env:PYTHON\;C:\Python$env:PYTHON\Scripts\;$env:PATH"
     python -W ignore -m pip install --upgrade pip wheel
-    python -W ignore -m pip install pytest numpy --no-warn-script-location
+    python -W ignore -m pip install pytest numpy --no-warn-script-location pytest-timeout
 - ps: |
     Start-FileDownload 'https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip'
     7z x eigen-3.3.7.zip -y > $null
diff --git a/wrap/pybind11/.clang-format b/wrap/pybind11/.clang-format
new file mode 100644
index 0000000000..8e0fd8b014
--- /dev/null
+++ b/wrap/pybind11/.clang-format
@@ -0,0 +1,19 @@
+# See all possible options and defaults with:
+# clang-format --style=llvm --dump-config
+BasedOnStyle: LLVM
+AccessModifierOffset: -4
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBinaryOperators: All
+BreakConstructorInitializers: BeforeColon
+ColumnLimit: 99
+IndentCaseLabels: true
+IndentPPDirectives: AfterHash
+IndentWidth: 4
+Language: Cpp
+SpaceAfterCStyleCast: true
+Standard: Cpp11
+TabWidth: 4
diff --git a/wrap/pybind11/.clang-tidy b/wrap/pybind11/.clang-tidy
index e29d929897..d853a703ce 100644
--- a/wrap/pybind11/.clang-tidy
+++ b/wrap/pybind11/.clang-tidy
@@ -1,13 +1,66 @@
 FormatStyle: file
 Checks: '
+- key:             performance-for-range-copy.WarnOnAllAutoCopies
+  value:           true
+- key:             performance-unnecessary-value-param.AllowedTypes
+  value:           'exception_ptr$;'
+- key:             readability-implicit-bool-conversion.AllowPointerConditions
+  value:           true
 HeaderFilterRegex: 'pybind11/.*h'
+WarningsAsErrors: '*'
diff --git a/wrap/pybind11/.github/CODEOWNERS b/wrap/pybind11/.github/CODEOWNERS
new file mode 100644
index 0000000000..4e2c66902e
--- /dev/null
+++ b/wrap/pybind11/.github/CODEOWNERS
@@ -0,0 +1,9 @@
+*.cmake @henryiii
+CMakeLists.txt @henryiii
+*.yml @henryiii
+*.yaml @henryiii
+/tools/ @henryiii
+/pybind11/ @henryiii
+noxfile.py @henryiii
+.clang-format @henryiii
+.clang-tidy @henryiii
diff --git a/wrap/pybind11/.github/CONTRIBUTING.md b/wrap/pybind11/.github/CONTRIBUTING.md
index 4ced21baaa..e8294c83c3 100644
--- a/wrap/pybind11/.github/CONTRIBUTING.md
+++ b/wrap/pybind11/.github/CONTRIBUTING.md
@@ -53,6 +53,33 @@ derivative works thereof, in binary and source code form.
 ## Development of pybind11
+### Quick setup
+To setup a quick development environment, use [`nox`](https://nox.thea.codes).
+This will allow you to do some common tasks with minimal setup effort, but will
+take more time to run and be less flexible than a full development environment.
+If you use [`pipx run nox`](https://pipx.pypa.io), you don't even need to
+install `nox`. Examples:
+# List all available sessions
+nox -l
+# Run linters
+nox -s lint
+# Run tests on Python 3.9
+nox -s tests-3.9
+# Build and preview docs
+nox -s docs -- serve
+# Build SDists and wheels
+nox -s build
+### Full setup
 To setup an ideal development environment, run the following commands on a
 system with CMake 3.14+:
@@ -93,7 +120,7 @@ The valid options are:
 * `-DPYBIND11_NOPYTHON=ON`: Disable all Python searching (disables tests)
 * `-DBUILD_TESTING=ON`: Enable the tests
 * `-DDOWNLOAD_CATCH=ON`: Download catch to build the C++ tests
-* `-DOWNLOAD_EIGEN=ON`: Download Eigen for the NumPy tests
+* `-DDOWNLOAD_EIGEN=ON`: Download Eigen for the NumPy tests
 * `-DPYBIND11_INSTALL=ON/OFF`: Enable the install target (on by default for the
   master project)
 * `-DUSE_PYTHON_INSTALL_DIR=ON`: Try to install into the python dir
@@ -126,13 +153,26 @@ cmake --build build --target check
 `--target` can be spelled `-t` in CMake 3.15+. You can also run individual
 tests with these targets:
-* `pytest`: Python tests only
+* `pytest`: Python tests only, using the
+[pytest](https://docs.pytest.org/en/stable/) framework
 * `cpptest`: C++ tests only
 * `test_cmake_build`: Install / subdirectory tests
 If you want to build just a subset of tests, use
-`-DPYBIND11_TEST_OVERRIDE="test_callbacks.cpp;test_pickling.cpp"`. If this is
-empty, all tests will be built.
+`-DPYBIND11_TEST_OVERRIDE="test_callbacks;test_pickling"`. If this is
+empty, all tests will be built. Tests are specified without an extension if they need both a .py and
+.cpp file.
+You may also pass flags to the `pytest` target by editing `tests/pytest.ini` or
+by using the `PYTEST_ADDOPTS` environment variable
+(see [`pytest` docs](https://docs.pytest.org/en/2.7.3/customize.html#adding-default-options)). As an example:
+env PYTEST_ADDOPTS="--capture=no --exitfirst" \
+    cmake --build build --target pytest
+# Or using abbreviated flags
+env PYTEST_ADDOPTS="-s -x" cmake --build build --target pytest
 ### Formatting
@@ -164,16 +204,42 @@ name, pre-commit):
 pre-commit install
+### Clang-Format
+As of v2.6.2, pybind11 ships with a [`clang-format`][clang-format]
+configuration file at the top level of the repo (the filename is
+`.clang-format`). Currently, formatting is NOT applied automatically, but
+manually using `clang-format` for newly developed files is highly encouraged.
+To check if a file needs formatting:
+clang-format -style=file --dry-run some.cpp
+The output will show things to be fixed, if any. To actually format the file:
+clang-format -style=file -i some.cpp
+Note that the `-style-file` option searches the parent directories for the
+`.clang-format` file, i.e. the commands above can be run in any subdirectory
+of the pybind11 repo.
 ### Clang-Tidy
-To run Clang tidy, the following recipe should work. Files will be modified in
-place, so you can use git to monitor the changes.
+[`clang-tidy`][clang-tidy] performs deeper static code analyses and is
+more complex to run, compared to `clang-format`, but support for `clang-tidy`
+is built into the pybind11 CMake configuration. To run `clang-tidy`, the
+following recipe should work. Run the `docker` command from the top-level
+directory inside your pybind11 git clone. Files will be modified in place,
+so you can use git to monitor the changes.
-docker run --rm -v $PWD:/pybind11 -it silkeh/clang:10
-apt-get update && apt-get install python3-dev python3-pytest
-cmake -S pybind11/ -B build -DCMAKE_CXX_CLANG_TIDY="$(which clang-tidy);-fix"
-cmake --build build
+docker run --rm -v $PWD:/mounted_pybind11 -it silkeh/clang:12
+apt-get update && apt-get install -y python3-dev python3-pytest
+cmake -S /mounted_pybind11/ -B build -DCMAKE_CXX_CLANG_TIDY="$(which clang-tidy);-fix" -DDOWNLOAD_EIGEN=ON -DDOWNLOAD_CATCH=ON -DCMAKE_CXX_STANDARD=17
+cmake --build build -j 2 -- --keep-going
 ### Include what you use
@@ -186,7 +252,7 @@ cmake -S . -B build-iwyu -DCMAKE_CXX_INCLUDE_WHAT_YOU_USE=$(which include-what-y
 cmake --build build
-The report is sent to stderr; you can pip it into a file if you wish.
+The report is sent to stderr; you can pipe it into a file if you wish.
 ### Build recipes
@@ -313,6 +379,8 @@ if you really want to.
 [pre-commit]: https://pre-commit.com
+[clang-format]: https://clang.llvm.org/docs/ClangFormat.html
+[clang-tidy]: https://clang.llvm.org/extra/clang-tidy/
 [pybind11.readthedocs.org]: http://pybind11.readthedocs.org/en/latest
 [issue tracker]: https://github.com/pybind/pybind11/issues
 [gitter]: https://gitter.im/pybind/Lobby
diff --git a/wrap/pybind11/.github/ISSUE_TEMPLATE/bug-report.md b/wrap/pybind11/.github/ISSUE_TEMPLATE/bug-report.md
deleted file mode 100644
index ae36ea6508..0000000000
--- a/wrap/pybind11/.github/ISSUE_TEMPLATE/bug-report.md
+++ /dev/null
@@ -1,28 +0,0 @@
-name: Bug Report
-about: File an issue about a bug
-title: "[BUG] "
-Make sure you've completed the following steps before submitting your issue -- thank you!
-1. Make sure you've read the [documentation][]. Your issue may be addressed there.
-2. Search the [issue tracker][] to verify that this hasn't already been reported. +1 or comment there if it has.
-3. Consider asking first in the [Gitter chat room][].
-4. Include a self-contained and minimal piece of code that reproduces the problem. If that's not possible, try to make the description as clear as possible.
-    a. If possible, make a PR with a new, failing test to give us a starting point to work on!
-[documentation]: https://pybind11.readthedocs.io
-[issue tracker]: https://github.com/pybind/pybind11/issues
-[Gitter chat room]: https://gitter.im/pybind/Lobby
-*After reading, remove this checklist and the template text in parentheses below.*
-## Issue description
-(Provide a short description, state the expected behavior and what actually happens.)
-## Reproducible example code
-(The code should be minimal, have no external dependencies, isolate the function(s) that cause breakage. Submit matched and complete C++ and Python snippets that can be easily compiled and run to diagnose the issue.)
diff --git a/wrap/pybind11/.github/ISSUE_TEMPLATE/bug-report.yml b/wrap/pybind11/.github/ISSUE_TEMPLATE/bug-report.yml
new file mode 100644
index 0000000000..bd6a9a8e22
--- /dev/null
+++ b/wrap/pybind11/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -0,0 +1,45 @@
+name: Bug Report
+description: File an issue about a bug
+title: "[BUG]: "
+labels: [triage]
+  - type: markdown
+    attributes:
+      value: |
+        Maintainers will only make a best effort to triage PRs. Please do your best to make the issue as easy to act on as possible, and only open if clearly a problem with pybind11 (ask first if unsure).
+  - type: checkboxes
+    id: steps
+    attributes:
+      label: Required prerequisites
+      description: Make sure you've completed the following steps before submitting your issue -- thank you!
+      options:
+        - label: Make sure you've read the [documentation](https://pybind11.readthedocs.io). Your issue may be addressed there.
+          required: true
+        - label: Search the [issue tracker](https://github.com/pybind/pybind11/issues) and [Discussions](https:/pybind/pybind11/discussions) to verify that this hasn't already been reported. +1 or comment there if it has.
+          required: true
+        - label: Consider asking first in the [Gitter chat room](https://gitter.im/pybind/Lobby) or in a [Discussion](https:/pybind/pybind11/discussions/new).
+          required: false
+  - type: textarea
+    id: description
+    attributes:
+      label: Problem description
+      placeholder: >-
+        Provide a short description, state the expected behavior and what
+        actually happens. Include relevant information like what version of
+        pybind11 you are using, what system you are on, and any useful commands
+        / output.
+    validations:
+      required: true
+  - type: textarea
+    id: code
+    attributes:
+      label: Reproducible example code
+      placeholder: >-
+        The code should be minimal, have no external dependencies, isolate the
+        function(s) that cause breakage. Submit matched and complete C++ and
+        Python snippets that can be easily compiled and run to diagnose the
+        issue. If possible, make a PR with a new, failing test to give us a
+        starting point to work on!
+      render: text
diff --git a/wrap/pybind11/.github/ISSUE_TEMPLATE/config.yml b/wrap/pybind11/.github/ISSUE_TEMPLATE/config.yml
index 20e743136f..27f9a80441 100644
--- a/wrap/pybind11/.github/ISSUE_TEMPLATE/config.yml
+++ b/wrap/pybind11/.github/ISSUE_TEMPLATE/config.yml
@@ -1,5 +1,8 @@
 blank_issues_enabled: false
+  - name: Ask a question
+    url: https://github.com/pybind/pybind11/discussions/new
+    about: Please ask and answer questions here, or propose new ideas.
   - name: Gitter room
     url: https://gitter.im/pybind/Lobby
     about: A room for discussing pybind11 with an active community
diff --git a/wrap/pybind11/.github/ISSUE_TEMPLATE/feature-request.md b/wrap/pybind11/.github/ISSUE_TEMPLATE/feature-request.md
deleted file mode 100644
index 5f6ec81ec9..0000000000
--- a/wrap/pybind11/.github/ISSUE_TEMPLATE/feature-request.md
+++ /dev/null
@@ -1,16 +0,0 @@
-name: Feature Request
-about: File an issue about adding a feature
-title: "[FEAT] "
-Make sure you've completed the following steps before submitting your issue -- thank you!
-1. Check if your feature has already been mentioned / rejected / planned in other issues.
-2. If those resources didn't help, consider asking in the [Gitter chat room][] to see if this is interesting / useful to a larger audience and possible to implement reasonably,
-4. If you have a useful feature that passes the previous items (or not suitable for chat), please fill in the details below.
-[Gitter chat room]: https://gitter.im/pybind/Lobby
-*After reading, remove this checklist.*
diff --git a/wrap/pybind11/.github/ISSUE_TEMPLATE/question.md b/wrap/pybind11/.github/ISSUE_TEMPLATE/question.md
deleted file mode 100644
index b199b6ee8a..0000000000
--- a/wrap/pybind11/.github/ISSUE_TEMPLATE/question.md
+++ /dev/null
@@ -1,21 +0,0 @@
-name: Question
-about: File an issue about unexplained behavior
-title: "[QUESTION] "
-If you have a question, please check the following first:
-1. Check if your question has already been answered in the [FAQ][] section.
-2. Make sure you've read the [documentation][]. Your issue may be addressed there.
-3. If those resources didn't help and you only have a short question (not a bug report), consider asking in the [Gitter chat room][]
-4. Search the [issue tracker][], including the closed issues, to see if your question has already been asked/answered. +1 or comment if it has been asked but has no answer.
-5. If you have a more complex question which is not answered in the previous items (or not suitable for chat), please fill in the details below.
-6. Include a self-contained and minimal piece of code that illustrates your question. If that's not possible, try to make the description as clear as possible.
-[FAQ]: http://pybind11.readthedocs.io/en/latest/faq.html
-[documentation]: https://pybind11.readthedocs.io
-[issue tracker]: https://github.com/pybind/pybind11/issues
-[Gitter chat room]: https://gitter.im/pybind/Lobby
-*After reading, remove this checklist.*
diff --git a/wrap/pybind11/.github/dependabot.yml b/wrap/pybind11/.github/dependabot.yml
new file mode 100644
index 0000000000..73273365c0
--- /dev/null
+++ b/wrap/pybind11/.github/dependabot.yml
@@ -0,0 +1,16 @@
+version: 2
+  # Maintain dependencies for GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
+    ignore:
+      # Official actions have moving tags like v1
+      # that are used, so they don't need updates here
+      - dependency-name: "actions/checkout"
+      - dependency-name: "actions/setup-python"
+      - dependency-name: "actions/cache"
+      - dependency-name: "actions/upload-artifact"
+      - dependency-name: "actions/download-artifact"
+      - dependency-name: "actions/labeler"
diff --git a/wrap/pybind11/.github/labeler.yml b/wrap/pybind11/.github/labeler.yml
new file mode 100644
index 0000000000..abb0d05aaa
--- /dev/null
+++ b/wrap/pybind11/.github/labeler.yml
@@ -0,0 +1,8 @@
+- any:
+  - 'docs/**/*.rst'
+  - '!docs/changelog.rst'
+  - '!docs/upgrade.rst'
+- '.github/workflows/*.yml'
diff --git a/wrap/pybind11/.github/labeler_merged.yml b/wrap/pybind11/.github/labeler_merged.yml
new file mode 100644
index 0000000000..2374ad42e4
--- /dev/null
+++ b/wrap/pybind11/.github/labeler_merged.yml
@@ -0,0 +1,3 @@
+needs changelog:
+- all:
+  - '!docs/changelog.rst'
diff --git a/wrap/pybind11/.github/pull_request_template.md b/wrap/pybind11/.github/pull_request_template.md
new file mode 100644
index 0000000000..54b7f5100d
--- /dev/null
+++ b/wrap/pybind11/.github/pull_request_template.md
@@ -0,0 +1,19 @@
+Title (above): please place [branch_name] at the beginning if you are targeting a branch other than master. *Do not target stable*.
+It is recommended to use conventional commit format, see conventionalcommits.org, but not required.
+## Description
+<!-- Include relevant issues or PRs here, describe what changed and why -->
+## Suggested changelog entry:
+<!-- Fill in the below block with the expected RestructuredText entry. Delete if no entry needed;
+     but do not delete header or rst block if an entry is needed! Will be collected via a script. -->
+<!-- If the upgrade guide needs updating, note that here too -->
diff --git a/wrap/pybind11/.github/workflows/ci.yml b/wrap/pybind11/.github/workflows/ci.yml
index 1749d07f02..050c525cee 100644
--- a/wrap/pybind11/.github/workflows/ci.yml
+++ b/wrap/pybind11/.github/workflows/ci.yml
@@ -9,6 +9,13 @@ on:
       - stable
       - v*
+  group: test-${{ github.ref }}
+  cancel-in-progress: true
   # This is the "main" test suite, which tests a large number of different
   # versions of default compilers and Python versions in GitHub Actions.
@@ -16,71 +23,42 @@ jobs:
       fail-fast: false
-        runs-on: [ubuntu-latest, windows-latest, macos-latest]
-        arch: [x64]
+        runs-on: [ubuntu-latest, windows-2022, macos-latest]
-        - 2.7
-        - 3.5
-        - 3.8
-        - pypy2
-        - pypy3
+        - '2.7'
+        - '3.5'
+        - '3.6'
+        - '3.9'
+        - '3.10'
+        - 'pypy-3.7-v7.3.7'
+        - 'pypy-3.8-v7.3.7'
         # Items in here will either be added to the build matrix (if not
         # present), or add new keys to an existing matrix element if all the
         # existing keys match.
-        # We support three optional keys: args (both build), args1 (first
-        # build), and args2 (second build).
+        # We support an optional key: args, for cmake args
+          # Just add a key
           - runs-on: ubuntu-latest
-            python: 3.6
-            arch: x64
+            python: '3.6'
             args: >
-          - runs-on: windows-2016
-            python: 3.7
-            arch: x86
-            args2: >
-              -DCMAKE_CXX_FLAGS="/permissive- /EHsc /GR"
+              -DCMAKE_CXX_FLAGS="-D_=1"
           - runs-on: windows-latest
-            python: 3.6
-            arch: x64
+            python: '3.6'
             args: >
-          - runs-on: windows-latest
-            python: 3.7
-            arch: x64
-          - runs-on: ubuntu-latest
-            python: 3.9-dev
-            arch: x64
           - runs-on: macos-latest
-            python: 3.9-dev
-            arch: x64
-            args: >
-              -DPYBIND11_FINDPYTHON=ON
-        # These items will be removed from the build matrix, keys must match.
-        exclude:
-            # Currently 32bit only, and we build 64bit
-          - runs-on: windows-latest
-            python: pypy2
-            arch: x64
-          - runs-on: windows-latest
-            python: pypy3
-            arch: x64
-            # Currently broken on embed_test
-          - runs-on: windows-latest
-            python: 3.8
-            arch: x64
-          - runs-on: windows-latest
-            python: 3.9-dev
-            arch: x64
-    name: "🐍 ${{ matrix.python }} • ${{ matrix.runs-on }} • ${{ matrix.arch }} ${{ matrix.args }}"
+            python: 'pypy-2.7'
+          # Inject a couple Windows 2019 runs
+          - runs-on: windows-2019
+            python: '3.9'
+          - runs-on: windows-2019
+            python: '2.7'
+    name: "🐍 ${{ matrix.python }} • ${{ matrix.runs-on }} • x64 ${{ matrix.args }}"
     runs-on: ${{ matrix.runs-on }}
-    continue-on-error: ${{ endsWith(matrix.python, 'dev') }}
     - uses: actions/checkout@v2
@@ -89,13 +67,18 @@ jobs:
       uses: actions/setup-python@v2
         python-version: ${{ matrix.python }}
-        architecture: ${{ matrix.arch }}
-    - name: Setup Boost (Windows / Linux latest)
-      run: echo "::set-env name=BOOST_ROOT::$BOOST_ROOT_1_72_0"
+    - name: Setup Boost (Linux)
+      # Can't use boost + define _
+      if: runner.os == 'Linux' && matrix.python != '3.6'
+      run: sudo apt-get install libboost-dev
+    - name: Setup Boost (macOS)
+      if: runner.os == 'macOS'
+      run: brew install boost
     - name: Update CMake
-      uses: jwlawson/actions-setup-cmake@v1.3
+      uses: jwlawson/actions-setup-cmake@v1.12
     - name: Cache wheels
       if: runner.os == 'macOS'
@@ -106,10 +89,11 @@ jobs:
         # for ways to do this more generally
         path: ~/Library/Caches/pip
         # Look to see if there is a cache hit for the corresponding requirements file
-        key: ${{ runner.os }}-pip-${{ matrix.python }}-${{ matrix.arch }}-${{ hashFiles('tests/requirements.txt') }}
+        key: ${{ runner.os }}-pip-${{ matrix.python }}-x64-${{ hashFiles('tests/requirements.txt') }}
     - name: Prepare env
-      run: python -m pip install -r tests/requirements.txt --prefer-binary
+      run: |
+        python -m pip install -r tests/requirements.txt
     - name: Setup annotations on Linux
       if: runner.os == 'Linux'
@@ -132,6 +116,8 @@ jobs:
       run: cmake --build . --target pytest -j 2
     - name: C++11 tests
+      # TODO: Figure out how to load the DLL on Python 3.8+
+      if: "!(runner.os == 'Windows' && (matrix.python == 3.8 || matrix.python == 3.9 || matrix.python == '3.10' || matrix.python == '3.11-dev' || matrix.python == 'pypy-3.8'))"
       run: cmake --build .  --target cpptest -j 2
     - name: Interface test C++11
@@ -141,7 +127,7 @@ jobs:
       run: git clean -fdx
     # Second build - C++17 mode and in a build directory
-    - name: Configure ${{ matrix.args2 }}
+    - name: Configure C++17
       run: >
         cmake -S . -B build2
@@ -149,7 +135,6 @@ jobs:
         ${{ matrix.args }}
-        ${{ matrix.args2 }}
     - name: Build
       run: cmake --build build2 -j 2
@@ -158,8 +143,28 @@ jobs:
       run: cmake --build build2 --target pytest
     - name: C++ tests
+      # TODO: Figure out how to load the DLL on Python 3.8+
+      if: "!(runner.os == 'Windows' && (matrix.python == 3.8 || matrix.python == 3.9 || matrix.python == '3.10' || matrix.python == '3.11-dev' || matrix.python == 'pypy-3.8'))"
       run: cmake --build build2 --target cpptest
+    # Third build - C++17 mode with unstable ABI
+    - name: Configure (unstable ABI)
+      run: >
+        cmake -S . -B build3
+        -DPYBIND11_INTERNALS_VERSION=10000000
+        "-DPYBIND11_TEST_OVERRIDE=test_call_policies.cpp;test_gil_scoped.cpp;test_thread.cpp"
+        ${{ matrix.args }}
+    - name: Build (unstable ABI)
+      run: cmake --build build3 -j 2
+    - name: Python tests (unstable ABI)
+      run: cmake --build build3 --target pytest
     - name: Interface test
       run: cmake --build build2 --target test_cmake_build
@@ -167,21 +172,105 @@ jobs:
     # MSVC, but for now, this action works:
     - name: Prepare compiler environment for Windows 🐍 2.7
       if: matrix.python == 2.7 && runner.os == 'Windows'
-      uses: ilammy/msvc-dev-cmd@v1
+      uses: ilammy/msvc-dev-cmd@v1.10.0
         arch: x64
     # This makes two environment variables available in the following step(s)
     - name: Set Windows 🐍 2.7 environment variables
       if: matrix.python == 2.7 && runner.os == 'Windows'
+      shell: bash
       run: |
-        echo "::set-env name=DISTUTILS_USE_SDK::1"
-        echo "::set-env name=MSSdk::1"
+        echo "DISTUTILS_USE_SDK=1" >> $GITHUB_ENV
+        echo "MSSdk=1" >> $GITHUB_ENV
     # This makes sure the setup_helpers module can build packages using
     # setuptools
     - name: Setuptools helpers test
       run: pytest tests/extra_setuptools
+      if: "!(matrix.python == '3.5' && matrix.runs-on == 'windows-2022')"
+  deadsnakes:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+        # TODO: Fails on 3.10, investigate
+        - python-version: "3.9"
+          python-debug: true
+          valgrind: true
+      # - python-version: "3.11-dev"
+      #   python-debug: false
+    name: "🐍 ${{ matrix.python-version }}${{ matrix.python-debug && '-dbg' || '' }} (deadsnakes)${{ matrix.valgrind && ' • Valgrind' || '' }} • x64"
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup Python ${{ matrix.python-version }} (deadsnakes)
+      uses: deadsnakes/action@v2.1.1
+      with:
+        python-version: ${{ matrix.python-version }}
+        debug: ${{ matrix.python-debug }}
+    - name: Update CMake
+      uses: jwlawson/actions-setup-cmake@v1.12
+    - name: Valgrind cache
+      if: matrix.valgrind
+      uses: actions/cache@v2
+      id: cache-valgrind
+      with:
+        path: valgrind
+        key: 3.16.1 # Valgrind version
+    - name: Compile Valgrind
+      if: matrix.valgrind && steps.cache-valgrind.outputs.cache-hit != 'true'
+      run: |
+        VALGRIND_VERSION=3.16.1
+        curl https://sourceware.org/pub/valgrind/valgrind-$VALGRIND_VERSION.tar.bz2 -o - | tar xj
+        mv valgrind-$VALGRIND_VERSION valgrind
+        cd valgrind
+        ./configure
+        make -j 2 > /dev/null
+    - name: Install Valgrind
+      if: matrix.valgrind
+      working-directory: valgrind
+      run: |
+        sudo make install
+        sudo apt-get update
+        sudo apt-get install libc6-dbg  # Needed by Valgrind
+    - name: Prepare env
+      run: |
+        python -m pip install -r tests/requirements.txt
+    - name: Configure
+      env:
+      run: >
+        cmake -S . -B build
+        -DCMAKE_BUILD_TYPE=Debug
+    - name: Build
+      run: cmake --build build -j 2
+    - name: Python tests
+      run: cmake --build build --target pytest
+    - name: C++ tests
+      run: cmake --build build --target cpptest
+    - name: Run Valgrind on Python tests
+      if: matrix.valgrind
+      run: cmake --build build --target memcheck
   # Testing on clang using the excellent silkeh clang docker images
@@ -194,12 +283,20 @@ jobs:
           - 3.6
           - 3.7
           - 3.9
-          - 5
           - 7
           - 9
           - dev
-    name: "🐍 3 • Clang ${{ matrix.clang }} • x64"
+        std:
+          - 11
+        include:
+          - clang: 5
+            std: 14
+          - clang: 10
+            std: 20
+          - clang: 10
+            std: 17
+    name: "🐍 3 • Clang ${{ matrix.clang }} • C++${{ matrix.std }} • x64"
     container: "silkeh/clang:${{ matrix.clang }}"
@@ -214,6 +311,7 @@ jobs:
         cmake -S . -B build
+        -DCMAKE_CXX_STANDARD=${{ matrix.std }}
         -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
     - name: Build
@@ -252,50 +350,54 @@ jobs:
       run: cmake --build build --target pytest
-  # Testing CentOS 8 + PGI compilers
-  centos-nvhpc8:
-    runs-on: ubuntu-latest
-    name: "🐍 3 • CentOS8 / PGI 20.7 • x64"
-    container: centos:8
-    steps:
-    - uses: actions/checkout@v2
-    - name: Add Python 3 and a few requirements
-      run: yum update -y && yum install -y git python3-devel python3-numpy python3-pytest make environment-modules
-    - name: Install CMake with pip
-      run: |
-        python3 -m pip install --upgrade pip
-        python3 -m pip install cmake --prefer-binary
-    - name: Install NVidia HPC SDK
-      run: yum -y install https://developer.download.nvidia.com/hpc-sdk/nvhpc-20-7-20.7-1.x86_64.rpm https://developer.download.nvidia.com/hpc-sdk/nvhpc-2020-20.7-1.x86_64.rpm
-    - name: Configure
-      shell: bash
-      run: |
-        source /etc/profile.d/modules.sh
-        module load /opt/nvidia/hpc_sdk/modulefiles/nvhpc/20.7
-        cmake -S . -B build -DDOWNLOAD_CATCH=ON -DCMAKE_CXX_STANDARD=14 -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
-    - name: Build
-      run: cmake --build build -j 2 --verbose
-    - name: Python tests
-      run: cmake --build build --target pytest
-    - name: C++ tests
-      run: cmake --build build --target cpptest
-    - name: Interface test
-      run: cmake --build build --target test_cmake_build
+# TODO: Internal compiler error - report to NVidia
+#  # Testing CentOS 8 + PGI compilers
+#  centos-nvhpc8:
+#    runs-on: ubuntu-latest
+#    name: "🐍 3 • CentOS8 / PGI 20.11 • x64"
+#    container: centos:8
+#    steps:
+#    - uses: actions/checkout@v2
+#    - name: Add Python 3 and a few requirements
+#      run: yum update -y && yum install -y git python3-devel python3-numpy python3-pytest make environment-modules
+#    - name: Install CMake with pip
+#      run: |
+#        python3 -m pip install --upgrade pip
+#        python3 -m pip install cmake --prefer-binary
+#    - name: Install NVidia HPC SDK
+#      run: >
+#        yum -y install
+#        https://developer.download.nvidia.com/hpc-sdk/20.11/nvhpc-20-11-20.11-1.x86_64.rpm
+#        https://developer.download.nvidia.com/hpc-sdk/20.11/nvhpc-2020-20.11-1.x86_64.rpm
+#    - name: Configure
+#      shell: bash
+#      run: |
+#        source /etc/profile.d/modules.sh
+#        module load /opt/nvidia/hpc_sdk/modulefiles/nvhpc/20.11
+#        cmake -S . -B build -DDOWNLOAD_CATCH=ON -DCMAKE_CXX_STANDARD=14 -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+#    - name: Build
+#      run: cmake --build build -j 2 --verbose
+#    - name: Python tests
+#      run: cmake --build build --target pytest
+#    - name: C++ tests
+#      run: cmake --build build --target cpptest
+#    - name: Interface test
+#      run: cmake --build build --target test_cmake_build
   # Testing on CentOS 7 + PGI compilers, which seems to require more workarounds
     runs-on: ubuntu-latest
-    name: "🐍 3 • CentOS7 / PGI 20.7 • x64"
+    name: "🐍 3 • CentOS7 / PGI 20.9 • x64"
     container: centos:7
@@ -305,17 +407,17 @@ jobs:
       run: yum update -y && yum install -y epel-release && yum install -y git python3-devel make environment-modules cmake3
     - name: Install NVidia HPC SDK
-      run:  yum -y install https://developer.download.nvidia.com/hpc-sdk/nvhpc-20-7-20.7-1.x86_64.rpm https://developer.download.nvidia.com/hpc-sdk/nvhpc-2020-20.7-1.x86_64.rpm
+      run:  yum -y install https://developer.download.nvidia.com/hpc-sdk/20.9/nvhpc-20-9-20.9-1.x86_64.rpm https://developer.download.nvidia.com/hpc-sdk/20.9/nvhpc-2020-20.9-1.x86_64.rpm
     # On CentOS 7, we have to filter a few tests (compiler internal error)
-    # and allow deeper templete recursion (not needed on CentOS 8 with a newer
+    # and allow deeper template recursion (not needed on CentOS 8 with a newer
     # standard library). On some systems, you many need further workarounds:
     # https://github.com/pybind/pybind11/pull/2475
     - name: Configure
       shell: bash
       run: |
         source /etc/profile.d/modules.sh
-        module load /opt/nvidia/hpc_sdk/modulefiles/nvhpc/20.7
+        module load /opt/nvidia/hpc_sdk/modulefiles/nvhpc/20.9
         cmake3 -S . -B build -DDOWNLOAD_CATCH=ON \
                             -DCMAKE_CXX_STANDARD=11 \
                             -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)") \
@@ -340,6 +442,7 @@ jobs:
     - name: Interface test
       run: cmake3 --build build --target test_cmake_build
   # Testing on GCC using the GCC docker images (only recent images supported)
     runs-on: ubuntu-latest
@@ -349,8 +452,13 @@ jobs:
           - 7
           - latest
+        std:
+          - 11
+        include:
+          - gcc: 10
+            std: 20
-    name: "🐍 3 • GCC ${{ matrix.gcc }} • x64"
+    name: "🐍 3 • GCC ${{ matrix.gcc }} • C++${{ matrix.std }}• x64"
     container: "gcc:${{ matrix.gcc }}"
@@ -362,10 +470,8 @@ jobs:
     - name: Update pip
       run: python3 -m pip install --upgrade pip
-    - name: Setup CMake 3.18
-      uses: jwlawson/actions-setup-cmake@v1.3
-      with:
-        cmake-version: 3.18
+    - name: Update CMake
+      uses: jwlawson/actions-setup-cmake@v1.12
     - name: Configure
       shell: bash
@@ -373,7 +479,7 @@ jobs:
         cmake -S . -B build
+        -DCMAKE_CXX_STANDARD=${{ matrix.std }}
         -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
     - name: Build
@@ -389,6 +495,103 @@ jobs:
       run: cmake --build build --target test_cmake_build
+  # Testing on ICC using the oneAPI apt repo
+  icc:
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+    name: "🐍 3 • ICC latest • x64"
+    steps:
+    - uses: actions/checkout@v2
+    - name: Add apt repo
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y wget build-essential pkg-config cmake ca-certificates gnupg
+        wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB
+        sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB
+        echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+    - name: Add ICC & Python 3
+      run: sudo apt-get update; sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic cmake python3-dev python3-numpy python3-pytest python3-pip
+    - name: Update pip
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        python3 -m pip install --upgrade pip
+    - name: Install dependencies
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        python3 -m pip install -r tests/requirements.txt
+    - name: Configure C++11
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        cmake -S . -B build-11     \
+        -DPYBIND11_WERROR=ON    \
+        -DDOWNLOAD_CATCH=ON     \
+        -DDOWNLOAD_EIGEN=OFF    \
+        -DCMAKE_CXX_STANDARD=11             \
+        -DCMAKE_CXX_COMPILER=$(which icpc)  \
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+    - name: Build C++11
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        cmake --build build-11 -j 2 -v
+    - name: Python tests C++11
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        sudo service apport stop
+        cmake --build build-11 --target check
+    - name: C++ tests C++11
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        cmake --build build-11 --target cpptest
+    - name: Interface test C++11
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        cmake --build build-11 --target test_cmake_build
+    - name: Configure C++17
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        cmake -S . -B build-17     \
+        -DPYBIND11_WERROR=ON    \
+        -DDOWNLOAD_CATCH=ON     \
+        -DDOWNLOAD_EIGEN=OFF    \
+        -DCMAKE_CXX_STANDARD=17             \
+        -DCMAKE_CXX_COMPILER=$(which icpc)  \
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+    - name: Build C++17
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        cmake --build build-17 -j 2 -v
+    - name: Python tests C++17
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        sudo service apport stop
+        cmake --build build-17 --target check
+    - name: C++ tests C++17
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        cmake --build build-17 --target cpptest
+    - name: Interface test C++17
+      run: |
+        set +e; source /opt/intel/oneapi/setvars.sh; set -e
+        cmake --build build-17 --target test_cmake_build
   # Testing on CentOS (manylinux uses a centos base, and this is an easy way
   # to get GCC 4.8, which is the manylinux1 compiler).
@@ -397,11 +600,11 @@ jobs:
       fail-fast: false
-          - 7  # GCC 4.8
-          - 8
+          - centos7  # GCC 4.8
+          - stream8
     name: "🐍 3 • CentOS ${{ matrix.centos }} • x64"
-    container: "centos:${{ matrix.centos }}"
+    container: "quay.io/centos/centos:${{ matrix.centos }}"
     - uses: actions/checkout@v2
@@ -413,12 +616,14 @@ jobs:
       run: python3 -m pip install --upgrade pip
     - name: Install dependencies
-      run: python3 -m pip install cmake -r tests/requirements.txt --prefer-binary
+      run: |
+        python3 -m pip install cmake -r tests/requirements.txt
     - name: Configure
       shell: bash
       run: >
         cmake -S . -B build
+        -DCMAKE_BUILD_TYPE=MinSizeRel
@@ -476,7 +681,7 @@ jobs:
         -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
       working-directory: /build-tests
-    - name: Run tests
+    - name: Python tests
       run: make pytest -j 2
       working-directory: /build-tests
@@ -493,16 +698,13 @@ jobs:
     - uses: actions/setup-python@v2
     - name: Install Doxygen
-      run: sudo apt install -y doxygen
-    - name: Install docs & setup requirements
-      run: python3 -m pip install -r docs/requirements.txt
+      run: sudo apt-get install -y doxygen librsvg2-bin # Changed to rsvg-convert in 20.04
     - name: Build docs
-      run: python3 -m sphinx -W -b html docs docs/.build
+      run: pipx run nox -s docs
     - name: Make SDist
-      run: python3 setup.py sdist
+      run: pipx run nox -s build -- --sdist
     - run: git status --ignored
@@ -514,6 +716,250 @@ jobs:
     - name: Compare Dists (headers only)
       working-directory: include
       run: |
-        python3 -m pip install --user -U ../dist/*
+        python3 -m pip install --user -U ../dist/*.tar.gz
         installed=$(python3 -c "import pybind11; print(pybind11.get_include() + '/pybind11')")
         diff -rq $installed ./pybind11
+  win32:
+    strategy:
+      fail-fast: false
+      matrix:
+        python:
+        - 3.5
+        - 3.6
+        - 3.7
+        - 3.8
+        - 3.9
+        - pypy-3.6
+        include:
+          - python: 3.9
+          - python: 3.8
+            args: -DCMAKE_CXX_STANDARD=17
+    name: "🐍 ${{ matrix.python }} • MSVC 2019 • x86 ${{ matrix.args }}"
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup Python ${{ matrix.python }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python }}
+        architecture: x86
+    - name: Update CMake
+      uses: jwlawson/actions-setup-cmake@v1.12
+    - name: Prepare MSVC
+      uses: ilammy/msvc-dev-cmd@v1.10.0
+      with:
+        arch: x86
+    - name: Prepare env
+      run: |
+        python -m pip install -r tests/requirements.txt
+    # First build - C++11 mode and inplace
+    - name: Configure ${{ matrix.args }}
+      run: >
+        cmake -S . -B build
+        -G "Visual Studio 16 2019" -A Win32
+        ${{ matrix.args }}
+    - name: Build C++11
+      run: cmake --build build -j 2
+    - name: Python tests
+      run: cmake --build build -t pytest
+  win32-msvc2015:
+    name: "🐍 ${{ matrix.python }} • MSVC 2015 • x64"
+    runs-on: windows-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python:
+          - 2.7
+          - 3.6
+          - 3.7
+          # todo: check/cpptest does not support 3.8+ yet
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup 🐍 ${{ matrix.python }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python }}
+    - name: Update CMake
+      uses: jwlawson/actions-setup-cmake@v1.12
+    - name: Prepare MSVC
+      uses: ilammy/msvc-dev-cmd@v1.10.0
+      with:
+        toolset: 14.0
+    - name: Prepare env
+      run: |
+        python -m pip install -r tests/requirements.txt
+    # First build - C++11 mode and inplace
+    - name: Configure
+      run: >
+        cmake -S . -B build
+        -G "Visual Studio 14 2015" -A x64
+    - name: Build C++14
+      run: cmake --build build -j 2
+    - name: Run all checks
+      run: cmake --build build -t check
+  win32-msvc2017:
+    name: "🐍 ${{ matrix.python }} • MSVC 2017 • x64"
+    runs-on: windows-2016
+    strategy:
+      fail-fast: false
+      matrix:
+        python:
+          - 2.7
+          - 3.5
+          - 3.7
+        std:
+          - 14
+        include:
+          - python: 2.7
+            std: 17
+            args: >
+              -DCMAKE_CXX_FLAGS="/permissive- /EHsc /GR"
+          - python: 3.7
+            std: 17
+            args: >
+              -DCMAKE_CXX_FLAGS="/permissive- /EHsc /GR"
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup 🐍 ${{ matrix.python }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python }}
+    - name: Update CMake
+      uses: jwlawson/actions-setup-cmake@v1.12
+    - name: Prepare env
+      run: |
+        python -m pip install -r tests/requirements.txt
+    # First build - C++11 mode and inplace
+    - name: Configure
+      run: >
+        cmake -S . -B build
+        -G "Visual Studio 15 2017" -A x64
+        -DCMAKE_CXX_STANDARD=${{ matrix.std }}
+        ${{ matrix.args }}
+    - name: Build ${{ matrix.std }}
+      run: cmake --build build -j 2
+    - name: Run all checks
+      run: cmake --build build -t check
+  mingw:
+    name: "🐍 3 • windows-latest • ${{ matrix.sys }}"
+    runs-on: windows-latest
+    defaults:
+      run:
+        shell: msys2 {0}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { sys: mingw64, env: x86_64 }
+          - { sys: mingw32, env: i686 }
+    steps:
+    - uses: msys2/setup-msys2@v2
+      with:
+        msystem: ${{matrix.sys}}
+        install: >-
+          git
+          mingw-w64-${{matrix.env}}-gcc
+          mingw-w64-${{matrix.env}}-python-pip
+          mingw-w64-${{matrix.env}}-python-numpy
+          mingw-w64-${{matrix.env}}-python-scipy
+          mingw-w64-${{matrix.env}}-cmake
+          mingw-w64-${{matrix.env}}-make
+          mingw-w64-${{matrix.env}}-python-pytest
+          mingw-w64-${{matrix.env}}-eigen3
+          mingw-w64-${{matrix.env}}-boost
+          mingw-w64-${{matrix.env}}-catch
+    - uses: actions/checkout@v2
+    - name: Configure C++11
+      # LTO leads to many undefined reference like
+      # `pybind11::detail::function_call::function_call(pybind11::detail::function_call&&)
+      run: cmake -G "MinGW Makefiles" -DCMAKE_CXX_STANDARD=11 -S . -B build
+    - name: Build C++11
+      run: cmake --build build -j 2
+    - name: Python tests C++11
+      run: cmake --build build --target pytest -j 2
+    - name: C++11 tests
+      run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build --target cpptest -j 2
+    - name: Interface test C++11
+      run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build --target test_cmake_build
+    - name: Clean directory
+      run: git clean -fdx
+    - name: Configure C++14
+      run: cmake -G "MinGW Makefiles" -DCMAKE_CXX_STANDARD=14 -S . -B build2
+    - name: Build C++14
+      run: cmake --build build2 -j 2
+    - name: Python tests C++14
+      run: cmake --build build2 --target pytest -j 2
+    - name: C++14 tests
+      run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build2 --target cpptest -j 2
+    - name: Interface test C++14
+      run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build2 --target test_cmake_build
+    - name: Clean directory
+      run: git clean -fdx
+    - name: Configure C++17
+      run: cmake -G "MinGW Makefiles" -DCMAKE_CXX_STANDARD=17 -S . -B build3
+    - name: Build C++17
+      run: cmake --build build3 -j 2
+    - name: Python tests C++17
+      run: cmake --build build3 --target pytest -j 2
+    - name: C++17 tests
+      run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build3 --target cpptest -j 2
+    - name: Interface test C++17
+      run: PYTHONHOME=/${{matrix.sys}} PYTHONPATH=/${{matrix.sys}} cmake --build build3 --target test_cmake_build
diff --git a/wrap/pybind11/.github/workflows/configure.yml b/wrap/pybind11/.github/workflows/configure.yml
index 3dd248e04a..66ab0e3d78 100644
--- a/wrap/pybind11/.github/workflows/configure.yml
+++ b/wrap/pybind11/.github/workflows/configure.yml
@@ -18,7 +18,7 @@ jobs:
         runs-on: [ubuntu-latest, macos-latest, windows-latest]
         arch: [x64]
-        cmake: [3.18]
+        cmake: ["3.21"]
         - runs-on: ubuntu-latest
@@ -55,7 +55,7 @@ jobs:
     # An action for adding a specific version of CMake:
     #   https://github.com/jwlawson/actions-setup-cmake
     - name: Setup CMake ${{ matrix.cmake }}
-      uses: jwlawson/actions-setup-cmake@v1.3
+      uses: jwlawson/actions-setup-cmake@v1.12
         cmake-version: ${{ matrix.cmake }}
@@ -82,57 +82,3 @@ jobs:
       working-directory: build dir
       if: github.event_name == 'workflow_dispatch'
       run: cmake --build . --config Release --target check
-  # This builds the sdists and wheels and makes sure the files are exactly as
-  # expected. Using Windows and Python 2.7, since that is often the most
-  # challenging matrix element.
-  test-packaging:
-    name: 🐍 2.7 • 📦 tests • windows-latest
-    runs-on: windows-latest
-    steps:
-    - uses: actions/checkout@v2
-    - name: Setup 🐍 2.7
-      uses: actions/setup-python@v2
-      with:
-        python-version: 2.7
-    - name: Prepare env
-      run: python -m pip install -r tests/requirements.txt --prefer-binary
-    - name: Python Packaging tests
-      run: pytest tests/extra_python_package/
-  # This runs the packaging tests and also builds and saves the packages as
-  # artifacts.
-  packaging:
-    name: 🐍 3.8 • 📦 & 📦 tests • ubuntu-latest
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v2
-    - name: Setup 🐍 3.8
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.8
-    - name: Prepare env
-      run: python -m pip install -r tests/requirements.txt build twine --prefer-binary
-    - name: Python Packaging tests
-      run: pytest tests/extra_python_package/
-    - name: Build SDist and wheels
-      run: |
-        python -m build -s -w .
-        PYBIND11_GLOBAL_SDIST=1 python -m build -s -w .
-    - name: Check metadata
-      run: twine check dist/*
-    - uses: actions/upload-artifact@v2
-      with:
-        path: dist/*
diff --git a/wrap/pybind11/.github/workflows/format.yml b/wrap/pybind11/.github/workflows/format.yml
index 28cfeb9b7d..ab7b40503a 100644
--- a/wrap/pybind11/.github/workflows/format.yml
+++ b/wrap/pybind11/.github/workflows/format.yml
@@ -19,15 +19,17 @@ jobs:
     - uses: actions/checkout@v2
     - uses: actions/setup-python@v2
-    - uses: pre-commit/action@v2.0.0
+    - uses: pre-commit/action@v2.0.3
         # Slow hooks are marked with manual - slow is okay here, run them too
-        extra_args: --hook-stage manual
+        extra_args: --hook-stage manual --all-files
+    # When making changes here, please also review the "Clang-Tidy" section
+    # in .github/CONTRIBUTING.md and update as needed.
     name: Clang-Tidy
     runs-on: ubuntu-latest
-    container: silkeh/clang:10
+    container: silkeh/clang:12
     - uses: actions/checkout@v2
@@ -35,7 +37,12 @@ jobs:
       run: apt-get update && apt-get install -y python3-dev python3-pytest
     - name: Configure
-      run: cmake -S . -B build -DCMAKE_CXX_CLANG_TIDY="$(which clang-tidy);--warnings-as-errors=*"
+      run: >
+        cmake -S . -B build
+        -DCMAKE_CXX_CLANG_TIDY="$(which clang-tidy)"
     - name: Build
-      run: cmake --build build -j 2
+      run: cmake --build build -j 2 -- --keep-going
diff --git a/wrap/pybind11/.github/workflows/labeler.yml b/wrap/pybind11/.github/workflows/labeler.yml
new file mode 100644
index 0000000000..d2b5979681
--- /dev/null
+++ b/wrap/pybind11/.github/workflows/labeler.yml
@@ -0,0 +1,16 @@
+name: Labeler
+  pull_request_target:
+    types: [closed]
+  label:
+    name: Labeler
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/labeler@main
+      if: github.event.pull_request.merged == true
+      with:
+        repo-token: ${{ secrets.GITHUB_TOKEN }}
+        configuration-path: .github/labeler_merged.yml
diff --git a/wrap/pybind11/.github/workflows/pip.yml b/wrap/pybind11/.github/workflows/pip.yml
new file mode 100644
index 0000000000..f74b79f0ce
--- /dev/null
+++ b/wrap/pybind11/.github/workflows/pip.yml
@@ -0,0 +1,108 @@
+name: Pip
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+    - master
+    - stable
+    - v*
+  release:
+    types:
+    - published
+  # This builds the sdists and wheels and makes sure the files are exactly as
+  # expected. Using Windows and Python 2.7, since that is often the most
+  # challenging matrix element.
+  test-packaging:
+    name: 🐍 2.7 • 📦 tests • windows-latest
+    runs-on: windows-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup 🐍 2.7
+      uses: actions/setup-python@v2
+      with:
+        python-version: 2.7
+    - name: Prepare env
+      run: |
+        python -m pip install -r tests/requirements.txt
+    - name: Python Packaging tests
+      run: pytest tests/extra_python_package/
+  # This runs the packaging tests and also builds and saves the packages as
+  # artifacts.
+  packaging:
+    name: 🐍 3.8 • 📦 & 📦 tests • ubuntu-latest
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup 🐍 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+    - name: Prepare env
+      run: |
+        python -m pip install -r tests/requirements.txt build twine
+    - name: Python Packaging tests
+      run: pytest tests/extra_python_package/
+    - name: Build SDist and wheels
+      run: |
+        python -m build
+        PYBIND11_GLOBAL_SDIST=1 python -m build
+    - name: Check metadata
+      run: twine check dist/*
+    - name: Save standard package
+      uses: actions/upload-artifact@v2
+      with:
+        name: standard
+        path: dist/pybind11-*
+    - name: Save global package
+      uses: actions/upload-artifact@v2
+      with:
+        name: global
+        path: dist/pybind11_global-*
+  # When a GitHub release is made, upload the artifacts to PyPI
+  upload:
+    name: Upload to PyPI
+    runs-on: ubuntu-latest
+    if: github.event_name == 'release' && github.event.action == 'published'
+    needs: [packaging]
+    steps:
+    - uses: actions/setup-python@v2
+    # Downloads all to directories matching the artifact names
+    - uses: actions/download-artifact@v2
+    - name: Publish standard package
+      uses: pypa/gh-action-pypi-publish@v1.5.0
+      with:
+        password: ${{ secrets.pypi_password }}
+        packages_dir: standard/
+    - name: Publish global package
+      uses: pypa/gh-action-pypi-publish@v1.5.0
+      with:
+        password: ${{ secrets.pypi_password_global }}
+        packages_dir: global/
diff --git a/wrap/pybind11/.github/workflows/upstream.yml b/wrap/pybind11/.github/workflows/upstream.yml
new file mode 100644
index 0000000000..138c9ad292
--- /dev/null
+++ b/wrap/pybind11/.github/workflows/upstream.yml
@@ -0,0 +1,112 @@
+name: Upstream
+  workflow_dispatch:
+  pull_request:
+  group: upstream-${{ github.ref }}
+  cancel-in-progress: true
+  standard:
+    name: "🐍 3.11 dev • ubuntu-latest • x64"
+    runs-on: ubuntu-latest
+    if: "contains(github.event.pull_request.labels.*.name, 'python dev')"
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup Python 3.11
+      uses: actions/setup-python@v2
+      with:
+        python-version: "3.11-dev"
+    - name: Setup Boost (Linux)
+      if: runner.os == 'Linux'
+      run: sudo apt-get install libboost-dev
+    - name: Update CMake
+      uses: jwlawson/actions-setup-cmake@v1.12
+    - name: Prepare env
+      run: |
+        python -m pip install -r tests/requirements.txt
+    - name: Setup annotations on Linux
+      if: runner.os == 'Linux'
+      run: python -m pip install pytest-github-actions-annotate-failures
+    # First build - C++11 mode and inplace
+    - name: Configure C++11
+      run: >
+        cmake -S . -B .
+    - name: Build C++11
+      run: cmake --build . -j 2
+    - name: Python tests C++11
+      run: cmake --build . --target pytest -j 2
+    - name: C++11 tests
+      run: cmake --build .  --target cpptest -j 2
+    - name: Interface test C++11
+      run: cmake --build . --target test_cmake_build
+    - name: Clean directory
+      run: git clean -fdx
+    # Second build - C++17 mode and in a build directory
+    - name: Configure C++17
+      run: >
+        cmake -S . -B build2
+        ${{ matrix.args }}
+        ${{ matrix.args2 }}
+    - name: Build
+      run: cmake --build build2 -j 2
+    - name: Python tests
+      run: cmake --build build2 --target pytest
+    - name: C++ tests
+      run: cmake --build build2 --target cpptest
+    # Third build - C++17 mode with unstable ABI
+    - name: Configure (unstable ABI)
+      run: >
+        cmake -S . -B build3
+        -DPYBIND11_INTERNALS_VERSION=10000000
+        "-DPYBIND11_TEST_OVERRIDE=test_call_policies.cpp;test_gil_scoped.cpp;test_thread.cpp"
+        ${{ matrix.args }}
+    - name: Build (unstable ABI)
+      run: cmake --build build3 -j 2
+    - name: Python tests (unstable ABI)
+      run: cmake --build build3 --target pytest
+    - name: Interface test
+      run: cmake --build build2 --target test_cmake_build
+    # This makes sure the setup_helpers module can build packages using
+    # setuptools
+    - name: Setuptools helpers test
+      run: pytest tests/extra_setuptools
diff --git a/wrap/pybind11/.gitignore b/wrap/pybind11/.gitignore
index 3f36b89e0c..3cf4fbbda0 100644
--- a/wrap/pybind11/.gitignore
+++ b/wrap/pybind11/.gitignore
@@ -41,3 +41,5 @@ pybind11Targets.cmake
diff --git a/wrap/pybind11/.pre-commit-config.yaml b/wrap/pybind11/.pre-commit-config.yaml
index 71513c991c..2014cb2b42 100644
--- a/wrap/pybind11/.pre-commit-config.yaml
+++ b/wrap/pybind11/.pre-commit-config.yaml
@@ -15,12 +15,14 @@
 # Standard hooks
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.2.0
+  rev: v4.1.0
   - id: check-added-large-files
   - id: check-case-conflict
+  - id: check-docstring-first
   - id: check-merge-conflict
   - id: check-symlinks
+  - id: check-toml
   - id: check-yaml
   - id: debug-statements
   - id: end-of-file-fixer
@@ -28,54 +30,115 @@ repos:
   - id: requirements-txt-fixer
   - id: trailing-whitespace
   - id: fix-encoding-pragma
+    exclude: ^noxfile.py$
+- repo: https://github.com/asottile/pyupgrade
+  rev: v2.31.0
+  hooks:
+  - id: pyupgrade
+- repo: https://github.com/PyCQA/isort
+  rev: 5.10.1
+  hooks:
+  - id: isort
 # Black, the code formatter, natively supports pre-commit
 - repo: https://github.com/psf/black
-  rev: 20.8b1
+  rev: 21.12b0 # Keep in sync with blacken-docs
   - id: black
-    # Not all Python files are Blacked, yet
-    files: ^(setup.py|pybind11|tests/extra)
+- repo: https://github.com/asottile/blacken-docs
+  rev: v1.12.0
+  hooks:
+  - id: blacken-docs
+    additional_dependencies:
+    - black==21.12b0 # keep in sync with black hook
 # Changes tabs to spaces
 - repo: https://github.com/Lucas-C/pre-commit-hooks
-  rev: v1.1.9
+  rev: v1.1.10
   - id: remove-tabs
+# Autoremoves unused imports
+- repo: https://github.com/hadialqattan/pycln
+  rev: v1.1.0
+  hooks:
+  - id: pycln
+- repo: https://github.com/pre-commit/pygrep-hooks
+  rev: v1.9.0
+  hooks:
+  - id: python-check-blanket-noqa
+  - id: python-check-blanket-type-ignore
+  - id: python-no-log-warn
+  - id: rst-backticks
+  - id: rst-directive-colons
+  - id: rst-inline-touching-normal
 # Flake8 also supports pre-commit natively (same author)
-- repo: https://gitlab.com/pycqa/flake8
-  rev: 3.8.3
+- repo: https://github.com/PyCQA/flake8
+  rev: 4.0.1
   - id: flake8
-    additional_dependencies: [flake8-bugbear, pep8-naming]
+    additional_dependencies: &flake8_dependencies
+      - flake8-bugbear
+      - pep8-naming
     exclude: ^(docs/.*|tools/.*)$
+- repo: https://github.com/asottile/yesqa
+  rev: v1.3.0
+  hooks:
+  - id: yesqa
+    additional_dependencies: *flake8_dependencies
 # CMake formatting
 - repo: https://github.com/cheshirekow/cmake-format-precommit
-  rev: v0.6.11
+  rev: v0.6.13
   - id: cmake-format
     additional_dependencies: [pyyaml]
     types: [file]
     files: (\.cmake|CMakeLists.txt)(.in)?$
+# Check static types with mypy
+- repo: https://github.com/pre-commit/mirrors-mypy
+  rev: v0.931
+  hooks:
+  - id: mypy
+    # Running per-file misbehaves a bit, so just run on all files, it's fast
+    pass_filenames: false
+    additional_dependencies: [typed_ast]
 # Checks the manifest for missing files (native support)
 - repo: https://github.com/mgedmin/check-manifest
-  rev: "0.42"
+  rev: "0.47"
   - id: check-manifest
     # This is a slow hook, so only run this if --hook-stage manual is passed
     stages: [manual]
     additional_dependencies: [cmake, ninja]
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.1.0
+  hooks:
+  - id: codespell
+    exclude: ".supp$"
+    args: ["-L", "nd,ot,thist"]
+- repo: https://github.com/shellcheck-py/shellcheck-py
+  rev: v0.8.0.3
+  hooks:
+  - id: shellcheck
 # The original pybind11 checks for a few C++ style items
 - repo: local
   - id: disallow-caps
     name: Disallow improper capitalization
     language: pygrep
-    entry: PyBind|Numpy|Cmake
+    entry: PyBind|Numpy|Cmake|CCache|PyTest
     exclude: .pre-commit-config.yaml
 - repo: local
diff --git a/wrap/pybind11/CMakeLists.txt b/wrap/pybind11/CMakeLists.txt
index 123abf77d1..3787982cbd 100644
--- a/wrap/pybind11/CMakeLists.txt
+++ b/wrap/pybind11/CMakeLists.txt
@@ -7,13 +7,18 @@
 cmake_minimum_required(VERSION 3.4)
-# The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
+# The `cmake_minimum_required(VERSION 3.4...3.22)` syntax does not work with
 # some versions of VS that have a patched CMake 3.11. This forces us to emulate
 # the behavior using the following workaround:
-  cmake_policy(VERSION 3.18)
+  cmake_policy(VERSION 3.22)
+# Avoid infinite recursion if tests include this as a subdirectory
+  return()
 # Extract project version from source
+  set(pybind11_system "")
   set(pybind11_system SYSTEM)
@@ -82,6 +91,9 @@ endif()
 option(PYBIND11_INSTALL "Install pybind11 header files?" ${PYBIND11_MASTER_PROJECT})
 option(PYBIND11_TEST "Build pybind11 test suite?" ${PYBIND11_MASTER_PROJECT})
 option(PYBIND11_NOPYTHON "Disable search for Python" OFF)
+    ""
+    CACHE STRING "Override the ABI version, may be used to enable the unstable ABI.")
@@ -98,6 +110,7 @@ set(PYBIND11_HEADERS
+    include/pybind11/detail/type_caster_base.h
@@ -109,6 +122,7 @@ set(PYBIND11_HEADERS
+    include/pybind11/gil.h
@@ -116,7 +130,8 @@ set(PYBIND11_HEADERS
-    include/pybind11/stl_bind.h)
+    include/pybind11/stl_bind.h
+    include/pybind11/stl/filesystem.h)
 # Compare with grep and warn if mismatched
@@ -142,22 +157,45 @@ endif()
 string(REPLACE "include/" "${CMAKE_CURRENT_SOURCE_DIR}/include/" PYBIND11_HEADERS
-# Cache variables so pybind11_add_module can be used in parent projects
+# Cache variable so this can be used in parent projects
+    CACHE INTERNAL "Directory where pybind11 headers are located")
+# Backward compatible variable for add_subdirectory mode
+      "${pybind11_INCLUDE_DIR}"
 # Note: when creating targets, you cannot use if statements at configure time -
 # you need generator expressions, because those will be placed in the target file.
 # You can also place ifs *in* the Config.in, but not here.
 # This section builds targets, but does *not* touch Python
-# Build the headers-only target (no Python included):
-# (long name used here to keep this from clashing in subdirectory mode)
-add_library(pybind11_headers INTERFACE)
-add_library(pybind11::pybind11_headers ALIAS pybind11_headers) # to match exported target
-add_library(pybind11::headers ALIAS pybind11_headers) # easier to use/remember
+# Non-IMPORT targets cannot be defined twice
+if(NOT TARGET pybind11_headers)
+  # Build the headers-only target (no Python included):
+  # (long name used here to keep this from clashing in subdirectory mode)
+  add_library(pybind11_headers INTERFACE)
+  add_library(pybind11::pybind11_headers ALIAS pybind11_headers) # to match exported target
+  add_library(pybind11::headers ALIAS pybind11_headers) # easier to use/remember
+  target_include_directories(
+    pybind11_headers ${pybind11_system} INTERFACE $<BUILD_INTERFACE:${pybind11_INCLUDE_DIR}>
+                                                  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+  target_compile_features(pybind11_headers INTERFACE cxx_inheriting_constructors cxx_user_literals
+                                                     cxx_right_angle_brackets)
+    target_compile_definitions(
+  endif()
+  # It is invalid to install a target twice, too.
-# Fill in headers target
-  pybind11_headers ${pybind11_system} INTERFACE $<BUILD_INTERFACE:${PYBIND11_INCLUDE_DIR}>
-                                                $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-target_compile_features(pybind11_headers INTERFACE cxx_inheriting_constructors cxx_user_literals
-                                                   cxx_right_angle_brackets)
-  # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
-      "share/cmake/${PROJECT_NAME}"
       CACHE STRING "install path for pybind11Config.cmake")
+  else()
+  endif()
     tools/${PROJECT_NAME}Config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
@@ -260,8 +295,5 @@ endif()
-      CACHE INTERNAL "true if pybind11 and all required components found on the system")
-  set(pybind11_INCLUDE_DIR
-      "${PYBIND11_INCLUDE_DIR}"
-      CACHE INTERNAL "Directory where pybind11 headers are located")
+      CACHE INTERNAL "True if pybind11 and all required components found on the system")
diff --git a/wrap/pybind11/MANIFEST.in b/wrap/pybind11/MANIFEST.in
index 9336b60302..aed183e874 100644
--- a/wrap/pybind11/MANIFEST.in
+++ b/wrap/pybind11/MANIFEST.in
@@ -1,4 +1,6 @@
 recursive-include pybind11/include/pybind11 *.h
 recursive-include pybind11 *.py
+recursive-include pybind11 py.typed
+recursive-include pybind11 *.pyi
 include pybind11/share/cmake/pybind11/*.cmake
-include LICENSE README.md pyproject.toml setup.py setup.cfg
+include LICENSE README.rst pyproject.toml setup.py setup.cfg
diff --git a/wrap/pybind11/README.md b/wrap/pybind11/README.md
deleted file mode 100644
index 69a0fc90b2..0000000000
--- a/wrap/pybind11/README.md
+++ /dev/null
@@ -1,145 +0,0 @@
-![pybind11 logo](https://github.com/pybind/pybind11/raw/master/docs/pybind11-logo.png)
-# pybind11 — Seamless operability between C++11 and Python
-[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=master)](http://pybind11.readthedocs.org/en/master/?badge=master)
-[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=stable)](http://pybind11.readthedocs.org/en/stable/?badge=stable)
-[![Gitter chat](https://img.shields.io/gitter/room/gitterHQ/gitter.svg)](https://gitter.im/pybind/Lobby)
-[![Build status](https://ci.appveyor.com/api/projects/status/riaj54pn4h08xy40?svg=true)](https://ci.appveyor.com/project/wjakob/pybind11)
-**pybind11** is a lightweight header-only library that exposes C++ types in
-Python and vice versa, mainly to create Python bindings of existing C++ code.
-Its goals and syntax are similar to the excellent [Boost.Python][] library by
-David Abrahams: to minimize boilerplate code in traditional extension modules
-by inferring type information using compile-time introspection.
-The main issue with Boost.Python—and the reason for creating such a similar
-project—is Boost. Boost is an enormously large and complex suite of utility
-libraries that works with almost every C++ compiler in existence. This
-compatibility has its cost: arcane template tricks and workarounds are
-necessary to support the oldest and buggiest of compiler specimens. Now that
-C++11-compatible compilers are widely available, this heavy machinery has
-become an excessively large and unnecessary dependency.
-Think of this library as a tiny self-contained version of Boost.Python with
-everything stripped away that isn't relevant for binding generation. Without
-comments, the core header files only require ~4K lines of code and depend on
-Python (2.7 or 3.5+, or PyPy) and the C++ standard library. This compact
-implementation was possible thanks to some of the new C++11 language features
-(specifically: tuples, lambda functions and variadic templates). Since its
-creation, this library has grown beyond Boost.Python in many ways, leading to
-dramatically simpler binding code in many common situations.
-Tutorial and reference documentation is provided at
-[pybind11.readthedocs.org][].  A PDF version of the manual is available
-## Core features
-pybind11 can map the following core C++ features to Python:
-- Functions accepting and returning custom data structures per value, reference, or pointer
-- Instance methods and static methods
-- Overloaded functions
-- Instance attributes and static attributes
-- Arbitrary exception types
-- Enumerations
-- Callbacks
-- Iterators and ranges
-- Custom operators
-- Single and multiple inheritance
-- STL data structures
-- Smart pointers with reference counting like `std::shared_ptr`
-- Internal references with correct reference counting
-- C++ classes with virtual (and pure virtual) methods can be extended in Python
-## Goodies
-In addition to the core functionality, pybind11 provides some extra goodies:
-- Python 2.7, 3.5+, and PyPy (tested on 7.3) are supported with an implementation-agnostic
-  interface.
-- It is possible to bind C++11 lambda functions with captured variables. The
-  lambda capture data is stored inside the resulting Python function object.
-- pybind11 uses C++11 move constructors and move assignment operators whenever
-  possible to efficiently transfer custom data types.
-- It's easy to expose the internal storage of custom data types through
-  Pythons' buffer protocols. This is handy e.g. for fast conversion between
-  C++ matrix classes like Eigen and NumPy without expensive copy operations.
-- pybind11 can automatically vectorize functions so that they are transparently
-  applied to all entries of one or more NumPy array arguments.
-- Python's slice-based access and assignment operations can be supported with
-  just a few lines of code.
-- Everything is contained in just a few header files; there is no need to link
-  against any additional libraries.
-- Binaries are generally smaller by a factor of at least 2 compared to
-  equivalent bindings generated by Boost.Python. A recent pybind11 conversion
-  of PyRosetta, an enormous Boost.Python binding project,
-  [reported][pyrosetta-report] a binary size reduction of **5.4x** and compile
-  time reduction by **5.8x**.
-- Function signatures are precomputed at compile time (using `constexpr`),
-  leading to smaller binaries.
-- With little extra effort, C++ types can be pickled and unpickled similar to
-  regular Python objects.
-## Supported compilers
-1. Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer)
-2. GCC 4.8 or newer
-3. Microsoft Visual Studio 2015 Update 3 or newer
-4. Intel C++ compiler 17 or newer (16 with pybind11 v2.0 and 15 with pybind11
-   v2.0 and a [workaround][intel-15-workaround])
-5. Cygwin/GCC (tested on 2.5.1)
-6. NVCC (CUDA 11 tested)
-7. NVIDIA PGI (20.7 tested)
-## About
-This project was created by [Wenzel Jakob](http://rgl.epfl.ch/people/wjakob).
-Significant features and/or improvements to the code were contributed by
-Jonas Adler,
-Lori A. Burns,
-Sylvain Corlay,
-Trent Houliston,
-Axel Huebl,
-Sergey Lyskov
-Johan Mabille,
-Tomasz Miąsko,
-Dean Moldovan,
-Ben Pritchard,
-Jason Rhinelander,
-Boris Schäling,
-Pim Schellart,
-Henry Schreiner,
-Ivan Smirnov, and
-Patrick Stewart.
-### Contributing
-See the [contributing guide][] for information on building and contributing to
-### License
-pybind11 is provided under a BSD-style license that can be found in the
-[`LICENSE`][] file. By using, distributing, or contributing to this project,
-you agree to the terms and conditions of this license.
-[pybind11.readthedocs.org]: http://pybind11.readthedocs.org/en/master
-[docs-pdf]: https://media.readthedocs.org/pdf/pybind11/master/pybind11.pdf
-[Boost.Python]: http://www.boost.org/doc/libs/1_58_0/libs/python/doc/
-[pyrosetta-report]: http://graylab.jhu.edu/RosettaCon2016/PyRosetta-4.pdf
-[contributing guide]:  https://github.com/pybind/pybind11/blob/master/.github/CONTRIBUTING.md
-[`LICENSE`]: https://github.com/pybind/pybind11/blob/master/LICENSE
-[intel-15-workaround]: https://github.com/pybind/pybind11/issues/276
diff --git a/wrap/pybind11/README.rst b/wrap/pybind11/README.rst
new file mode 100644
index 0000000000..45c4af5a60
--- /dev/null
+++ b/wrap/pybind11/README.rst
@@ -0,0 +1,180 @@
+.. figure:: https://github.com/pybind/pybind11/raw/master/docs/pybind11-logo.png
+   :alt: pybind11 logo
+**pybind11 — Seamless operability between C++11 and Python**
+|Latest Documentation Status| |Stable Documentation Status| |Gitter chat| |GitHub Discussions| |CI| |Build status|
+|Repology| |PyPI package| |Conda-forge| |Python Versions|
+`Setuptools example <https://github.com/pybind/python_example>`_
+• `Scikit-build example <https://github.com/pybind/scikit_build_example>`_
+• `CMake example <https://github.com/pybind/cmake_example>`_
+.. start
+**pybind11** is a lightweight header-only library that exposes C++ types
+in Python and vice versa, mainly to create Python bindings of existing
+C++ code. Its goals and syntax are similar to the excellent
+`Boost.Python <http://www.boost.org/doc/libs/1_58_0/libs/python/doc/>`_
+library by David Abrahams: to minimize boilerplate code in traditional
+extension modules by inferring type information using compile-time
+The main issue with Boost.Python—and the reason for creating such a
+similar project—is Boost. Boost is an enormously large and complex suite
+of utility libraries that works with almost every C++ compiler in
+existence. This compatibility has its cost: arcane template tricks and
+workarounds are necessary to support the oldest and buggiest of compiler
+specimens. Now that C++11-compatible compilers are widely available,
+this heavy machinery has become an excessively large and unnecessary
+Think of this library as a tiny self-contained version of Boost.Python
+with everything stripped away that isn’t relevant for binding
+generation. Without comments, the core header files only require ~4K
+lines of code and depend on Python (2.7 or 3.5+, or PyPy) and the C++
+standard library. This compact implementation was possible thanks to
+some of the new C++11 language features (specifically: tuples, lambda
+functions and variadic templates). Since its creation, this library has
+grown beyond Boost.Python in many ways, leading to dramatically simpler
+binding code in many common situations.
+Tutorial and reference documentation is provided at
+`pybind11.readthedocs.io <https://pybind11.readthedocs.io/en/latest>`_.
+A PDF version of the manual is available
+`here <https://pybind11.readthedocs.io/_/downloads/en/latest/pdf/>`_.
+And the source code is always available at
+`github.com/pybind/pybind11 <https://github.com/pybind/pybind11>`_.
+Core features
+pybind11 can map the following core C++ features to Python:
+- Functions accepting and returning custom data structures per value,
+  reference, or pointer
+- Instance methods and static methods
+- Overloaded functions
+- Instance attributes and static attributes
+- Arbitrary exception types
+- Enumerations
+- Callbacks
+- Iterators and ranges
+- Custom operators
+- Single and multiple inheritance
+- STL data structures
+- Smart pointers with reference counting like ``std::shared_ptr``
+- Internal references with correct reference counting
+- C++ classes with virtual (and pure virtual) methods can be extended
+  in Python
+In addition to the core functionality, pybind11 provides some extra
+- Python 2.7, 3.5+, and PyPy/PyPy3 7.3 are supported with an
+  implementation-agnostic interface.
+- It is possible to bind C++11 lambda functions with captured
+  variables. The lambda capture data is stored inside the resulting
+  Python function object.
+- pybind11 uses C++11 move constructors and move assignment operators
+  whenever possible to efficiently transfer custom data types.
+- It’s easy to expose the internal storage of custom data types through
+  Pythons’ buffer protocols. This is handy e.g. for fast conversion
+  between C++ matrix classes like Eigen and NumPy without expensive
+  copy operations.
+- pybind11 can automatically vectorize functions so that they are
+  transparently applied to all entries of one or more NumPy array
+  arguments.
+- Python's slice-based access and assignment operations can be
+  supported with just a few lines of code.
+- Everything is contained in just a few header files; there is no need
+  to link against any additional libraries.
+- Binaries are generally smaller by a factor of at least 2 compared to
+  equivalent bindings generated by Boost.Python. A recent pybind11
+  conversion of PyRosetta, an enormous Boost.Python binding project,
+  `reported <https://graylab.jhu.edu/Sergey/2016.RosettaCon/PyRosetta-4.pdf>`_
+  a binary size reduction of **5.4x** and compile time reduction by
+  **5.8x**.
+- Function signatures are precomputed at compile time (using
+  ``constexpr``), leading to smaller binaries.
+- With little extra effort, C++ types can be pickled and unpickled
+  similar to regular Python objects.
+Supported compilers
+1. Clang/LLVM 3.3 or newer (for Apple Xcode’s clang, this is 5.0.0 or
+   newer)
+2. GCC 4.8 or newer
+3. Microsoft Visual Studio 2015 Update 3 or newer
+4. Intel classic C++ compiler 18 or newer (ICC 20.2 tested in CI)
+5. Cygwin/GCC (previously tested on 2.5.1)
+6. NVCC (CUDA 11.0 tested in CI)
+7. NVIDIA PGI (20.9 tested in CI)
+This project was created by `Wenzel
+Jakob <http://rgl.epfl.ch/people/wjakob>`_. Significant features and/or
+improvements to the code were contributed by Jonas Adler, Lori A. Burns,
+Sylvain Corlay, Eric Cousineau, Aaron Gokaslan, Ralf Grosse-Kunstleve, Trent Houliston, Axel
+Huebl, @hulucc, Yannick Jadoul, Sergey Lyskov Johan Mabille, Tomasz Miąsko,
+Dean Moldovan, Ben Pritchard, Jason Rhinelander, Boris Schäling, Pim
+Schellart, Henry Schreiner, Ivan Smirnov, Boris Staletic, and Patrick Stewart.
+We thank Google for a generous financial contribution to the continuous
+integration infrastructure used by this project.
+See the `contributing
+guide <https://github.com/pybind/pybind11/blob/master/.github/CONTRIBUTING.md>`_
+for information on building and contributing to pybind11.
+pybind11 is provided under a BSD-style license that can be found in the
+`LICENSE <https://github.com/pybind/pybind11/blob/master/LICENSE>`_
+file. By using, distributing, or contributing to this project, you agree
+to the terms and conditions of this license.
+.. |Latest Documentation Status| image:: https://readthedocs.org/projects/pybind11/badge?version=latest
+   :target: http://pybind11.readthedocs.org/en/latest
+.. |Stable Documentation Status| image:: https://img.shields.io/badge/docs-stable-blue.svg
+   :target: http://pybind11.readthedocs.org/en/stable
+.. |Gitter chat| image:: https://img.shields.io/gitter/room/gitterHQ/gitter.svg
+   :target: https://gitter.im/pybind/Lobby
+.. |CI| image:: https://github.com/pybind/pybind11/workflows/CI/badge.svg
+   :target: https://github.com/pybind/pybind11/actions
+.. |Build status| image:: https://ci.appveyor.com/api/projects/status/riaj54pn4h08xy40?svg=true
+   :target: https://ci.appveyor.com/project/wjakob/pybind11
+.. |PyPI package| image:: https://img.shields.io/pypi/v/pybind11.svg
+   :target: https://pypi.org/project/pybind11/
+.. |Conda-forge| image:: https://img.shields.io/conda/vn/conda-forge/pybind11.svg
+   :target: https://github.com/conda-forge/pybind11-feedstock
+.. |Repology| image:: https://repology.org/badge/latest-versions/python:pybind11.svg
+   :target: https://repology.org/project/python:pybind11/versions
+.. |Python Versions| image:: https://img.shields.io/pypi/pyversions/pybind11.svg
+   :target: https://pypi.org/project/pybind11/
+.. |GitHub Discussions| image:: https://img.shields.io/static/v1?label=Discussions&message=Ask&color=blue&logo=github
+   :target: https://github.com/pybind/pybind11/discussions
diff --git a/wrap/pybind11/docs/Doxyfile b/wrap/pybind11/docs/Doxyfile
index 24ece0d8db..62c2675563 100644
--- a/wrap/pybind11/docs/Doxyfile
+++ b/wrap/pybind11/docs/Doxyfile
@@ -18,5 +18,5 @@ ALIASES               += "endrst=\endverbatim"
 QUIET                  = YES
 WARNINGS               = YES
-                         PY_MAJOR_VERSION=3
+                         PYBIND11_NOINLINE
diff --git a/wrap/pybind11/docs/advanced/cast/custom.rst b/wrap/pybind11/docs/advanced/cast/custom.rst
index a779444c24..1df4d3e14b 100644
--- a/wrap/pybind11/docs/advanced/cast/custom.rst
+++ b/wrap/pybind11/docs/advanced/cast/custom.rst
@@ -26,7 +26,9 @@ The following Python snippet demonstrates the intended usage from the Python sid
         def __int__(self):
             return 123
     from example import print
 To register the necessary conversion routines, it is necessary to add an
@@ -44,7 +46,7 @@ type is explicitly allowed.
              * function signatures and declares a local variable
              * 'value' of type inty
-            PYBIND11_TYPE_CASTER(inty, _("inty"));
+            PYBIND11_TYPE_CASTER(inty, const_name("inty"));
              * Conversion part 1 (Python->C++): convert a PyObject into a inty
diff --git a/wrap/pybind11/docs/advanced/cast/eigen.rst b/wrap/pybind11/docs/advanced/cast/eigen.rst
index e01472d5ae..a5c11a3f14 100644
--- a/wrap/pybind11/docs/advanced/cast/eigen.rst
+++ b/wrap/pybind11/docs/advanced/cast/eigen.rst
@@ -52,7 +52,7 @@ can be mapped *and* if the numpy array is writeable (that is
 the passed variable will be transparently carried out directly on the
-This means you can can write code such as the following and have it work as
+This means you can write code such as the following and have it work as
 .. code-block:: cpp
@@ -112,7 +112,7 @@ example:
 .. code-block:: python
     a = MyClass()
-    m = a.get_matrix()   # flags.writeable = True,  flags.owndata = False
+    m = a.get_matrix()  # flags.writeable = True,  flags.owndata = False
     v = a.view_matrix()  # flags.writeable = False, flags.owndata = False
     c = a.copy_matrix()  # flags.writeable = True,  flags.owndata = True
     # m[5,6] and v[5,6] refer to the same element, c[5,6] does not.
@@ -203,7 +203,7 @@ adding the ``order='F'`` option when creating an array:
 .. code-block:: python
-    myarray = np.array(source, order='F')
+    myarray = np.array(source, order="F")
 Such an object will be passable to a bound function accepting an
 ``Eigen::Ref<MatrixXd>`` (or similar column-major Eigen type).
diff --git a/wrap/pybind11/docs/advanced/cast/overview.rst b/wrap/pybind11/docs/advanced/cast/overview.rst
index b0e32a52f9..6a834a3e51 100644
--- a/wrap/pybind11/docs/advanced/cast/overview.rst
+++ b/wrap/pybind11/docs/advanced/cast/overview.rst
@@ -75,91 +75,97 @@ The following basic data types are supported out of the box (some may require
 an additional extension header to be included). To pass other data structures
 as arguments and return values, refer to the section on binding :ref:`classes`.
-|  Data type                         |  Description              | Header file                   |
-| ``int8_t``, ``uint8_t``            | 8-bit integers            | :file:`pybind11/pybind11.h`   |
-| ``int16_t``, ``uint16_t``          | 16-bit integers           | :file:`pybind11/pybind11.h`   |
-| ``int32_t``, ``uint32_t``          | 32-bit integers           | :file:`pybind11/pybind11.h`   |
-| ``int64_t``, ``uint64_t``          | 64-bit integers           | :file:`pybind11/pybind11.h`   |
-| ``ssize_t``, ``size_t``            | Platform-dependent size   | :file:`pybind11/pybind11.h`   |
-| ``float``, ``double``              | Floating point types      | :file:`pybind11/pybind11.h`   |
-| ``bool``                           | Two-state Boolean type    | :file:`pybind11/pybind11.h`   |
-| ``char``                           | Character literal         | :file:`pybind11/pybind11.h`   |
-| ``char16_t``                       | UTF-16 character literal  | :file:`pybind11/pybind11.h`   |
-| ``char32_t``                       | UTF-32 character literal  | :file:`pybind11/pybind11.h`   |
-| ``wchar_t``                        | Wide character literal    | :file:`pybind11/pybind11.h`   |
-| ``const char *``                   | UTF-8 string literal      | :file:`pybind11/pybind11.h`   |
-| ``const char16_t *``               | UTF-16 string literal     | :file:`pybind11/pybind11.h`   |
-| ``const char32_t *``               | UTF-32 string literal     | :file:`pybind11/pybind11.h`   |
-| ``const wchar_t *``                | Wide string literal       | :file:`pybind11/pybind11.h`   |
-| ``std::string``                    | STL dynamic UTF-8 string  | :file:`pybind11/pybind11.h`   |
-| ``std::u16string``                 | STL dynamic UTF-16 string | :file:`pybind11/pybind11.h`   |
-| ``std::u32string``                 | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h`   |
-| ``std::wstring``                   | STL dynamic wide string   | :file:`pybind11/pybind11.h`   |
-| ``std::string_view``,              | STL C++17 string views    | :file:`pybind11/pybind11.h`   |
-| ``std::u16string_view``, etc.      |                           |                               |
-| ``std::pair<T1, T2>``              | Pair of two custom types  | :file:`pybind11/pybind11.h`   |
-| ``std::tuple<...>``                | Arbitrary tuple of types  | :file:`pybind11/pybind11.h`   |
-| ``std::reference_wrapper<...>``    | Reference type wrapper    | :file:`pybind11/pybind11.h`   |
-| ``std::complex<T>``                | Complex numbers           | :file:`pybind11/complex.h`    |
-| ``std::array<T, Size>``            | STL static array          | :file:`pybind11/stl.h`        |
-| ``std::vector<T>``                 | STL dynamic array         | :file:`pybind11/stl.h`        |
-| ``std::deque<T>``                  | STL double-ended queue    | :file:`pybind11/stl.h`        |
-| ``std::valarray<T>``               | STL value array           | :file:`pybind11/stl.h`        |
-| ``std::list<T>``                   | STL linked list           | :file:`pybind11/stl.h`        |
-| ``std::map<T1, T2>``               | STL ordered map           | :file:`pybind11/stl.h`        |
-| ``std::unordered_map<T1, T2>``     | STL unordered map         | :file:`pybind11/stl.h`        |
-| ``std::set<T>``                    | STL ordered set           | :file:`pybind11/stl.h`        |
-| ``std::unordered_set<T>``          | STL unordered set         | :file:`pybind11/stl.h`        |
-| ``std::optional<T>``               | STL optional type (C++17) | :file:`pybind11/stl.h`        |
-| ``std::experimental::optional<T>`` | STL optional type (exp.)  | :file:`pybind11/stl.h`        |
-| ``std::variant<...>``              | Type-safe union (C++17)   | :file:`pybind11/stl.h`        |
-| ``std::function<...>``             | STL polymorphic function  | :file:`pybind11/functional.h` |
-| ``std::chrono::duration<...>``     | STL time duration         | :file:`pybind11/chrono.h`     |
-| ``std::chrono::time_point<...>``   | STL date/time             | :file:`pybind11/chrono.h`     |
-| ``Eigen::Matrix<...>``             | Eigen: dense matrix       | :file:`pybind11/eigen.h`      |
-| ``Eigen::Map<...>``                | Eigen: mapped memory      | :file:`pybind11/eigen.h`      |
-| ``Eigen::SparseMatrix<...>``       | Eigen: sparse matrix      | :file:`pybind11/eigen.h`      |
+|  Data type                         |  Description              | Header file                       |
+| ``int8_t``, ``uint8_t``            | 8-bit integers            | :file:`pybind11/pybind11.h`       |
+| ``int16_t``, ``uint16_t``          | 16-bit integers           | :file:`pybind11/pybind11.h`       |
+| ``int32_t``, ``uint32_t``          | 32-bit integers           | :file:`pybind11/pybind11.h`       |
+| ``int64_t``, ``uint64_t``          | 64-bit integers           | :file:`pybind11/pybind11.h`       |
+| ``ssize_t``, ``size_t``            | Platform-dependent size   | :file:`pybind11/pybind11.h`       |
+| ``float``, ``double``              | Floating point types      | :file:`pybind11/pybind11.h`       |
+| ``bool``                           | Two-state Boolean type    | :file:`pybind11/pybind11.h`       |
+| ``char``                           | Character literal         | :file:`pybind11/pybind11.h`       |
+| ``char16_t``                       | UTF-16 character literal  | :file:`pybind11/pybind11.h`       |
+| ``char32_t``                       | UTF-32 character literal  | :file:`pybind11/pybind11.h`       |
+| ``wchar_t``                        | Wide character literal    | :file:`pybind11/pybind11.h`       |
+| ``const char *``                   | UTF-8 string literal      | :file:`pybind11/pybind11.h`       |
+| ``const char16_t *``               | UTF-16 string literal     | :file:`pybind11/pybind11.h`       |
+| ``const char32_t *``               | UTF-32 string literal     | :file:`pybind11/pybind11.h`       |
+| ``const wchar_t *``                | Wide string literal       | :file:`pybind11/pybind11.h`       |
+| ``std::string``                    | STL dynamic UTF-8 string  | :file:`pybind11/pybind11.h`       |
+| ``std::u16string``                 | STL dynamic UTF-16 string | :file:`pybind11/pybind11.h`       |
+| ``std::u32string``                 | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h`       |
+| ``std::wstring``                   | STL dynamic wide string   | :file:`pybind11/pybind11.h`       |
+| ``std::string_view``,              | STL C++17 string views    | :file:`pybind11/pybind11.h`       |
+| ``std::u16string_view``, etc.      |                           |                                   |
+| ``std::pair<T1, T2>``              | Pair of two custom types  | :file:`pybind11/pybind11.h`       |
+| ``std::tuple<...>``                | Arbitrary tuple of types  | :file:`pybind11/pybind11.h`       |
+| ``std::reference_wrapper<...>``    | Reference type wrapper    | :file:`pybind11/pybind11.h`       |
+| ``std::complex<T>``                | Complex numbers           | :file:`pybind11/complex.h`        |
+| ``std::array<T, Size>``            | STL static array          | :file:`pybind11/stl.h`            |
+| ``std::vector<T>``                 | STL dynamic array         | :file:`pybind11/stl.h`            |
+| ``std::deque<T>``                  | STL double-ended queue    | :file:`pybind11/stl.h`            |
+| ``std::valarray<T>``               | STL value array           | :file:`pybind11/stl.h`            |
+| ``std::list<T>``                   | STL linked list           | :file:`pybind11/stl.h`            |
+| ``std::map<T1, T2>``               | STL ordered map           | :file:`pybind11/stl.h`            |
+| ``std::unordered_map<T1, T2>``     | STL unordered map         | :file:`pybind11/stl.h`            |
+| ``std::set<T>``                    | STL ordered set           | :file:`pybind11/stl.h`            |
+| ``std::unordered_set<T>``          | STL unordered set         | :file:`pybind11/stl.h`            |
+| ``std::optional<T>``               | STL optional type (C++17) | :file:`pybind11/stl.h`            |
+| ``std::experimental::optional<T>`` | STL optional type (exp.)  | :file:`pybind11/stl.h`            |
+| ``std::variant<...>``              | Type-safe union (C++17)   | :file:`pybind11/stl.h`            |
+| ``std::filesystem::path<T>``       | STL path (C++17) [#]_     | :file:`pybind11/stl/filesystem.h` |
+| ``std::function<...>``             | STL polymorphic function  | :file:`pybind11/functional.h`     |
+| ``std::chrono::duration<...>``     | STL time duration         | :file:`pybind11/chrono.h`         |
+| ``std::chrono::time_point<...>``   | STL date/time             | :file:`pybind11/chrono.h`         |
+| ``Eigen::Matrix<...>``             | Eigen: dense matrix       | :file:`pybind11/eigen.h`          |
+| ``Eigen::Map<...>``                | Eigen: mapped memory      | :file:`pybind11/eigen.h`          |
+| ``Eigen::SparseMatrix<...>``       | Eigen: sparse matrix      | :file:`pybind11/eigen.h`          |
+.. [#] ``std::filesystem::path`` is converted to ``pathlib.Path`` and
+   ``os.PathLike`` is converted to ``std::filesystem::path``, but this requires
+   Python 3.6 (for ``__fspath__`` support).
diff --git a/wrap/pybind11/docs/advanced/cast/stl.rst b/wrap/pybind11/docs/advanced/cast/stl.rst
index 7f708b81ea..b8622ee095 100644
--- a/wrap/pybind11/docs/advanced/cast/stl.rst
+++ b/wrap/pybind11/docs/advanced/cast/stl.rst
@@ -5,7 +5,7 @@ Automatic conversion
 When including the additional header file :file:`pybind11/stl.h`, conversions
-between ``std::vector<>``/``std::deque<>``/``std::list<>``/``std::array<>``,
+between ``std::vector<>``/``std::deque<>``/``std::list<>``/``std::array<>``/``std::valarray<>``,
 ``std::set<>``/``std::unordered_set<>``, and
 ``std::map<>``/``std::unordered_map<>`` and the Python ``list``, ``set`` and
 ``dict`` data structures are automatically enabled. The types ``std::pair<>``
@@ -72,6 +72,17 @@ The ``visit_helper`` specialization is not required if your ``name::variant`` pr
 a ``name::visit()`` function. For any other function name, the specialization must be
 included to tell pybind11 how to visit the variant.
+.. warning::
+    When converting a ``variant`` type, pybind11 follows the same rules as when
+    determining which function overload to call (:ref:`overload_resolution`), and
+    so the same caveats hold. In particular, the order in which the ``variant``'s
+    alternatives are listed is important, since pybind11 will try conversions in
+    this order. This means that, for example, when converting ``variant<int, bool>``,
+    the ``bool`` variant will never be selected, as any Python ``bool`` is already
+    an ``int`` and is convertible to a C++ ``int``. Changing the order of alternatives
+    (and using ``variant<bool, int>``, in this example) provides a solution.
 .. note::
     pybind11 only supports the modern implementation of ``boost::variant``
diff --git a/wrap/pybind11/docs/advanced/cast/strings.rst b/wrap/pybind11/docs/advanced/cast/strings.rst
index e25701ecab..cfd7e7b7a5 100644
--- a/wrap/pybind11/docs/advanced/cast/strings.rst
+++ b/wrap/pybind11/docs/advanced/cast/strings.rst
@@ -36,13 +36,13 @@ everywhere <http://utf8everywhere.org/>`_.
-.. code-block:: python
+.. code-block:: pycon
-    >>> utf8_test('🎂')
+    >>> utf8_test("🎂")
     utf-8 is icing on the cake.
-    >>> utf8_charptr('🍕')
+    >>> utf8_charptr("🍕")
     My favorite food is
@@ -80,7 +80,7 @@ raise a ``UnicodeDecodeError``.
-.. code-block:: python
+.. code-block:: pycon
     >>> isinstance(example.std_string_return(), str)
@@ -114,7 +114,7 @@ conversion has the same overhead as implicit conversion.
-.. code-block:: python
+.. code-block:: pycon
     >>> str_output()
     'Send your résumé to Alice in HR'
@@ -143,7 +143,7 @@ returned to Python as ``bytes``, then one can return the data as a
-.. code-block:: python
+.. code-block:: pycon
     >>> example.return_bytes()
@@ -160,7 +160,7 @@ encoding, but cannot convert ``std::string`` back to ``bytes`` implicitly.
-.. code-block:: python
+.. code-block:: pycon
     >>> isinstance(example.asymmetry(b"have some bytes"), str)
@@ -229,16 +229,16 @@ character.
     m.def("pass_char", [](char c) { return c; });
     m.def("pass_wchar", [](wchar_t w) { return w; });
-.. code-block:: python
+.. code-block:: pycon
-    >>> example.pass_char('A')
+    >>> example.pass_char("A")
 While C++ will cast integers to character types (``char c = 0x65;``), pybind11
 does not convert Python integers to characters implicitly. The Python function
 ``chr()`` can be used to convert integers to characters.
-.. code-block:: python
+.. code-block:: pycon
     >>> example.pass_char(0x65)
@@ -259,17 +259,17 @@ a combining acute accent). The combining character will be lost if the
 two-character sequence is passed as an argument, even though it renders as a
 single grapheme.
-.. code-block:: python
+.. code-block:: pycon
-    >>> example.pass_wchar('é')
+    >>> example.pass_wchar("é")
-    >>> combining_e_acute = 'e' + '\u0301'
+    >>> combining_e_acute = "e" + "\u0301"
     >>> combining_e_acute
-    >>> combining_e_acute == 'é'
+    >>> combining_e_acute == "é"
     >>> example.pass_wchar(combining_e_acute)
@@ -278,9 +278,9 @@ single grapheme.
 Normalizing combining characters before passing the character literal to C++
 may resolve *some* of these issues:
-.. code-block:: python
+.. code-block:: pycon
-    >>> example.pass_wchar(unicodedata.normalize('NFC', combining_e_acute))
+    >>> example.pass_wchar(unicodedata.normalize("NFC", combining_e_acute))
 In some languages (Thai for example), there are `graphemes that cannot be
diff --git a/wrap/pybind11/docs/advanced/classes.rst b/wrap/pybind11/docs/advanced/classes.rst
index 4927902069..f3339336dc 100644
--- a/wrap/pybind11/docs/advanced/classes.rst
+++ b/wrap/pybind11/docs/advanced/classes.rst
@@ -9,7 +9,7 @@ that you are already familiar with the basics from :doc:`/classes`.
 Overriding virtual functions in Python
-Suppose that a C++ class or interface has a virtual function that we'd like to
+Suppose that a C++ class or interface has a virtual function that we'd like
 to override from within Python (we'll focus on the class ``Animal``; ``Dog`` is
 given as a specific example of how one would do this with traditional C++
@@ -136,7 +136,7 @@ a virtual method call.
     u'woof! woof! woof! '
     >>> class Cat(Animal):
     ...     def go(self, n_times):
-    ...             return "meow! " * n_times
+    ...         return "meow! " * n_times
     >>> c = Cat()
     >>> call_go(c)
@@ -159,8 +159,9 @@ Here is an example:
     class Dachshund(Dog):
         def __init__(self, name):
-            Dog.__init__(self) # Without this, a TypeError is raised.
+            Dog.__init__(self)  # Without this, a TypeError is raised.
             self.name = name
         def bark(self):
             return "yap!"
@@ -259,7 +260,7 @@ override the ``name()`` method):
 .. note::
-    Note the trailing commas in the ``PYBIND11_OVERIDE`` calls to ``name()``
+    Note the trailing commas in the ``PYBIND11_OVERRIDE`` calls to ``name()``
     and ``bark()``. These are needed to portably implement a trampoline for a
     function that does not take any arguments. For functions that take
     a nonzero number of arguments, the trailing comma must be omitted.
@@ -804,7 +805,7 @@ to bind these two functions:
-The ``__setstate__`` part of the ``py::picke()`` definition follows the same
+The ``__setstate__`` part of the ``py::pickle()`` definition follows the same
 rules as the single-argument version of ``py::init()``. The return type can be
 a value, pointer or holder type. See :ref:`custom_constructors` for details.
@@ -1153,12 +1154,65 @@ error:
     >>> class PyFinalChild(IsFinal):
     ...     pass
+    ...
     TypeError: type 'IsFinal' is not an acceptable base type
 .. note:: This attribute is currently ignored on PyPy
 .. versionadded:: 2.6
+Binding classes with template parameters
+pybind11 can also wrap classes that have template parameters. Consider these classes:
+.. code-block:: cpp
+    struct Cat {};
+    struct Dog {};
+    template <typename PetType>
+    struct Cage {
+        Cage(PetType& pet);
+        PetType& get();
+    };
+C++ templates may only be instantiated at compile time, so pybind11 can only
+wrap instantiated templated classes. You cannot wrap a non-instantiated template:
+.. code-block:: cpp
+    // BROKEN (this will not compile)
+    py::class_<Cage>(m, "Cage");
+        .def("get", &Cage::get);
+You must explicitly specify each template/type combination that you want to
+wrap separately.
+.. code-block:: cpp
+    // ok
+    py::class_<Cage<Cat>>(m, "CatCage")
+        .def("get", &Cage<Cat>::get);
+    // ok
+    py::class_<Cage<Dog>>(m, "DogCage")
+        .def("get", &Cage<Dog>::get);
+If your class methods have template parameters you can wrap those as well,
+but once again each instantiation must be explicitly specified:
+.. code-block:: cpp
+    typename <typename T>
+    struct MyClass {
+        template <typename V>
+        T fn(V v);
+    };
+    py::class<MyClass<int>>(m, "MyClassT")
+        .def("fn", &MyClass<int>::fn<std::string>);
 Custom automatic downcasters
@@ -1247,7 +1301,7 @@ Accessing the type object
 You can get the type object from a C++ class that has already been registered using:
-.. code-block:: python
+.. code-block:: cpp
     py::type T_py = py::type::of<T>();
@@ -1259,3 +1313,37 @@ object, just like ``type(ob)`` in Python.
     Other types, like ``py::type::of<int>()``, do not work, see :ref:`type-conversions`.
 .. versionadded:: 2.6
+Custom type setup
+For advanced use cases, such as enabling garbage collection support, you may
+wish to directly manipulate the ``PyHeapTypeObject`` corresponding to a
+``py::class_`` definition.
+You can do that using ``py::custom_type_setup``:
+.. code-block:: cpp
+   struct OwnsPythonObjects {
+       py::object value = py::none();
+   };
+   py::class_<OwnsPythonObjects> cls(
+       m, "OwnsPythonObjects", py::custom_type_setup([](PyHeapTypeObject *heap_type) {
+           auto *type = &heap_type->ht_type;
+           type->tp_flags |= Py_TPFLAGS_HAVE_GC;
+           type->tp_traverse = [](PyObject *self_base, visitproc visit, void *arg) {
+               auto &self = py::cast<OwnsPythonObjects&>(py::handle(self_base));
+               Py_VISIT(self.value.ptr());
+               return 0;
+           };
+           type->tp_clear = [](PyObject *self_base) {
+               auto &self = py::cast<OwnsPythonObjects&>(py::handle(self_base));
+               self.value = py::none();
+               return 0;
+           };
+       }));
+   cls.def(py::init<>());
+   cls.def_readwrite("value", &OwnsPythonObjects::value);
+.. versionadded:: 2.8
diff --git a/wrap/pybind11/docs/advanced/embedding.rst b/wrap/pybind11/docs/advanced/embedding.rst
index 98a5c52190..dd980d483a 100644
--- a/wrap/pybind11/docs/advanced/embedding.rst
+++ b/wrap/pybind11/docs/advanced/embedding.rst
@@ -40,15 +40,15 @@ The essential structure of the ``main.cpp`` file looks like this:
 The interpreter must be initialized before using any Python API, which includes
-all the functions and classes in pybind11. The RAII guard class `scoped_interpreter`
+all the functions and classes in pybind11. The RAII guard class ``scoped_interpreter``
 takes care of the interpreter lifetime. After the guard is destroyed, the interpreter
 shuts down and clears its memory. No Python functions can be called after this.
 Executing Python code
-There are a few different ways to run Python code. One option is to use `eval`,
-`exec` or `eval_file`, as explained in :ref:`eval`. Here is a quick example in
+There are a few different ways to run Python code. One option is to use ``eval``,
+``exec`` or ``eval_file``, as explained in :ref:`eval`. Here is a quick example in
 the context of an executable with an embedded interpreter:
 .. code-block:: cpp
@@ -108,11 +108,11 @@ The two approaches can also be combined:
 Importing modules
-Python modules can be imported using `module::import()`:
+Python modules can be imported using ``module_::import()``:
 .. code-block:: cpp
-    py::module sys = py::module::import("sys");
+    py::module_ sys = py::module_::import("sys");
 For convenience, the current working directory is included in ``sys.path`` when
@@ -122,18 +122,19 @@ embedding the interpreter. This makes it easy to import local Python files:
     """calc.py located in the working directory"""
     def add(i, j):
         return i + j
 .. code-block:: cpp
-    py::module calc = py::module::import("calc");
+    py::module_ calc = py::module_::import("calc");
     py::object result = calc.attr("add")(1, 2);
     int n = result.cast<int>();
     assert(n == 3);
-Modules can be reloaded using `module::reload()` if the source is modified e.g.
+Modules can be reloaded using ``module_::reload()`` if the source is modified e.g.
 by an external process. This can be useful in scenarios where the application
 imports a user defined data processing script which needs to be updated after
 changes by the user. Note that this function does not reload modules recursively.
@@ -143,7 +144,7 @@ changes by the user. Note that this function does not reload modules recursively
 Adding embedded modules
-Embedded binary modules can be added using the `PYBIND11_EMBEDDED_MODULE` macro.
+Embedded binary modules can be added using the ``PYBIND11_EMBEDDED_MODULE`` macro.
 Note that the definition must be placed at global scope. They can be imported
 like any other module.
@@ -153,7 +154,7 @@ like any other module.
     namespace py = pybind11;
     PYBIND11_EMBEDDED_MODULE(fast_calc, m) {
-        // `m` is a `py::module` which is used to bind functions and classes
+        // `m` is a `py::module_` which is used to bind functions and classes
         m.def("add", [](int i, int j) {
             return i + j;
@@ -162,14 +163,14 @@ like any other module.
     int main() {
         py::scoped_interpreter guard{};
-        auto fast_calc = py::module::import("fast_calc");
+        auto fast_calc = py::module_::import("fast_calc");
         auto result = fast_calc.attr("add")(1, 2).cast<int>();
         assert(result == 3);
 Unlike extension modules where only a single binary module can be created, on
 the embedded side an unlimited number of modules can be added using multiple
-`PYBIND11_EMBEDDED_MODULE` definitions (as long as they have unique names).
+``PYBIND11_EMBEDDED_MODULE`` definitions (as long as they have unique names).
 These modules are added to Python's list of builtins, so they can also be
 imported in pure Python files loaded by the interpreter. Everything interacts
@@ -196,7 +197,7 @@ naturally:
     int main() {
         py::scoped_interpreter guard{};
-        auto py_module = py::module::import("py_module");
+        auto py_module = py::module_::import("py_module");
         auto locals = py::dict("fmt"_a="{} + {} = {}", **py_module.attr("__dict__"));
         assert(locals["a"].cast<int>() == 1);
@@ -215,9 +216,9 @@ naturally:
 Interpreter lifetime
-The Python interpreter shuts down when `scoped_interpreter` is destroyed. After
+The Python interpreter shuts down when ``scoped_interpreter`` is destroyed. After
 this, creating a new instance will restart the interpreter. Alternatively, the
-`initialize_interpreter` / `finalize_interpreter` pair of functions can be used
+``initialize_interpreter`` / ``finalize_interpreter`` pair of functions can be used
 to directly set the state at any time.
 Modules created with pybind11 can be safely re-initialized after the interpreter
@@ -229,8 +230,8 @@ global data. All the details can be found in the CPython documentation.
 .. warning::
-    Creating two concurrent `scoped_interpreter` guards is a fatal error. So is
-    calling `initialize_interpreter` for a second time after the interpreter
+    Creating two concurrent ``scoped_interpreter`` guards is a fatal error. So is
+    calling ``initialize_interpreter`` for a second time after the interpreter
     has already been initialized.
     Do not use the raw CPython API functions ``Py_Initialize`` and
@@ -241,7 +242,7 @@ global data. All the details can be found in the CPython documentation.
 Sub-interpreter support
-Creating multiple copies of `scoped_interpreter` is not possible because it
+Creating multiple copies of ``scoped_interpreter`` is not possible because it
 represents the main Python interpreter. Sub-interpreters are something different
 and they do permit the existence of multiple interpreters. This is an advanced
 feature of the CPython API and should be handled with care. pybind11 does not
@@ -257,5 +258,5 @@ We'll just mention a couple of caveats the sub-interpreters support in pybind11:
  2. Managing multiple threads, multiple interpreters and the GIL can be
     challenging and there are several caveats here, even within the pure
     CPython API (please refer to the Python docs for details). As for
-    pybind11, keep in mind that `gil_scoped_release` and `gil_scoped_acquire`
+    pybind11, keep in mind that ``gil_scoped_release`` and ``gil_scoped_acquire``
     do not take sub-interpreters into account.
diff --git a/wrap/pybind11/docs/advanced/exceptions.rst b/wrap/pybind11/docs/advanced/exceptions.rst
index a96f8e8f4d..7cd8447b93 100644
--- a/wrap/pybind11/docs/advanced/exceptions.rst
+++ b/wrap/pybind11/docs/advanced/exceptions.rst
@@ -43,18 +43,28 @@ at its exception handler.
 |                                      | of bounds access in ``__getitem__``, |
 |                                      | ``__setitem__``, etc.)               |
-| :class:`pybind11::value_error`       | ``ValueError`` (used to indicate     |
-|                                      | wrong value passed in                |
-|                                      | ``container.remove(...)``)           |
 | :class:`pybind11::key_error`         | ``KeyError`` (used to indicate out   |
 |                                      | of bounds access in ``__getitem__``, |
 |                                      | ``__setitem__`` in dict-like         |
 |                                      | objects, etc.)                       |
+| :class:`pybind11::value_error`       | ``ValueError`` (used to indicate     |
+|                                      | wrong value passed in                |
+|                                      | ``container.remove(...)``)           |
+| :class:`pybind11::type_error`        | ``TypeError``                        |
+| :class:`pybind11::buffer_error`      | ``BufferError``                      |
+| :class:`pybind11::import_error`      | ``ImportError``                      |
+| :class:`pybind11::attribute_error`   | ``AttributeError``                   |
+| Any other exception                  | ``RuntimeError``                     |
 Exception translation is not bidirectional. That is, *catching* the C++
-exceptions defined above above will not trap exceptions that originate from
+exceptions defined above will not trap exceptions that originate from
 Python. For that, catch :class:`pybind11::error_already_set`. See :ref:`below
 <handling_python_exceptions_cpp>` for further details.
@@ -67,9 +77,10 @@ Registering custom translators
 If the default exception conversion policy described above is insufficient,
 pybind11 also provides support for registering custom exception translators.
-To register a simple exception conversion that translates a C++ exception into
-a new Python exception using the C++ exception's ``what()`` method, a helper
-function is available:
+Similar to pybind11 classes, exception translators can be local to the module
+they are defined in or global to the entire python session.  To register a simple
+exception conversion that translates a C++ exception into a new Python exception
+using the C++ exception's ``what()`` method, a helper function is available:
 .. code-block:: cpp
@@ -79,29 +90,39 @@ This call creates a Python exception class with the name ``PyExp`` in the given
 module and automatically converts any encountered exceptions of type ``CppExp``
 into Python exceptions of type ``PyExp``.
+A matching function is available for registering a local exception translator:
+.. code-block:: cpp
+    py::register_local_exception<CppExp>(module, "PyExp");
 It is possible to specify base class for the exception using the third
-parameter, a `handle`:
+parameter, a ``handle``:
 .. code-block:: cpp
     py::register_exception<CppExp>(module, "PyExp", PyExc_RuntimeError);
+    py::register_local_exception<CppExp>(module, "PyExp", PyExc_RuntimeError);
-Then `PyExp` can be caught both as `PyExp` and `RuntimeError`.
+Then ``PyExp`` can be caught both as ``PyExp`` and ``RuntimeError``.
 The class objects of the built-in Python exceptions are listed in the Python
 documentation on `Standard Exceptions <https://docs.python.org/3/c-api/exceptions.html#standard-exceptions>`_.
-The default base class is `PyExc_Exception`.
+The default base class is ``PyExc_Exception``.
-When more advanced exception translation is needed, the function
-``py::register_exception_translator(translator)`` can be used to register
+When more advanced exception translation is needed, the functions
+``py::register_exception_translator(translator)`` and
+``py::register_local_exception_translator(translator)`` can be used to register
 functions that can translate arbitrary exception types (and which may include
-additional logic to do so).  The function takes a stateless callable (e.g.  a
+additional logic to do so).  The functions takes a stateless callable (e.g. a
 function pointer or a lambda function without captured variables) with the call
 signature ``void(std::exception_ptr)``.
 When a C++ exception is thrown, the registered exception translators are tried
 in reverse order of registration (i.e. the last registered translator gets the
-first shot at handling the exception).
+first shot at handling the exception). All local translators will be tried
+before a global translator is tried.
 Inside the translator, ``std::rethrow_exception`` should be used within
 a try block to re-throw the exception.  One or more catch clauses to catch
@@ -156,6 +177,57 @@ section.
     may be explicitly (re-)thrown to delegate it to the other,
     previously-declared existing exception translators.
+    Note that ``libc++`` and ``libstdc++`` `behave differently <https://stackoverflow.com/questions/19496643/using-clang-fvisibility-hidden-and-typeinfo-and-type-erasure/28827430>`_
+    with ``-fvisibility=hidden``. Therefore exceptions that are used across ABI boundaries need to be explicitly exported, as exercised in ``tests/test_exceptions.h``.
+    See also: "Problems with C++ exceptions" under `GCC Wiki <https://gcc.gnu.org/wiki/Visibility>`_.
+Local vs Global Exception Translators
+When a global exception translator is registered, it will be applied across all
+modules in the reverse order of registration. This can create behavior where the
+order of module import influences how exceptions are translated.
+If module1 has the following translator:
+.. code-block:: cpp
+      py::register_exception_translator([](std::exception_ptr p) {
+        try {
+            if (p) std::rethrow_exception(p);
+        } catch (const std::invalid_argument &e) {
+            PyErr_SetString("module1 handled this")
+        }
+      }
+and module2 has the following similar translator:
+.. code-block:: cpp
+      py::register_exception_translator([](std::exception_ptr p) {
+        try {
+            if (p) std::rethrow_exception(p);
+        } catch (const std::invalid_argument &e) {
+            PyErr_SetString("module2 handled this")
+        }
+      }
+then which translator handles the invalid_argument will be determined by the
+order that module1 and module2 are imported. Since exception translators are
+applied in the reverse order of registration, which ever module was imported
+last will "win" and that translator will be applied.
+If there are multiple pybind11 modules that share exception types (either
+standard built-in or custom) loaded into a single python instance and
+consistent error handling behavior is needed, then local translators should be
+Changing the previous example to use ``register_local_exception_translator``
+would mean that when invalid_argument is thrown in the module2 code, the
+module2 translator will always handle it, while in module1, the module1
+translator will do the same.
 .. _handling_python_exceptions_cpp:
 Handling exceptions from Python in C++
@@ -182,13 +254,13 @@ For example:
     try {
         // open("missing.txt", "r")
-        auto file = py::module::import("io").attr("open")("missing.txt", "r");
+        auto file = py::module_::import("io").attr("open")("missing.txt", "r");
         auto text = file.attr("read")();
     } catch (py::error_already_set &e) {
         if (e.matches(PyExc_FileNotFoundError)) {
             py::print("missing.txt not found");
-        } else if (e.match(PyExc_PermissionError)) {
+        } else if (e.matches(PyExc_PermissionError)) {
             py::print("missing.txt found but not accessible");
         } else {
@@ -253,6 +325,34 @@ Alternately, to ignore the error, call `PyErr_Clear
 Any Python error must be thrown or cleared, or Python/pybind11 will be left in
 an invalid state.
+Chaining exceptions ('raise from')
+In Python 3.3 a mechanism for indicating that exceptions were caused by other
+exceptions was introduced:
+.. code-block:: py
+    try:
+        print(1 / 0)
+    except Exception as exc:
+        raise RuntimeError("could not divide by zero") from exc
+To do a similar thing in pybind11, you can use the ``py::raise_from`` function. It
+sets the current python error indicator, so to continue propagating the exception
+you should ``throw py::error_already_set()`` (Python 3 only).
+.. code-block:: cpp
+    try {
+        py::eval("print(1 / 0"));
+    } catch (py::error_already_set &e) {
+        py::raise_from(e, PyExc_RuntimeError, "could not divide by zero");
+        throw py::error_already_set();
+    }
+.. versionadded:: 2.8
 .. _unraisable_exceptions:
 Handling unraisable exceptions
diff --git a/wrap/pybind11/docs/advanced/functions.rst b/wrap/pybind11/docs/advanced/functions.rst
index c895517c50..bf5b5fa00d 100644
--- a/wrap/pybind11/docs/advanced/functions.rst
+++ b/wrap/pybind11/docs/advanced/functions.rst
@@ -17,7 +17,7 @@ bindings for functions that return a non-trivial type. Just by looking at the
 type information, it is not clear whether Python should take charge of the
 returned value and eventually free its resources, or if this is handled on the
 C++ side. For this reason, pybind11 provides a several *return value policy*
-annotations that can be passed to the :func:`module::def` and
+annotations that can be passed to the :func:`module_::def` and
 :func:`class_::def` functions. The default policy is
@@ -50,7 +50,7 @@ implied transfer of ownership, i.e.:
 .. code-block:: cpp
-    m.def("get_data", &get_data, return_value_policy::reference);
+    m.def("get_data", &get_data, py::return_value_policy::reference);
 On the other hand, this is not the right policy for many other situations,
 where ignoring ownership could lead to resource leaks.
@@ -90,17 +90,18 @@ The following table provides an overview of available policies:
 |                                                  | return value is referenced by Python. This is the default policy for       |
 |                                                  | property getters created via ``def_property``, ``def_readwrite``, etc.     |
-| :enum:`return_value_policy::automatic`           | **Default policy.** This policy falls back to the policy                   |
+| :enum:`return_value_policy::automatic`           | This policy falls back to the policy                                       |
 |                                                  | :enum:`return_value_policy::take_ownership` when the return value is a     |
 |                                                  | pointer. Otherwise, it uses :enum:`return_value_policy::move` or           |
 |                                                  | :enum:`return_value_policy::copy` for rvalue and lvalue references,        |
 |                                                  | respectively. See above for a description of what all of these different   |
-|                                                  | policies do.                                                               |
+|                                                  | policies do. This is the default policy for ``py::class_``-wrapped types.  |
 | :enum:`return_value_policy::automatic_reference` | As above, but use policy :enum:`return_value_policy::reference` when the   |
 |                                                  | return value is a pointer. This is the default conversion policy for       |
 |                                                  | function arguments when calling Python functions manually from C++ code    |
-|                                                  | (i.e. via handle::operator()). You probably won't need to use this.        |
+|                                                  | (i.e. via ``handle::operator()``) and the casters in ``pybind11/stl.h``.   |
+|                                                  | You probably won't need to use this explicitly.                            |
 Return value policies can also be applied to properties:
@@ -119,7 +120,7 @@ targeted arguments can be passed through the :class:`cpp_function` constructor:
 .. code-block:: cpp
     class_<MyClass>(m, "MyClass")
-        .def_property("data"
+        .def_property("data",
             py::cpp_function(&MyClass::getData, py::return_value_policy::copy),
@@ -182,6 +183,9 @@ relies on the ability to create a *weak reference* to the nurse object. When
 the nurse object is not a pybind11-registered type and does not support weak
 references, an exception will be thrown.
+If you use an incorrect argument index, you will get a ``RuntimeError`` saying
+``Could not activate keep_alive!``. You should review the indices you're using.
 Consider the following example: here, the binding code for a list append
 operation ties the lifetime of the newly added element to the underlying
@@ -228,7 +232,7 @@ is equivalent to the following pseudocode:
 The only requirement is that ``T`` is default-constructible, but otherwise any
-scope guard will work. This is very useful in combination with `gil_scoped_release`.
+scope guard will work. This is very useful in combination with ``gil_scoped_release``.
 See :ref:`gil`.
 Multiple guards can also be specified as ``py::call_guard<T1, T2, T3...>``. The
@@ -251,7 +255,7 @@ For instance, the following statement iterates over a Python ``dict``:
 .. code-block:: cpp
-    void print_dict(py::dict dict) {
+    void print_dict(const py::dict& dict) {
         /* Easily interact with Python types */
         for (auto item : dict)
             std::cout << "key=" << std::string(py::str(item.first)) << ", "
@@ -268,7 +272,7 @@ And used in Python as usual:
 .. code-block:: pycon
-    >>> print_dict({'foo': 123, 'bar': 'hello'})
+    >>> print_dict({"foo": 123, "bar": "hello"})
     key=foo, value=123
     key=bar, value=hello
@@ -289,7 +293,7 @@ Such functions can also be created using pybind11:
 .. code-block:: cpp
-   void generic(py::args args, py::kwargs kwargs) {
+   void generic(py::args args, const py::kwargs& kwargs) {
        /// .. do something with args
        if (kwargs)
            /// .. do something with kwargs
@@ -302,8 +306,9 @@ The class ``py::args`` derives from ``py::tuple`` and ``py::kwargs`` derives
 from ``py::dict``.
 You may also use just one or the other, and may combine these with other
-arguments as long as the ``py::args`` and ``py::kwargs`` arguments are the last
-arguments accepted by the function.
+arguments.  Note, however, that ``py::kwargs`` must always be the last argument
+of the function, and ``py::args`` implies that any further arguments are
+keyword-only (see :ref:`keyword_only_arguments`).
 Please refer to the other examples for details on how to iterate over these,
 and on how to cast their entries into C++ objects. A demonstration is also
@@ -362,6 +367,8 @@ like so:
         .def("myFunction", py::arg("arg") = static_cast<SomeType *>(nullptr));
+.. _keyword_only_arguments:
 Keyword-only arguments
@@ -373,10 +380,11 @@ argument in a function definition:
     def f(a, *, b):  # a can be positional or via keyword; b must be via keyword
     f(a=1, b=2)  # good
     f(b=2, a=1)  # good
-    f(1, b=2)    # good
-    f(1, 2)      # TypeError: f() takes 1 positional argument but 2 were given
+    f(1, b=2)  # good
+    f(1, 2)  # TypeError: f() takes 1 positional argument but 2 were given
 Pybind11 provides a ``py::kw_only`` object that allows you to implement
 the same behaviour by specifying the object between positional and keyword-only
@@ -392,6 +400,15 @@ feature does *not* require Python 3 to work.
 .. versionadded:: 2.6
+As of pybind11 2.9, a ``py::args`` argument implies that any following arguments
+are keyword-only, as if ``py::kw_only()`` had been specified in the same
+relative location of the argument list as the ``py::args`` argument.  The
+``py::kw_only()`` may be included to be explicit about this, but is not
+required.  (Prior to 2.9 ``py::args`` may only occur at the end of the argument
+list, or immediately before a ``py::kwargs`` argument at the end).
+.. versionadded:: 2.9
 Positional-only arguments
@@ -524,6 +541,8 @@ The default behaviour when the tag is unspecified is to allow ``None``.
     not allow ``None`` as argument.  To pass optional argument of these copied types consider
     using ``std::optional<T>``
+.. _overload_resolution:
 Overload resolution order
@@ -540,11 +559,13 @@ an explicit ``py::arg().noconvert()`` attribute in the function definition).
 If the second pass also fails a ``TypeError`` is raised.
 Within each pass, overloads are tried in the order they were registered with
+pybind11. If the ``py::prepend()`` tag is added to the definition, a function
+can be placed at the beginning of the overload sequence instead, allowing user
+overloads to proceed built in functions.
 What this means in practice is that pybind11 will prefer any overload that does
-not require conversion of arguments to an overload that does, but otherwise prefers
-earlier-defined overloads to later-defined ones.
+not require conversion of arguments to an overload that does, but otherwise
+prefers earlier-defined overloads to later-defined ones.
 .. note::
@@ -553,3 +574,42 @@ earlier-defined overloads to later-defined ones.
     requiring one conversion over one requiring three, but only prioritizes
     overloads requiring no conversion at all to overloads that require
     conversion of at least one argument.
+.. versionadded:: 2.6
+    The ``py::prepend()`` tag.
+Binding functions with template parameters
+You can bind functions that have template parameters. Here's a function:
+.. code-block:: cpp
+    template <typename T>
+    void set(T t);
+C++ templates cannot be instantiated at runtime, so you cannot bind the
+non-instantiated function:
+.. code-block:: cpp
+    // BROKEN (this will not compile)
+    m.def("set", &set);
+You must bind each instantiated function template separately. You may bind
+each instantiation with the same name, which will be treated the same as
+an overloaded function:
+.. code-block:: cpp
+    m.def("set", &set<int>);
+    m.def("set", &set<std::string>);
+Sometimes it's more clear to bind them with separate names, which is also
+an option:
+.. code-block:: cpp
+    m.def("setInt", &set<int>);
+    m.def("setString", &set<std::string>);
diff --git a/wrap/pybind11/docs/advanced/misc.rst b/wrap/pybind11/docs/advanced/misc.rst
index a5899c67a4..edab15fcb7 100644
--- a/wrap/pybind11/docs/advanced/misc.rst
+++ b/wrap/pybind11/docs/advanced/misc.rst
@@ -84,7 +84,7 @@ could be realized as follows (important changes highlighted):
-The ``call_go`` wrapper can also be simplified using the `call_guard` policy
+The ``call_go`` wrapper can also be simplified using the ``call_guard`` policy
 (see :ref:`call_policies`) which yields the same result:
 .. code-block:: cpp
@@ -132,7 +132,7 @@ However, it can be acquired as follows:
 .. code-block:: cpp
-    py::object pet = (py::object) py::module::import("basic").attr("Pet");
+    py::object pet = (py::object) py::module_::import("basic").attr("Pet");
     py::class_<Dog>(m, "Dog", pet)
         .def(py::init<const std::string &>())
@@ -146,7 +146,7 @@ has been executed:
 .. code-block:: cpp
-    py::module::import("basic");
+    py::module_::import("basic");
     py::class_<Dog, Pet>(m, "Dog")
         .def(py::init<const std::string &>())
@@ -223,7 +223,7 @@ avoids this issue involves weak reference with a cleanup callback:
 .. code-block:: cpp
-    // Register a callback function that is invoked when the BaseClass object is colelcted
+    // Register a callback function that is invoked when the BaseClass object is collected
     py::cpp_function cleanup_callback(
         [](py::handle weakref) {
             // perform cleanup here -- this function is called with the GIL held
@@ -237,13 +237,13 @@ avoids this issue involves weak reference with a cleanup callback:
 .. note::
-    PyPy (at least version 5.9) does not garbage collect objects when the
-    interpreter exits. An alternative approach (which also works on CPython) is to use
-    the :py:mod:`atexit` module [#f7]_, for example:
+    PyPy does not garbage collect objects when the interpreter exits. An alternative
+    approach (which also works on CPython) is to use the :py:mod:`atexit` module [#f7]_,
+    for example:
     .. code-block:: cpp
-        auto atexit = py::module::import("atexit");
+        auto atexit = py::module_::import("atexit");
         atexit.attr("register")(py::cpp_function([]() {
             // perform cleanup here -- this function is called with the GIL held
@@ -284,7 +284,7 @@ work, it is important that all lines are indented consistently, i.e.:
 By default, pybind11 automatically generates and prepends a signature to the docstring of a function
-registered with ``module::def()`` and ``class_::def()``. Sometimes this
+registered with ``module_::def()`` and ``class_::def()``. Sometimes this
 behavior is not desirable, because you want to provide your own signature or remove
 the docstring completely to exclude the function from the Sphinx documentation.
 The class ``options`` allows you to selectively suppress auto-generated signatures:
diff --git a/wrap/pybind11/docs/advanced/pycpp/numpy.rst b/wrap/pybind11/docs/advanced/pycpp/numpy.rst
index e50d24a991..30daeefff9 100644
--- a/wrap/pybind11/docs/advanced/pycpp/numpy.rst
+++ b/wrap/pybind11/docs/advanced/pycpp/numpy.rst
@@ -57,11 +57,11 @@ specification.
     struct buffer_info {
         void *ptr;
-        ssize_t itemsize;
+        py::ssize_t itemsize;
         std::string format;
-        ssize_t ndim;
-        std::vector<ssize_t> shape;
-        std::vector<ssize_t> strides;
+        py::ssize_t ndim;
+        std::vector<py::ssize_t> shape;
+        std::vector<py::ssize_t> strides;
 To create a C++ function that can take a Python buffer object as an argument,
@@ -150,8 +150,10 @@ NumPy array containing double precision values.
 When it is invoked with a different type (e.g. an integer or a list of
 integers), the binding code will attempt to cast the input into a NumPy array
-of the requested type. Note that this feature requires the
-:file:`pybind11/numpy.h` header to be included.
+of the requested type. This feature requires the :file:`pybind11/numpy.h`
+header to be included. Note that :file:`pybind11/numpy.h` does not depend on
+the NumPy headers, and thus can be used without declaring a build-time
+dependency on NumPy; NumPy>=1.7.0 is a runtime dependency.
 Data in NumPy arrays is not guaranteed to packed in a dense manner;
 furthermore, entries can be separated by arbitrary column and row strides.
@@ -169,6 +171,31 @@ template parameter, and it ensures that non-conforming arguments are converted
 into an array satisfying the specified requirements instead of trying the next
 function overload.
+There are several methods on arrays; the methods listed below under references
+work, as well as the following functions based on the NumPy API:
+- ``.dtype()`` returns the type of the contained values.
+- ``.strides()`` returns a pointer to the strides of the array (optionally pass
+  an integer axis to get a number).
+- ``.flags()`` returns the flag settings. ``.writable()`` and ``.owndata()``
+  are directly available.
+- ``.offset_at()`` returns the offset (optionally pass indices).
+- ``.squeeze()`` returns a view with length-1 axes removed.
+- ``.view(dtype)`` returns a view of the array with a different dtype.
+- ``.reshape({i, j, ...})`` returns a view of the array with a different shape.
+  ``.resize({...})`` is also available.
+- ``.index_at(i, j, ...)`` gets the count from the beginning to a given index.
+There are also several methods for getting references (described below).
 Structured types
@@ -231,8 +258,8 @@ by the compiler. The result is returned as a NumPy array of type
 .. code-block:: pycon
-    >>> x = np.array([[1, 3],[5, 7]])
-    >>> y = np.array([[2, 4],[6, 8]])
+    >>> x = np.array([[1, 3], [5, 7]])
+    >>> y = np.array([[2, 4], [6, 8]])
     >>> z = 3
     >>> result = vectorized_func(x, y, z)
@@ -309,17 +336,17 @@ where ``N`` gives the required dimensionality of the array:
     m.def("sum_3d", [](py::array_t<double> x) {
         auto r = x.unchecked<3>(); // x must have ndim = 3; can be non-writeable
         double sum = 0;
-        for (ssize_t i = 0; i < r.shape(0); i++)
-            for (ssize_t j = 0; j < r.shape(1); j++)
-                for (ssize_t k = 0; k < r.shape(2); k++)
+        for (py::ssize_t i = 0; i < r.shape(0); i++)
+            for (py::ssize_t j = 0; j < r.shape(1); j++)
+                for (py::ssize_t k = 0; k < r.shape(2); k++)
                     sum += r(i, j, k);
         return sum;
     m.def("increment_3d", [](py::array_t<double> x) {
         auto r = x.mutable_unchecked<3>(); // Will throw if ndim != 3 or flags.writeable is false
-        for (ssize_t i = 0; i < r.shape(0); i++)
-            for (ssize_t j = 0; j < r.shape(1); j++)
-                for (ssize_t k = 0; k < r.shape(2); k++)
+        for (py::ssize_t i = 0; i < r.shape(0); i++)
+            for (py::ssize_t j = 0; j < r.shape(1); j++)
+                for (py::ssize_t k = 0; k < r.shape(2); k++)
                     r(i, j, k) += 1.0;
     }, py::arg().noconvert());
@@ -343,21 +370,21 @@ The returned proxy object supports some of the same methods as ``py::array`` so
 that it can be used as a drop-in replacement for some existing, index-checked
 uses of ``py::array``:
-- ``r.ndim()`` returns the number of dimensions
+- ``.ndim()`` returns the number of dimensions
-- ``r.data(1, 2, ...)`` and ``r.mutable_data(1, 2, ...)``` returns a pointer to
+- ``.data(1, 2, ...)`` and ``r.mutable_data(1, 2, ...)``` returns a pointer to
   the ``const T`` or ``T`` data, respectively, at the given indices.  The
   latter is only available to proxies obtained via ``a.mutable_unchecked()``.
-- ``itemsize()`` returns the size of an item in bytes, i.e. ``sizeof(T)``.
+- ``.itemsize()`` returns the size of an item in bytes, i.e. ``sizeof(T)``.
-- ``ndim()`` returns the number of dimensions.
+- ``.ndim()`` returns the number of dimensions.
-- ``shape(n)`` returns the size of dimension ``n``
+- ``.shape(n)`` returns the size of dimension ``n``
-- ``size()`` returns the total number of elements (i.e. the product of the shapes).
+- ``.size()`` returns the total number of elements (i.e. the product of the shapes).
-- ``nbytes()`` returns the number of bytes used by the referenced elements
+- ``.nbytes()`` returns the number of bytes used by the referenced elements
   (i.e. ``itemsize()`` times ``size()``).
 .. seealso::
@@ -376,7 +403,7 @@ In Python 2, the syntactic sugar ``...`` is not available, but the singleton
 .. code-block:: python
-   a = # a NumPy array
+   a = ...  # a NumPy array
    b = a[0, ..., 0]
 The function ``py::ellipsis()`` function can be used to perform the same
@@ -388,7 +415,7 @@ operation on the C++ side:
    py::array b = a[py::make_tuple(0, py::ellipsis(), 0)];
 .. versionchanged:: 2.6
-   ``py::ellipsis()`` is now also avaliable in Python 2.
+   ``py::ellipsis()`` is now also available in Python 2.
 Memory view
diff --git a/wrap/pybind11/docs/advanced/pycpp/object.rst b/wrap/pybind11/docs/advanced/pycpp/object.rst
index 70e493acd9..93e1a94d8f 100644
--- a/wrap/pybind11/docs/advanced/pycpp/object.rst
+++ b/wrap/pybind11/docs/advanced/pycpp/object.rst
@@ -20,6 +20,40 @@ Available types include :class:`handle`, :class:`object`, :class:`bool_`,
     Be sure to review the :ref:`pytypes_gotchas` before using this heavily in
     your C++ API.
+.. _instantiating_compound_types:
+Instantiating compound Python types from C++
+Dictionaries can be initialized in the :class:`dict` constructor:
+.. code-block:: cpp
+    using namespace pybind11::literals; // to bring in the `_a` literal
+    py::dict d("spam"_a=py::none(), "eggs"_a=42);
+A tuple of python objects can be instantiated using :func:`py::make_tuple`:
+.. code-block:: cpp
+    py::tuple tup = py::make_tuple(42, py::none(), "spam");
+Each element is converted to a supported Python type.
+A `simple namespace`_ can be instantiated using
+.. code-block:: cpp
+    using namespace pybind11::literals;  // to bring in the `_a` literal
+    py::object SimpleNamespace = py::module_::import("types").attr("SimpleNamespace");
+    py::object ns = SimpleNamespace("spam"_a=py::none(), "eggs"_a=42);
+Attributes on a namespace can be modified with the :func:`py::delattr`,
+:func:`py::getattr`, and :func:`py::setattr` functions. Simple namespaces can
+be useful as lightweight stand-ins for class instances.
+.. _simple namespace: https://docs.python.org/3/library/types.html#types.SimpleNamespace
 .. _casting_back_and_forth:
 Casting back and forth
@@ -30,7 +64,7 @@ types to Python, which can be done using :func:`py::cast`:
 .. code-block:: cpp
-    MyClass *cls = ..;
+    MyClass *cls = ...;
     py::object obj = py::cast(cls);
 The reverse direction uses the following syntax:
@@ -56,12 +90,12 @@ This example obtains a reference to the Python ``Decimal`` class.
 .. code-block:: cpp
     // Equivalent to "from decimal import Decimal"
-    py::object Decimal = py::module::import("decimal").attr("Decimal");
+    py::object Decimal = py::module_::import("decimal").attr("Decimal");
 .. code-block:: cpp
     // Try to import scipy
-    py::object scipy = py::module::import("scipy");
+    py::object scipy = py::module_::import("scipy");
     return scipy.attr("__version__");
@@ -81,7 +115,7 @@ via ``operator()``.
 .. code-block:: cpp
     // Use Python to make our directories
-    py::object os = py::module::import("os");
+    py::object os = py::module_::import("os");
     py::object makedirs = os.attr("makedirs");
@@ -132,6 +166,7 @@ Keyword arguments are also supported. In Python, there is the usual call syntax:
     def f(number, say, to):
         ...  # function code
     f(1234, say="hello", to=some_instance)  # keyword call in Python
 In C++, the same call can be made using:
@@ -196,9 +231,9 @@ C++ functions that require a specific subtype rather than a generic :class:`obje
     #include <pybind11/numpy.h>
     using namespace pybind11::literals;
-    py::module os = py::module::import("os");
-    py::module path = py::module::import("os.path");  // like 'import os.path as path'
-    py::module np = py::module::import("numpy");  // like 'import numpy as np'
+    py::module_ os = py::module_::import("os");
+    py::module_ path = py::module_::import("os.path");  // like 'import os.path as path'
+    py::module_ np = py::module_::import("numpy");  // like 'import numpy as np'
     py::str curdir_abs = path.attr("abspath")(path.attr("curdir"));
     py::print(py::str("Current directory: ") + curdir_abs);
diff --git a/wrap/pybind11/docs/advanced/pycpp/utilities.rst b/wrap/pybind11/docs/advanced/pycpp/utilities.rst
index 369e7c94db..af0f9cb2b0 100644
--- a/wrap/pybind11/docs/advanced/pycpp/utilities.rst
+++ b/wrap/pybind11/docs/advanced/pycpp/utilities.rst
@@ -28,7 +28,7 @@ Capturing standard output from ostream
 Often, a library will use the streams ``std::cout`` and ``std::cerr`` to print,
 but this does not play well with Python's standard ``sys.stdout`` and ``sys.stderr``
-redirection. Replacing a library's printing with `py::print <print>` may not
+redirection. Replacing a library's printing with ``py::print <print>`` may not
 be feasible. This can be fixed using a guard around the library function that
 redirects output to the corresponding Python streams:
@@ -42,20 +42,31 @@ redirects output to the corresponding Python streams:
     m.def("noisy_func", []() {
         py::scoped_ostream_redirect stream(
             std::cout,                               // std::ostream&
-            py::module::import("sys").attr("stdout") // Python output
+            py::module_::import("sys").attr("stdout") // Python output
+.. warning::
+    The implementation in ``pybind11/iostream.h`` is NOT thread safe. Multiple
+    threads writing to a redirected ostream concurrently cause data races
+    and potentially buffer overflows. Therefore it is currently a requirement
+    that all (possibly) concurrent redirected ostream writes are protected by
+    a mutex. #HelpAppreciated: Work on iostream.h thread safety. For more
+    background see the discussions under
+    `PR #2982 <https://github.com/pybind/pybind11/pull/2982>`_ and
+    `PR #2995 <https://github.com/pybind/pybind11/pull/2995>`_.
 This method respects flushes on the output streams and will flush if needed
 when the scoped guard is destroyed. This allows the output to be redirected in
 real time, such as to a Jupyter notebook. The two arguments, the C++ stream and
 the Python output, are optional, and default to standard output if not given. An
-extra type, `py::scoped_estream_redirect <scoped_estream_redirect>`, is identical
+extra type, ``py::scoped_estream_redirect <scoped_estream_redirect>``, is identical
 except for defaulting to ``std::cerr`` and ``sys.stderr``; this can be useful with
-`py::call_guard`, which allows multiple items, but uses the default constructor:
+``py::call_guard``, which allows multiple items, but uses the default constructor:
-.. code-block:: py
+.. code-block:: cpp
     // Alternative: Call single function using call guard
     m.def("noisy_func", &call_noisy_function,
@@ -63,7 +74,7 @@ except for defaulting to ``std::cerr`` and ``sys.stderr``; this can be useful wi
 The redirection can also be done in Python with the addition of a context
-manager, using the `py::add_ostream_redirect() <add_ostream_redirect>` function:
+manager, using the ``py::add_ostream_redirect() <add_ostream_redirect>`` function:
 .. code-block:: cpp
@@ -92,7 +103,7 @@ arguments to disable one of the streams if needed.
 Evaluating Python expressions from strings and files
-pybind11 provides the `eval`, `exec` and `eval_file` functions to evaluate
+pybind11 provides the ``eval``, ``exec`` and ``eval_file`` functions to evaluate
 Python expressions and statements. The following example illustrates how they
 can be used.
@@ -104,7 +115,7 @@ can be used.
     // Evaluate in scope of main module
-    py::object scope = py::module::import("__main__").attr("__dict__");
+    py::object scope = py::module_::import("__main__").attr("__dict__");
     // Evaluate an isolated expression
     int result = py::eval("my_variable + 10", scope).cast<int>();
diff --git a/wrap/pybind11/docs/advanced/smart_ptrs.rst b/wrap/pybind11/docs/advanced/smart_ptrs.rst
index da57748ca5..5a22201095 100644
--- a/wrap/pybind11/docs/advanced/smart_ptrs.rst
+++ b/wrap/pybind11/docs/advanced/smart_ptrs.rst
@@ -77,6 +77,7 @@ segmentation fault).
 .. code-block:: python
    from example import Parent
 The problem is that ``Parent::get_child()`` returns a pointer to an instance of
diff --git a/wrap/pybind11/docs/basics.rst b/wrap/pybind11/docs/basics.rst
index 71440c9c66..e0479b298d 100644
--- a/wrap/pybind11/docs/basics.rst
+++ b/wrap/pybind11/docs/basics.rst
@@ -39,7 +39,7 @@ on various C++11 language features that break older versions of Visual Studio.
     To use the C++17 in Visual Studio 2017 (MSVC 14.1), pybind11 requires the flag
     ``/permissive-`` to be passed to the compiler `to enforce standard conformance`_. When
-    building with Visual Studio 2019, this is not strictly necessary, but still adviced.
+    building with Visual Studio 2019, this is not strictly necessary, but still advised.
 ..  _`to enforce standard conformance`: https://docs.microsoft.com/en-us/cpp/build/reference/permissive-standards-conformance?view=vs-2017
@@ -109,7 +109,7 @@ a file named :file:`example.cpp` with the following contents:
     PYBIND11_MODULE(example, m) {
         m.doc() = "pybind11 example plugin"; // optional module docstring
-        m.def("add", &add, "A function which adds two numbers");
+        m.def("add", &add, "A function that adds two numbers");
 .. [#f1] In practice, implementation and binding code will generally be located
@@ -118,8 +118,8 @@ a file named :file:`example.cpp` with the following contents:
 The :func:`PYBIND11_MODULE` macro creates a function that will be called when an
 ``import`` statement is issued from within Python. The module name (``example``)
 is given as the first macro argument (it should not be in quotes). The second
-argument (``m``) defines a variable of type :class:`py::module <module>` which
-is the main interface for creating bindings. The method :func:`module::def`
+argument (``m``) defines a variable of type :class:`py::module_ <module>` which
+is the main interface for creating bindings. The method :func:`module_::def`
 generates binding code that exposes the ``add()`` function to Python.
 .. note::
@@ -136,7 +136,14 @@ On Linux, the above example can be compiled using the following command:
 .. code-block:: bash
-    $ c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+    $ c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) example.cpp -o example$(python3-config --extension-suffix)
+.. note::
+    If you used :ref:`include_as_a_submodule` to get the pybind11 source, then
+    use ``$(python3-config --includes) -Iextern/pybind11/include`` instead of
+    ``$(python3 -m pybind11 --includes)`` in the above compilation, as
+    explained in :ref:`building_manually`.
 For more details on the required compiler flags on Linux and macOS, see
 :ref:`building_manually`. For complete cross-platform compilation instructions,
@@ -181,7 +188,7 @@ names of the arguments ("i" and "j" in this case).
           py::arg("i"), py::arg("j"));
 :class:`arg` is one of several special tag classes which can be used to pass
-metadata into :func:`module::def`. With this modified binding code, we can now
+metadata into :func:`module_::def`. With this modified binding code, we can now
 call the function using keyword arguments, which is a more readable alternative
 particularly for functions taking many parameters:
diff --git a/wrap/pybind11/docs/benchmark.py b/wrap/pybind11/docs/benchmark.py
index 023477212e..f190793671 100644
--- a/wrap/pybind11/docs/benchmark.py
+++ b/wrap/pybind11/docs/benchmark.py
@@ -1,8 +1,7 @@
 # -*- coding: utf-8 -*-
-import random
-import os
-import time
 import datetime as dt
+import os
+import random
 nfns = 4  # Functions per class
 nargs = 4  # Arguments per function
@@ -14,7 +13,7 @@ def generate_dummy_code_pybind11(nclasses=10):
     for cl in range(nclasses):
         decl += "class cl%03i;\n" % cl
-    decl += '\n'
+    decl += "\n"
     for cl in range(nclasses):
         decl += "class cl%03i {\n" % cl
@@ -22,18 +21,17 @@ def generate_dummy_code_pybind11(nclasses=10):
         bindings += '    py::class_<cl%03i>(m, "cl%03i")\n' % (cl, cl)
         for fn in range(nfns):
             ret = random.randint(0, nclasses - 1)
-            params  = [random.randint(0, nclasses - 1) for i in range(nargs)]
+            params = [random.randint(0, nclasses - 1) for i in range(nargs)]
             decl += "    cl%03i *fn_%03i(" % (ret, fn)
             decl += ", ".join("cl%03i *" % p for p in params)
             decl += ");\n"
-            bindings += '        .def("fn_%03i", &cl%03i::fn_%03i)\n' % \
-                (fn, cl, fn)
+            bindings += '        .def("fn_%03i", &cl%03i::fn_%03i)\n' % (fn, cl, fn)
         decl += "};\n\n"
-        bindings += '        ;\n'
+        bindings += "        ;\n"
     result = "#include <pybind11/pybind11.h>\n\n"
     result += "namespace py = pybind11;\n\n"
-    result += decl + '\n'
+    result += decl + "\n"
     result += "PYBIND11_MODULE(example, m) {\n"
     result += bindings
     result += "}"
@@ -46,7 +44,7 @@ def generate_dummy_code_boost(nclasses=10):
     for cl in range(nclasses):
         decl += "class cl%03i;\n" % cl
-    decl += '\n'
+    decl += "\n"
     for cl in range(nclasses):
         decl += "class cl%03i {\n" % cl
@@ -54,18 +52,20 @@ def generate_dummy_code_boost(nclasses=10):
         bindings += '    py::class_<cl%03i>("cl%03i")\n' % (cl, cl)
         for fn in range(nfns):
             ret = random.randint(0, nclasses - 1)
-            params  = [random.randint(0, nclasses - 1) for i in range(nargs)]
+            params = [random.randint(0, nclasses - 1) for i in range(nargs)]
             decl += "    cl%03i *fn_%03i(" % (ret, fn)
             decl += ", ".join("cl%03i *" % p for p in params)
             decl += ");\n"
-            bindings += '        .def("fn_%03i", &cl%03i::fn_%03i, py::return_value_policy<py::manage_new_object>())\n' % \
-                (fn, cl, fn)
+            bindings += (
+                '        .def("fn_%03i", &cl%03i::fn_%03i, py::return_value_policy<py::manage_new_object>())\n'
+                % (fn, cl, fn)
+            )
         decl += "};\n\n"
-        bindings += '        ;\n'
+        bindings += "        ;\n"
     result = "#include <boost/python.hpp>\n\n"
     result += "namespace py = boost::python;\n\n"
-    result += decl + '\n'
+    result += decl + "\n"
     result += "BOOST_PYTHON_MODULE(example) {\n"
     result += bindings
     result += "}"
@@ -73,17 +73,19 @@ def generate_dummy_code_boost(nclasses=10):
 for codegen in [generate_dummy_code_pybind11, generate_dummy_code_boost]:
-    print ("{")
+    print("{")
     for i in range(0, 10):
         nclasses = 2 ** i
         with open("test.cpp", "w") as f:
         n1 = dt.datetime.now()
-        os.system("g++ -Os -shared -rdynamic -undefined dynamic_lookup "
+        os.system(
+            "g++ -Os -shared -rdynamic -undefined dynamic_lookup "
             "-fvisibility=hidden -std=c++14 test.cpp -I include "
-            "-I /System/Library/Frameworks/Python.framework/Headers -o test.so")
+            "-I /System/Library/Frameworks/Python.framework/Headers -o test.so"
+        )
         n2 = dt.datetime.now()
         elapsed = (n2 - n1).total_seconds()
-        size = os.stat('test.so').st_size
+        size = os.stat("test.so").st_size
         print("   {%i, %f, %i}," % (nclasses * nfns, elapsed, size))
-    print ("}")
+    print("}")
diff --git a/wrap/pybind11/docs/changelog.rst b/wrap/pybind11/docs/changelog.rst
index 8f95c12741..16bf3aa3f3 100644
--- a/wrap/pybind11/docs/changelog.rst
+++ b/wrap/pybind11/docs/changelog.rst
@@ -6,21 +6,697 @@ Changelog
 Starting with version 1.8.0, pybind11 releases use a `semantic versioning
 <http://semver.org>`_ policy.
-v2.6.0 (IN PROGRESS)
+Version 2.9.1 (Feb 2, 2022)
+* If possible, attach Python exception with ``py::raise_from`` to ``TypeError``
+  when casting from C++ to Python. This will give additional info if Python
+  exceptions occur in the caster. Adds a test case of trying to convert a set
+  from C++ to Python when the hash function is not defined in Python.
+  `#3605 <https://github.com/pybind/pybind11/pull/3605>`_
+* Add a mapping of C++11 nested exceptions to their Python exception
+  equivalent using ``py::raise_from``. This attaches the nested exceptions in
+  Python using the ``__cause__`` field.
+  `#3608 <https://github.com/pybind/pybind11/pull/3608>`_
+* Propagate Python exception traceback using ``raise_from`` if a pybind11
+  function runs out of overloads.
+  `#3671 <https://github.com/pybind/pybind11/pull/3671>`_
+* ``py::multiple_inheritance`` is now only needed when C++ bases are hidden
+  from pybind11.
+  `#3650 <https://github.com/pybind/pybind11/pull/3650>`_ and
+  `#3659 <https://github.com/pybind/pybind11/pull/3659>`_
+Bug fixes:
+* Remove a boolean cast in ``numpy.h`` that causes MSVC C4800 warnings when
+  compiling against Python 3.10 or newer.
+  `#3669 <https://github.com/pybind/pybind11/pull/3669>`_
+* Render ``py::bool_`` and ``py::float_`` as ``bool`` and ``float``
+  respectively.
+  `#3622 <https://github.com/pybind/pybind11/pull/3622>`_
+Build system improvements:
+* Fix CMake extension suffix computation on Python 3.10+.
+  `#3663 <https://github.com/pybind/pybind11/pull/3663>`_
+* Allow ``CMAKE_ARGS`` to override CMake args in pybind11's own ``setup.py``.
+  `#3577 <https://github.com/pybind/pybind11/pull/3577>`_
+* Remove a few deprecated c-headers.
+  `#3610 <https://github.com/pybind/pybind11/pull/3610>`_
+* More uniform handling of test targets.
+  `#3590 <https://github.com/pybind/pybind11/pull/3590>`_
+* Add clang-tidy readability check to catch potentially swapped function args.
+  `#3611 <https://github.com/pybind/pybind11/pull/3611>`_
+Version 2.9.0 (Dec 28, 2021)
+This is the last version to support Python 2.7 and 3.5.
+New Features:
+* Allow ``py::args`` to be followed by other arguments; the remaining arguments
+  are implicitly keyword-only, as if a ``py::kw_only{}`` annotation had been
+  used.
+  `#3402 <https://github.com/pybind/pybind11/pull/3402>`_
+* Make str/bytes/memoryview more interoperable with ``std::string_view``.
+  `#3521 <https://github.com/pybind/pybind11/pull/3521>`_
+* Replace ``_`` with ``const_name`` in internals, avoid defining ``pybind::_``
+  if ``_`` defined as macro (common gettext usage)
+  `#3423 <https://github.com/pybind/pybind11/pull/3423>`_
+Bug fixes:
+* Fix a rare warning about extra copy in an Eigen constructor.
+  `#3486 <https://github.com/pybind/pybind11/pull/3486>`_
+* Fix caching of the C++ overrides.
+  `#3465 <https://github.com/pybind/pybind11/pull/3465>`_
+* Add missing ``std::forward`` calls to some ``cpp_function`` overloads.
+  `#3443 <https://github.com/pybind/pybind11/pull/3443>`_
+* Support PyPy 7.3.7 and the PyPy3.8 beta. Test python-3.11 on PRs with the
+  ``python dev`` label.
+  `#3419 <https://github.com/pybind/pybind11/pull/3419>`_
+* Replace usage of deprecated ``Eigen::MappedSparseMatrix`` with
+  ``Eigen::Map<Eigen::SparseMatrix<...>>`` for Eigen 3.3+.
+  `#3499 <https://github.com/pybind/pybind11/pull/3499>`_
+* Tweaks to support Microsoft Visual Studio 2022.
+  `#3497 <https://github.com/pybind/pybind11/pull/3497>`_
+Build system improvements:
+* Nicer CMake printout and IDE organisation for pybind11's own tests.
+  `#3479 <https://github.com/pybind/pybind11/pull/3479>`_
+* CMake: report version type as part of the version string to avoid a spurious
+  space in the package status message.
+  `#3472 <https://github.com/pybind/pybind11/pull/3472>`_
+* Flags starting with ``-g`` in ``$CFLAGS`` and ``$CPPFLAGS`` are no longer
+  overridden by ``.Pybind11Extension``.
+  `#3436 <https://github.com/pybind/pybind11/pull/3436>`_
+* Ensure ThreadPool is closed in ``setup_helpers``.
+  `#3548 <https://github.com/pybind/pybind11/pull/3548>`_
+* Avoid LTS on ``mips64`` and ``ppc64le`` (reported broken).
+  `#3557 <https://github.com/pybind/pybind11/pull/3557>`_
+v2.8.1 (Oct 27, 2021)
+Changes and additions:
+* The simple namespace creation shortcut added in 2.8.0 was deprecated due to
+  usage of CPython internal API, and will be removed soon. Use
+  ``py::module_::import("types").attr("SimpleNamespace")``.
+  `#3374 <https://github.com/pybinyyd/pybind11/pull/3374>`_
+* Add C++ Exception type to throw and catch ``AttributeError``. Useful for
+  defining custom ``__setattr__`` and ``__getattr__`` methods.
+  `#3387 <https://github.com/pybind/pybind11/pull/3387>`_
+* Fixed the potential for dangling references when using properties with
+  ``std::optional`` types.
+  `#3376 <https://github.com/pybind/pybind11/pull/3376>`_
+* Modernize usage of ``PyCodeObject`` on Python 3.9+ (moving toward support for
+  Python 3.11a1)
+  `#3368 <https://github.com/pybind/pybind11/pull/3368>`_
+* A long-standing bug in ``eigen.h`` was fixed (originally PR #3343). The bug
+  was unmasked by newly added ``static_assert``'s in the Eigen 3.4.0 release.
+  `#3352 <https://github.com/pybind/pybind11/pull/3352>`_
+* Support multiple raw inclusion of CMake helper files (Conan.io does this for
+  multi-config generators).
+  `#3420 <https://github.com/pybind/pybind11/pull/3420>`_
+* Fix harmless warning on upcoming CMake 3.22.
+  `#3368 <https://github.com/pybind/pybind11/pull/3368>`_
+* Fix 2.8.0 regression with MSVC 2017 + C++17 mode + Python 3.
+  `#3407 <https://github.com/pybind/pybind11/pull/3407>`_
+* Fix 2.8.0 regression that caused undefined behavior (typically
+  segfaults) in ``make_key_iterator``/``make_value_iterator`` if dereferencing
+  the iterator returned a temporary value instead of a reference.
+  `#3348 <https://github.com/pybind/pybind11/pull/3348>`_
+v2.8.0 (Oct 4, 2021)
-See :ref:`upgrade-guide-2.6` for help upgrading to the new version.
+New features:
-* Provide an additional spelling of ``py::module`` - ``py::module_`` (with a
-  trailing underscore), for C++20 compatibility. Only relevant when used
-  unqualified.
-  `#2489 <https://github.com/pybind/pybind11/pull/2489>`_
+* Added ``py::raise_from`` to enable chaining exceptions.
+  `#3215 <https://github.com/pybind/pybind11/pull/3215>`_
-* ``pybind11_add_module()`` now accepts an optional ``OPT_SIZE`` flag that
-  switches the binding target to size-based optimization regardless global
-  CMake build type (except in debug mode, where optimizations remain disabled).
-  This reduces binary size quite substantially (~25%).
-  `#2463 <https://github.com/pybind/pybind11/pull/2463>`_
+* Allow exception translators to be optionally registered local to a module
+  instead of applying globally across all pybind11 modules. Use
+  ``register_local_exception_translator(ExceptionTranslator&& translator)``
+  instead of  ``register_exception_translator(ExceptionTranslator&&
+  translator)`` to keep your exception remapping code local to the module.
+  `#2650 <https://github.com/pybinyyd/pybind11/pull/2650>`_
+* Add ``make_simple_namespace`` function for instantiating Python
+  ``SimpleNamespace`` objects. **Deprecated in 2.8.1.**
+  `#2840 <https://github.com/pybind/pybind11/pull/2840>`_
+* ``pybind11::scoped_interpreter`` and ``initialize_interpreter`` have new
+  arguments to allow ``sys.argv`` initialization.
+  `#2341 <https://github.com/pybind/pybind11/pull/2341>`_
+* Allow Python builtins to be used as callbacks in CPython.
+  `#1413 <https://github.com/pybind/pybind11/pull/1413>`_
+* Added ``view`` to view arrays with a different datatype.
+  `#987 <https://github.com/pybind/pybind11/pull/987>`_
+* Implemented ``reshape`` on arrays.
+  `#984 <https://github.com/pybind/pybind11/pull/984>`_
+* Enable defining custom ``__new__`` methods on classes by fixing bug
+  preventing overriding methods if they have non-pybind11 siblings.
+  `#3265 <https://github.com/pybind/pybind11/pull/3265>`_
+* Add ``make_value_iterator()``, and fix ``make_key_iterator()`` to return
+  references instead of copies.
+  `#3293 <https://github.com/pybind/pybind11/pull/3293>`_
+* Improve the classes generated by ``bind_map``: `#3310 <https://github.com/pybind/pybind11/pull/3310>`_
+  * Change ``.items`` from an iterator to a dictionary view.
+  * Add ``.keys`` and ``.values`` (both dictionary views).
+  * Allow ``__contains__`` to take any object.
+* ``pybind11::custom_type_setup`` was added, for customizing the
+  ``PyHeapTypeObject`` corresponding to a class, which may be useful for
+  enabling garbage collection support, among other things.
+  `#3287 <https://github.com/pybind/pybind11/pull/3287>`_
+* Set ``__file__`` constant when running ``eval_file`` in an embedded interpreter.
+  `#3233 <https://github.com/pybind/pybind11/pull/3233>`_
+* Python objects and (C++17) ``std::optional`` now accepted in ``py::slice``
+  constructor.
+  `#1101 <https://github.com/pybind/pybind11/pull/1101>`_
+* The pybind11 proxy types ``str``, ``bytes``, ``bytearray``, ``tuple``,
+  ``list`` now consistently support passing ``ssize_t`` values for sizes and
+  indexes. Previously, only ``size_t`` was accepted in several interfaces.
+  `#3219 <https://github.com/pybind/pybind11/pull/3219>`_
+* Avoid evaluating ``PYBIND11_TLS_REPLACE_VALUE`` arguments more than once.
+  `#3290 <https://github.com/pybind/pybind11/pull/3290>`_
+* Bug fix: enum value's ``__int__`` returning non-int when underlying type is
+  bool or of char type.
+  `#1334 <https://github.com/pybind/pybind11/pull/1334>`_
+* Fixes bug in setting error state in Capsule's pointer methods.
+  `#3261 <https://github.com/pybind/pybind11/pull/3261>`_
+* A long-standing memory leak in ``py::cpp_function::initialize`` was fixed.
+  `#3229 <https://github.com/pybind/pybind11/pull/3229>`_
+* Fixes thread safety for some ``pybind11::type_caster`` which require lifetime
+  extension, such as for ``std::string_view``.
+  `#3237 <https://github.com/pybind/pybind11/pull/3237>`_
+* Restore compatibility with gcc 4.8.4 as distributed by ubuntu-trusty, linuxmint-17.
+  `#3270 <https://github.com/pybind/pybind11/pull/3270>`_
+Build system improvements:
+* Fix regression in CMake Python package config: improper use of absolute path.
+  `#3144 <https://github.com/pybind/pybind11/pull/3144>`_
+* Cached Python version information could become stale when CMake was re-run
+  with a different Python version. The build system now detects this and
+  updates this information.
+  `#3299 <https://github.com/pybind/pybind11/pull/3299>`_
+* Specified UTF8-encoding in setup.py calls of open().
+  `#3137 <https://github.com/pybind/pybind11/pull/3137>`_
+* Fix a harmless warning from CMake 3.21 with the classic Python discovery.
+  `#3220 <https://github.com/pybind/pybind11/pull/3220>`_
+* Eigen repo and version can now be specified as cmake options.
+  `#3324 <https://github.com/pybind/pybind11/pull/3324>`_
+Backend and tidying up:
+* Reduced thread-local storage required for keeping alive temporary data for
+  type conversion to one key per ABI version, rather than one key per extension
+  module.  This makes the total thread-local storage required by pybind11 2
+  keys per ABI version.
+  `#3275 <https://github.com/pybind/pybind11/pull/3275>`_
+* Optimize NumPy array construction with additional moves.
+  `#3183 <https://github.com/pybind/pybind11/pull/3183>`_
+* Conversion to ``std::string`` and ``std::string_view`` now avoids making an
+  extra copy of the data on Python >= 3.3.
+  `#3257 <https://github.com/pybind/pybind11/pull/3257>`_
+* Remove const modifier from certain C++ methods on Python collections
+  (``list``, ``set``, ``dict``) such as (``clear()``, ``append()``,
+  ``insert()``, etc...) and annotated them with ``py-non-const``.
+* Enable readability ``clang-tidy-const-return`` and remove useless consts.
+  `#3254 <https://github.com/pybind/pybind11/pull/3254>`_
+  `#3194 <https://github.com/pybind/pybind11/pull/3194>`_
+* The clang-tidy ``google-explicit-constructor`` option was enabled.
+  `#3250 <https://github.com/pybind/pybind11/pull/3250>`_
+* Mark a pytype move constructor as noexcept (perf).
+  `#3236 <https://github.com/pybind/pybind11/pull/3236>`_
+* Enable clang-tidy check to guard against inheritance slicing.
+  `#3210 <https://github.com/pybind/pybind11/pull/3210>`_
+* Legacy warning suppression pragma were removed from eigen.h. On Unix
+  platforms, please use -isystem for Eigen include directories, to suppress
+  compiler warnings originating from Eigen headers. Note that CMake does this
+  by default. No adjustments are needed for Windows.
+  `#3198 <https://github.com/pybind/pybind11/pull/3198>`_
+* Format pybind11 with isort consistent ordering of imports
+  `#3195 <https://github.com/pybind/pybind11/pull/3195>`_
+* The warnings-suppression "pragma clamp" at the top/bottom of pybind11 was
+  removed, clearing the path to refactoring and IWYU cleanup.
+  `#3186 <https://github.com/pybind/pybind11/pull/3186>`_
+* Enable most bugprone checks in clang-tidy and fix the found potential bugs
+  and poor coding styles.
+  `#3166 <https://github.com/pybind/pybind11/pull/3166>`_
+* Add ``clang-tidy-readability`` rules to make boolean casts explicit improving
+  code readability. Also enabled other misc and readability clang-tidy checks.
+  `#3148 <https://github.com/pybind/pybind11/pull/3148>`_
+* Move object in ``.pop()`` for list.
+  `#3116 <https://github.com/pybind/pybind11/pull/3116>`_
+v2.7.1 (Aug 3, 2021)
+Minor missing functionality added:
+* Allow Python builtins to be used as callbacks in CPython.
+  `#1413 <https://github.com/pybind/pybind11/pull/1413>`_
+Bug fixes:
+* Fix regression in CMake Python package config: improper use of absolute path.
+  `#3144 <https://github.com/pybind/pybind11/pull/3144>`_
+* Fix Mingw64 and add to the CI testing matrix.
+  `#3132 <https://github.com/pybind/pybind11/pull/3132>`_
+* Specified UTF8-encoding in setup.py calls of open().
+  `#3137 <https://github.com/pybind/pybind11/pull/3137>`_
+* Add clang-tidy-readability rules to make boolean casts explicit improving
+  code readability. Also enabled other misc and readability clang-tidy checks.
+  `#3148 <https://github.com/pybind/pybind11/pull/3148>`_
+* Move object in ``.pop()`` for list.
+  `#3116 <https://github.com/pybind/pybind11/pull/3116>`_
+Backend and tidying up:
+* Removed and fixed warning suppressions.
+  `#3127 <https://github.com/pybind/pybind11/pull/3127>`_
+  `#3129 <https://github.com/pybind/pybind11/pull/3129>`_
+  `#3135 <https://github.com/pybind/pybind11/pull/3135>`_
+  `#3141 <https://github.com/pybind/pybind11/pull/3141>`_
+  `#3142 <https://github.com/pybind/pybind11/pull/3142>`_
+  `#3150 <https://github.com/pybind/pybind11/pull/3150>`_
+  `#3152 <https://github.com/pybind/pybind11/pull/3152>`_
+  `#3160 <https://github.com/pybind/pybind11/pull/3160>`_
+  `#3161 <https://github.com/pybind/pybind11/pull/3161>`_
+v2.7.0 (Jul 16, 2021)
+New features:
+* Enable ``py::implicitly_convertible<py::none, ...>`` for
+  ``py::class_``-wrapped types.
+  `#3059 <https://github.com/pybind/pybind11/pull/3059>`_
+* Allow function pointer extraction from overloaded functions.
+  `#2944 <https://github.com/pybind/pybind11/pull/2944>`_
+* NumPy: added ``.char_()`` to type which gives the NumPy public ``char``
+  result, which also distinguishes types by bit length (unlike ``.kind()``).
+  `#2864 <https://github.com/pybind/pybind11/pull/2864>`_
+* Add ``pybind11::bytearray`` to manipulate ``bytearray`` similar to ``bytes``.
+  `#2799 <https://github.com/pybind/pybind11/pull/2799>`_
+* ``pybind11/stl/filesystem.h`` registers a type caster that, on C++17/Python
+  3.6+, converts ``std::filesystem::path`` to ``pathlib.Path`` and any
+  ``os.PathLike`` to ``std::filesystem::path``.
+  `#2730 <https://github.com/pybind/pybind11/pull/2730>`_
+* A ``PYBIND11_VERSION_HEX`` define was added, similar to ``PY_VERSION_HEX``.
+  `#3120 <https://github.com/pybind/pybind11/pull/3120>`_
+* ``py::str`` changed to exclusively hold ``PyUnicodeObject``. Previously
+  ``py::str`` could also hold ``bytes``, which is probably surprising, was
+  never documented, and can mask bugs (e.g. accidental use of ``py::str``
+  instead of ``py::bytes``).
+  `#2409 <https://github.com/pybind/pybind11/pull/2409>`_
+* Add a safety guard to ensure that the Python GIL is held when C++ calls back
+  into Python via ``object_api<>::operator()`` (e.g. ``py::function``
+  ``__call__``).  (This feature is available for Python 3.6+ only.)
+  `#2919 <https://github.com/pybind/pybind11/pull/2919>`_
+* Catch a missing ``self`` argument in calls to ``__init__()``.
+  `#2914 <https://github.com/pybind/pybind11/pull/2914>`_
+* Use ``std::string_view`` if available to avoid a copy when passing an object
+  to a ``std::ostream``.
+  `#3042 <https://github.com/pybind/pybind11/pull/3042>`_
+* An important warning about thread safety was added to the ``iostream.h``
+  documentation; attempts to make ``py::scoped_ostream_redirect`` thread safe
+  have been removed, as it was only partially effective.
+  `#2995 <https://github.com/pybind/pybind11/pull/2995>`_
+* Performance: avoid unnecessary strlen calls.
+  `#3058 <https://github.com/pybind/pybind11/pull/3058>`_
+* Fix auto-generated documentation string when using ``const T`` in
+  ``pyarray_t``.
+  `#3020 <https://github.com/pybind/pybind11/pull/3020>`_
+* Unify error messages thrown by ``simple_collector``/``unpacking_collector``.
+  `#3013 <https://github.com/pybind/pybind11/pull/3013>`_
+* ``pybind11::builtin_exception`` is now explicitly exported, which means the
+  types included/defined in different modules are identical, and exceptions
+  raised in different modules can be caught correctly. The documentation was
+  updated to explain that custom exceptions that are used across module
+  boundaries need to be explicitly exported as well.
+  `#2999 <https://github.com/pybind/pybind11/pull/2999>`_
+* Fixed exception when printing UTF-8 to a ``scoped_ostream_redirect``.
+  `#2982 <https://github.com/pybind/pybind11/pull/2982>`_
+* Pickle support enhancement: ``setstate`` implementation will attempt to
+  ``setattr`` ``__dict__`` only if the unpickled ``dict`` object is not empty,
+  to not force use of ``py::dynamic_attr()`` unnecessarily.
+  `#2972 <https://github.com/pybind/pybind11/pull/2972>`_
+* Allow negative timedelta values to roundtrip.
+  `#2870 <https://github.com/pybind/pybind11/pull/2870>`_
+* Fix unchecked errors could potentially swallow signals/other exceptions.
+  `#2863 <https://github.com/pybind/pybind11/pull/2863>`_
+* Add null pointer check with ``std::localtime``.
+  `#2846 <https://github.com/pybind/pybind11/pull/2846>`_
+* Fix the ``weakref`` constructor from ``py::object`` to create a new
+  ``weakref`` on conversion.
+  `#2832 <https://github.com/pybind/pybind11/pull/2832>`_
+* Avoid relying on exceptions in C++17 when getting a ``shared_ptr`` holder
+  from a ``shared_from_this`` class.
+  `#2819 <https://github.com/pybind/pybind11/pull/2819>`_
+* Allow the codec's exception to be raised instead of :code:`RuntimeError` when
+  casting from :code:`py::str` to :code:`std::string`.
+  `#2903 <https://github.com/pybind/pybind11/pull/2903>`_
+Build system improvements:
+* In ``setup_helpers.py``, test for platforms that have some multiprocessing
+  features but lack semaphores, which ``ParallelCompile`` requires.
+  `#3043 <https://github.com/pybind/pybind11/pull/3043>`_
+* Fix ``pybind11_INCLUDE_DIR`` in case ``CMAKE_INSTALL_INCLUDEDIR`` is
+  absolute.
+  `#3005 <https://github.com/pybind/pybind11/pull/3005>`_
+* Fix bug not respecting ``WITH_SOABI`` or ``WITHOUT_SOABI`` to CMake.
+  `#2938 <https://github.com/pybind/pybind11/pull/2938>`_
+* Fix the default ``Pybind11Extension`` compilation flags with a Mingw64 python.
+  `#2921 <https://github.com/pybind/pybind11/pull/2921>`_
+* Clang on Windows: do not pass ``/MP`` (ignored flag).
+  `#2824 <https://github.com/pybind/pybind11/pull/2824>`_
+* ``pybind11.setup_helpers.intree_extensions`` can be used to generate
+  ``Pybind11Extension`` instances from cpp files placed in the Python package
+  source tree.
+  `#2831 <https://github.com/pybind/pybind11/pull/2831>`_
+Backend and tidying up:
+* Enable clang-tidy performance, readability, and modernization checks
+  throughout the codebase to enforce best coding practices.
+  `#3046 <https://github.com/pybind/pybind11/pull/3046>`_,
+  `#3049 <https://github.com/pybind/pybind11/pull/3049>`_,
+  `#3051 <https://github.com/pybind/pybind11/pull/3051>`_,
+  `#3052 <https://github.com/pybind/pybind11/pull/3052>`_,
+  `#3080 <https://github.com/pybind/pybind11/pull/3080>`_, and
+  `#3094 <https://github.com/pybind/pybind11/pull/3094>`_
+* Checks for common misspellings were added to the pre-commit hooks.
+  `#3076 <https://github.com/pybind/pybind11/pull/3076>`_
+* Changed ``Werror`` to stricter ``Werror-all`` for Intel compiler and fixed
+  minor issues.
+  `#2948 <https://github.com/pybind/pybind11/pull/2948>`_
+* Fixed compilation with GCC < 5 when the user defines ``_GLIBCXX_USE_CXX11_ABI``.
+  `#2956 <https://github.com/pybind/pybind11/pull/2956>`_
+* Added nox support for easier local testing and linting of contributions.
+  `#3101 <https://github.com/pybind/pybind11/pull/3101>`_ and
+  `#3121 <https://github.com/pybind/pybind11/pull/3121>`_
+* Avoid RTD style issue with docutils 0.17+.
+  `#3119 <https://github.com/pybind/pybind11/pull/3119>`_
+* Support pipx run, such as ``pipx run pybind11 --include`` for a quick compile.
+  `#3117 <https://github.com/pybind/pybind11/pull/3117>`_
+v2.6.2 (Jan 26, 2021)
+Minor missing functionality added:
+* enum: add missing Enum.value property.
+  `#2739 <https://github.com/pybind/pybind11/pull/2739>`_
+* Allow thread termination to be avoided during shutdown for CPython 3.7+ via
+  ``.disarm`` for ``gil_scoped_acquire``/``gil_scoped_release``.
+  `#2657 <https://github.com/pybind/pybind11/pull/2657>`_
+Fixed or improved behavior in a few special cases:
+* Fix bug where the constructor of ``object`` subclasses would not throw on
+  being passed a Python object of the wrong type.
+  `#2701 <https://github.com/pybind/pybind11/pull/2701>`_
+* The ``type_caster`` for integers does not convert Python objects with
+  ``__int__`` anymore with ``noconvert`` or during the first round of trying
+  overloads.
+  `#2698 <https://github.com/pybind/pybind11/pull/2698>`_
+* When casting to a C++ integer, ``__index__`` is always called and not
+  considered as conversion, consistent with Python 3.8+.
+  `#2801 <https://github.com/pybind/pybind11/pull/2801>`_
+Build improvements:
+* Setup helpers: ``extra_compile_args`` and ``extra_link_args`` automatically set by
+  Pybind11Extension are now prepended, which allows them to be overridden
+  by user-set ``extra_compile_args`` and ``extra_link_args``.
+  `#2808 <https://github.com/pybind/pybind11/pull/2808>`_
+* Setup helpers: Don't trigger unused parameter warning.
+  `#2735 <https://github.com/pybind/pybind11/pull/2735>`_
+* CMake: Support running with ``--warn-uninitialized`` active.
+  `#2806 <https://github.com/pybind/pybind11/pull/2806>`_
+* CMake: Avoid error if included from two submodule directories.
+  `#2804 <https://github.com/pybind/pybind11/pull/2804>`_
+* CMake: Fix ``STATIC`` / ``SHARED`` being ignored in FindPython mode.
+  `#2796 <https://github.com/pybind/pybind11/pull/2796>`_
+* CMake: Respect the setting for ``CMAKE_CXX_VISIBILITY_PRESET`` if defined.
+  `#2793 <https://github.com/pybind/pybind11/pull/2793>`_
+* CMake: Fix issue with FindPython2/FindPython3 not working with ``pybind11::embed``.
+  `#2662 <https://github.com/pybind/pybind11/pull/2662>`_
+* CMake: mixing local and installed pybind11's would prioritize the installed
+  one over the local one (regression in 2.6.0).
+  `#2716 <https://github.com/pybind/pybind11/pull/2716>`_
+Bug fixes:
+* Fixed segfault in multithreaded environments when using
+  ``scoped_ostream_redirect``.
+  `#2675 <https://github.com/pybind/pybind11/pull/2675>`_
+* Leave docstring unset when all docstring-related options are disabled, rather
+  than set an empty string.
+  `#2745 <https://github.com/pybind/pybind11/pull/2745>`_
+* The module key in builtins that pybind11 uses to store its internals changed
+  from std::string to a python str type (more natural on Python 2, no change on
+  Python 3).
+  `#2814 <https://github.com/pybind/pybind11/pull/2814>`_
+* Fixed assertion error related to unhandled (later overwritten) exception in
+  CPython 3.8 and 3.9 debug builds.
+  `#2685 <https://github.com/pybind/pybind11/pull/2685>`_
+* Fix ``py::gil_scoped_acquire`` assert with CPython 3.9 debug build.
+  `#2683 <https://github.com/pybind/pybind11/pull/2683>`_
+* Fix issue with a test failing on pytest 6.2.
+  `#2741 <https://github.com/pybind/pybind11/pull/2741>`_
+Warning fixes:
+* Fix warning modifying constructor parameter 'flag' that shadows a field of
+  'set_flag' ``[-Wshadow-field-in-constructor-modified]``.
+  `#2780 <https://github.com/pybind/pybind11/pull/2780>`_
+* Suppressed some deprecation warnings about old-style
+  ``__init__``/``__setstate__`` in the tests.
+  `#2759 <https://github.com/pybind/pybind11/pull/2759>`_
+Valgrind work:
+* Fix invalid access when calling a pybind11 ``__init__`` on a non-pybind11
+  class instance.
+  `#2755 <https://github.com/pybind/pybind11/pull/2755>`_
+* Fixed various minor memory leaks in pybind11's test suite.
+  `#2758 <https://github.com/pybind/pybind11/pull/2758>`_
+* Resolved memory leak in cpp_function initialization when exceptions occurred.
+  `#2756 <https://github.com/pybind/pybind11/pull/2756>`_
+* Added a Valgrind build, checking for leaks and memory-related UB, to CI.
+  `#2746 <https://github.com/pybind/pybind11/pull/2746>`_
+Compiler support:
+* Intel compiler was not activating C++14 support due to a broken define.
+  `#2679 <https://github.com/pybind/pybind11/pull/2679>`_
+* Support ICC and NVIDIA HPC SDK in C++17 mode.
+  `#2729 <https://github.com/pybind/pybind11/pull/2729>`_
+* Support Intel OneAPI compiler (ICC 20.2) and add to CI.
+  `#2573 <https://github.com/pybind/pybind11/pull/2573>`_
+v2.6.1 (Nov 11, 2020)
+* ``py::exec``, ``py::eval``, and ``py::eval_file`` now add the builtins module
+  as ``"__builtins__"`` to their ``globals`` argument, better matching ``exec``
+  and ``eval`` in pure Python.
+  `#2616 <https://github.com/pybind/pybind11/pull/2616>`_
+* ``setup_helpers`` will no longer set a minimum macOS version higher than the
+  current version.
+  `#2622 <https://github.com/pybind/pybind11/pull/2622>`_
+* Allow deleting static properties.
+  `#2629 <https://github.com/pybind/pybind11/pull/2629>`_
+* Seal a leak in ``def_buffer``, cleaning up the ``capture`` object after the
+  ``class_`` object goes out of scope.
+  `#2634 <https://github.com/pybind/pybind11/pull/2634>`_
+* ``pybind11_INCLUDE_DIRS`` was incorrect, potentially causing a regression if
+  it was expected to include ``PYTHON_INCLUDE_DIRS`` (please use targets
+  instead).
+  `#2636 <https://github.com/pybind/pybind11/pull/2636>`_
+* Added parameter names to the ``py::enum_`` constructor and methods, avoiding
+  ``arg0`` in the generated docstrings.
+  `#2637 <https://github.com/pybind/pybind11/pull/2637>`_
+* Added ``needs_recompile`` optional function to the ``ParallelCompiler``
+  helper, to allow a recompile to be skipped based on a user-defined function.
+  `#2643 <https://github.com/pybind/pybind11/pull/2643>`_
+v2.6.0 (Oct 21, 2020)
+See :ref:`upgrade-guide-2.6` for help upgrading to the new version.
+New features:
 * Keyword-only arguments supported in Python 2 or 3 with ``py::kw_only()``.
   `#2100 <https://github.com/pybind/pybind11/pull/2100>`_
@@ -28,11 +704,17 @@ See :ref:`upgrade-guide-2.6` for help upgrading to the new version.
 * Positional-only arguments supported in Python 2 or 3 with ``py::pos_only()``.
   `#2459 <https://github.com/pybind/pybind11/pull/2459>`_
+* ``py::is_final()`` class modifier to block subclassing (CPython only).
+  `#2151 <https://github.com/pybind/pybind11/pull/2151>`_
+* Added ``py::prepend()``, allowing a function to be placed at the beginning of
+  the overload chain.
+  `#1131 <https://github.com/pybind/pybind11/pull/1131>`_
 * Access to the type object now provided with ``py::type::of<T>()`` and
   `#2364 <https://github.com/pybind/pybind11/pull/2364>`_
 * Perfect forwarding support for methods.
   `#2048 <https://github.com/pybind/pybind11/pull/2048>`_
@@ -42,11 +724,48 @@ See :ref:`upgrade-guide-2.6` for help upgrading to the new version.
 * ``py::hash`` is now public.
   `#2217 <https://github.com/pybind/pybind11/pull/2217>`_
-* ``py::is_final()`` class modifier to block subclassing (CPython only).
-  `#2151 <https://github.com/pybind/pybind11/pull/2151>`_
+* ``py::class_<union_type>`` is now supported. Note that writing to one data
+  member of the union and reading another (type punning) is UB in C++. Thus
+  pybind11-bound enums should never be used for such conversions.
+  `#2320 <https://github.com/pybind/pybind11/pull/2320>`_.
-* ``py::memoryview``  update and documentation.
-  `#2223 <https://github.com/pybind/pybind11/pull/2223>`_
+* Classes now check local scope when registering members, allowing a subclass
+  to have a member with the same name as a parent (such as an enum).
+  `#2335 <https://github.com/pybind/pybind11/pull/2335>`_
+Code correctness features:
+* Error now thrown when ``__init__`` is forgotten on subclasses.
+  `#2152 <https://github.com/pybind/pybind11/pull/2152>`_
+* Throw error if conversion to a pybind11 type if the Python object isn't a
+  valid instance of that type, such as ``py::bytes(o)`` when ``py::object o``
+  isn't a bytes instance.
+  `#2349 <https://github.com/pybind/pybind11/pull/2349>`_
+* Throw if conversion to ``str`` fails.
+  `#2477 <https://github.com/pybind/pybind11/pull/2477>`_
+API changes:
+* ``py::module`` was renamed ``py::module_`` to avoid issues with C++20 when
+  used unqualified, but an alias ``py::module`` is provided for backward
+  compatibility.
+  `#2489 <https://github.com/pybind/pybind11/pull/2489>`_
+* Public constructors for ``py::module_`` have been deprecated; please use
+  ``pybind11::module_::create_extension_module`` if you were using the public
+  constructor (fairly rare after ``PYBIND11_MODULE`` was introduced).
+  `#2552 <https://github.com/pybind/pybind11/pull/2552>`_
+* ``PYBIND11_OVERLOAD*`` macros and ``get_overload`` function replaced by
+  correctly-named ``PYBIND11_OVERRIDE*`` and ``get_override``, fixing
+  inconsistencies in the presence of a closing ``;`` in these macros.
+  ``get_type_overload`` is deprecated.
+  `#2325 <https://github.com/pybind/pybind11/pull/2325>`_
+Packaging / building improvements:
 * The Python package was reworked to be more powerful and useful.
   `#2433 <https://github.com/pybind/pybind11/pull/2433>`_
@@ -54,7 +773,7 @@ See :ref:`upgrade-guide-2.6` for help upgrading to the new version.
   * :ref:`build-setuptools` is easier thanks to a new
     ``pybind11.setup_helpers`` module, which provides utilities to use
     setuptools with pybind11. It can be used via PEP 518, ``setup_requires``,
-    or by directly copying ``setup_helpers.py`` into your project.
+    or by directly importing or copying ``setup_helpers.py`` into your project.
   * CMake configuration files are now included in the Python package. Use
     ``pybind11.get_cmake_dir()`` or ``python -m pybind11 --cmakedir`` to get
@@ -62,17 +781,21 @@ See :ref:`upgrade-guide-2.6` for help upgrading to the new version.
     site-packages location in your ``CMAKE_MODULE_PATH``. Or you can use the
     new ``pybind11[global]`` extra when you install ``pybind11``, which
     installs the CMake files and headers into your base environment in the
-    standard location
+    standard location.
   * ``pybind11-config`` is another way to write ``python -m pybind11`` if you
     have your PATH set up.
+  * Added external typing support to the helper module, code from
+    ``import pybind11`` can now be type checked.
+    `#2588 <https://github.com/pybind/pybind11/pull/2588>`_
 * Minimum CMake required increased to 3.4.
   `#2338 <https://github.com/pybind/pybind11/pull/2338>`_ and
   `#2370 <https://github.com/pybind/pybind11/pull/2370>`_
-  * Full integration with CMake’s C++ standard system replaces
+  * Full integration with CMake’s C++ standard system and compile features
+    replaces ``PYBIND11_CPP_STANDARD``.
   * Generated config file is now portable to different Python/compiler/CMake
@@ -85,27 +808,36 @@ See :ref:`upgrade-guide-2.6` for help upgrading to the new version.
-* Optional :ref:`find-python-mode` and :ref:`nopython-mode` with CMake.
-  `#2370 <https://github.com/pybind/pybind11/pull/2370>`_
+  * ``CUDA`` as a language is now supported.
+  * Helper functions ``pybind11_strip``, ``pybind11_extension``,
+    ``pybind11_find_import`` added, see :doc:`cmake/index`.
+  * Optional :ref:`find-python-mode` and :ref:`nopython-mode` with CMake.
+    `#2370 <https://github.com/pybind/pybind11/pull/2370>`_
 * Uninstall target added.
   `#2265 <https://github.com/pybind/pybind11/pull/2265>`_ and
   `#2346 <https://github.com/pybind/pybind11/pull/2346>`_
-* ``PYBIND11_OVERLOAD*`` macros and ``get_overload`` function replaced by
-  correctly-named ``PYBIND11_OVERRIDE*`` and ``get_override``, fixing
-  inconsistencies in the presene of a closing ``;`` in these macros.
-  ``get_type_overload`` is deprecated.
-  `#2325 <https://github.com/pybind/pybind11/pull/2325>`_
+* ``pybind11_add_module()`` now accepts an optional ``OPT_SIZE`` flag that
+  switches the binding target to size-based optimization if the global build
+  type can not always be fixed to ``MinSizeRel`` (except in debug mode, where
+  optimizations remain disabled).  ``MinSizeRel`` or this flag reduces binary
+  size quite substantially (~25% on some platforms).
+  `#2463 <https://github.com/pybind/pybind11/pull/2463>`_
-Smaller or developer focused features:
+Smaller or developer focused features and fixes:
-* Moved ``mkdoc.py`` to a new repo, `pybind11-mkdoc`_.
+* Moved ``mkdoc.py`` to a new repo, `pybind11-mkdoc`_. There are no longer
+  submodules in the main repo.
-.. _pybind11-mkdoc: https://github.com/pybind/pybind11-mkdoc
+* ``py::memoryview`` segfault fix and update, with new
+  ``py::memoryview::from_memory`` in Python 3, and documentation.
+  `#2223 <https://github.com/pybind/pybind11/pull/2223>`_
-* Error now thrown when ``__init__`` is forgotten on subclasses.
-  `#2152 <https://github.com/pybind/pybind11/pull/2152>`_
+* Fix for ``buffer_info`` on Python 2.
+  `#2503 <https://github.com/pybind/pybind11/pull/2503>`_
 * If ``__eq__`` defined but not ``__hash__``, ``__hash__`` is now set to
@@ -114,12 +846,6 @@ Smaller or developer focused features:
 * ``py::ellipsis`` now also works on Python 2.
   `#2360 <https://github.com/pybind/pybind11/pull/2360>`_
-* Throw if conversion to ``str`` fails.
-  `#2477 <https://github.com/pybind/pybind11/pull/2477>`_
-* Added missing signature for ``py::array``.
-  `#2363 <https://github.com/pybind/pybind11/pull/2363>`_
 * Pointer to ``std::tuple`` & ``std::pair`` supported in cast.
   `#2334 <https://github.com/pybind/pybind11/pull/2334>`_
@@ -127,7 +853,26 @@ Smaller or developer focused features:
   argument type.
   `#2293 <https://github.com/pybind/pybind11/pull/2293>`_
-* Bugfixes related to more extensive testing
+* Added missing signature for ``py::array``.
+  `#2363 <https://github.com/pybind/pybind11/pull/2363>`_
+* ``unchecked_mutable_reference`` has access to operator ``()`` and ``[]`` when
+  const.
+  `#2514 <https://github.com/pybind/pybind11/pull/2514>`_
+* ``py::vectorize`` is now supported on functions that return void.
+  `#1969 <https://github.com/pybind/pybind11/pull/1969>`_
+* ``py::capsule`` supports ``get_pointer`` and ``set_pointer``.
+  `#1131 <https://github.com/pybind/pybind11/pull/1131>`_
+* Fix crash when different instances share the same pointer of the same type.
+  `#2252 <https://github.com/pybind/pybind11/pull/2252>`_
+* Fix for ``py::len`` not clearing Python's error state when it fails and throws.
+  `#2575 <https://github.com/pybind/pybind11/pull/2575>`_
+* Bugfixes related to more extensive testing, new GitHub Actions CI.
   `#2321 <https://github.com/pybind/pybind11/pull/2321>`_
 * Bug in timezone issue in Eastern hemisphere midnight fixed.
@@ -141,16 +886,27 @@ Smaller or developer focused features:
   requested ordering.
   `#2484 <https://github.com/pybind/pybind11/pull/2484>`_
-* PyPy fixes, including support for PyPy3 and PyPy 7.
+* Avoid a segfault on some compilers when types are removed in Python.
+  `#2564 <https://github.com/pybind/pybind11/pull/2564>`_
+* ``py::arg::none()`` is now also respected when passing keyword arguments.
+  `#2611 <https://github.com/pybind/pybind11/pull/2611>`_
+* PyPy fixes, PyPy 7.3.x now supported, including PyPy3. (Known issue with
+  PyPy2 and Windows `#2596 <https://github.com/pybind/pybind11/issues/2596>`_).
   `#2146 <https://github.com/pybind/pybind11/pull/2146>`_
-* CPython 3.9 fixes.
+* CPython 3.9.0 workaround for undefined behavior (macOS segfault).
+  `#2576 <https://github.com/pybind/pybind11/pull/2576>`_
+* CPython 3.9 warning fixes.
   `#2253 <https://github.com/pybind/pybind11/pull/2253>`_
-* More C++20 support.
+* Improved C++20 support, now tested in CI.
   `#2489 <https://github.com/pybind/pybind11/pull/2489>`_
+  `#2599 <https://github.com/pybind/pybind11/pull/2599>`_
-* Debug Python interpreter support.
+* Improved but still incomplete debug Python interpreter support.
   `#2025 <https://github.com/pybind/pybind11/pull/2025>`_
 * NVCC (CUDA 11) now supported and tested in CI.
@@ -159,11 +915,20 @@ Smaller or developer focused features:
 * NVIDIA PGI compilers now supported and tested in CI.
   `#2475 <https://github.com/pybind/pybind11/pull/2475>`_
-* Extensive style checking in CI, with `pre-commit`_ support.
+* At least Intel 18 now explicitly required when compiling with Intel.
+  `#2577 <https://github.com/pybind/pybind11/pull/2577>`_
-.. _pre-commit: https://pre-commit.com
+* Extensive style checking in CI, with `pre-commit`_ support. Code
+  modernization, checked by clang-tidy.
+* Expanded docs, including new main page, new installing section, and CMake
+  helpers page, along with over a dozen new sections on existing pages.
+* In GitHub, new docs for contributing and new issue templates.
+.. _pre-commit: https://pre-commit.com
+.. _pybind11-mkdoc: https://github.com/pybind/pybind11-mkdoc
 v2.5.0 (Mar 31, 2020)
@@ -261,7 +1026,7 @@ v2.4.0 (Sep 19, 2019)
   `#1888 <https://github.com/pybind/pybind11/pull/1888>`_.
 * ``py::details::overload_cast_impl`` is available in C++11 mode, can be used
-  like ``overload_cast`` with an additional set of parantheses.
+  like ``overload_cast`` with an additional set of parentheses.
   `#1581 <https://github.com/pybind/pybind11/pull/1581>`_.
 * Fixed ``get_include()`` on Conda.
@@ -520,7 +1285,7 @@ v2.2.2 (February 7, 2018)
 v2.2.1 (September 14, 2017)
-* Added ``py::module::reload()`` member function for reloading a module.
+* Added ``py::module_::reload()`` member function for reloading a module.
   `#1040 <https://github.com/pybind/pybind11/pull/1040>`_.
 * Fixed a reference leak in the number converter.
@@ -583,6 +1348,7 @@ v2.2.0 (August 31, 2017)
       from cpp_module import CppBase1, CppBase2
       class PyDerived(CppBase1, CppBase2):
           def __init__(self):
               CppBase1.__init__(self)  # C++ bases must be initialized explicitly
@@ -795,7 +1561,7 @@ v2.2.0 (August 31, 2017)
 * Intel C++ compiler compatibility fixes.
   `#937 <https://github.com/pybind/pybind11/pull/937>`_.
-* Fixed implicit conversion of `py::enum_` to integer types on Python 2.7.
+* Fixed implicit conversion of ``py::enum_`` to integer types on Python 2.7.
   `#821 <https://github.com/pybind/pybind11/pull/821>`_.
 * Added ``py::hash`` to fetch the hash value of Python objects, and
diff --git a/wrap/pybind11/docs/classes.rst b/wrap/pybind11/docs/classes.rst
index f3610ef367..13fa8b5387 100644
--- a/wrap/pybind11/docs/classes.rst
+++ b/wrap/pybind11/docs/classes.rst
@@ -44,12 +44,12 @@ interactive Python session demonstrating this example is shown below:
     % python
     >>> import example
-    >>> p = example.Pet('Molly')
+    >>> p = example.Pet("Molly")
     >>> print(p)
     <example.Pet object at 0x10cd98060>
     >>> p.getName()
-    >>> p.setName('Charly')
+    >>> p.setName("Charly")
     >>> p.getName()
@@ -122,10 +122,10 @@ This makes it possible to write
 .. code-block:: pycon
-    >>> p = example.Pet('Molly')
+    >>> p = example.Pet("Molly")
     >>> p.name
-    >>> p.name = 'Charly'
+    >>> p.name = "Charly"
     >>> p.name
@@ -174,10 +174,10 @@ Native Python classes can pick up new attributes dynamically:
 .. code-block:: pycon
     >>> class Pet:
-    ...     name = 'Molly'
+    ...     name = "Molly"
     >>> p = Pet()
-    >>> p.name = 'Charly'  # overwrite existing
+    >>> p.name = "Charly"  # overwrite existing
     >>> p.age = 2  # dynamically add a new attribute
 By default, classes exported from C++ do not support this and the only writable
@@ -195,7 +195,7 @@ Trying to set any other attribute results in an error:
 .. code-block:: pycon
     >>> p = example.Pet()
-    >>> p.name = 'Charly'  # OK, attribute defined in C++
+    >>> p.name = "Charly"  # OK, attribute defined in C++
     >>> p.age = 2  # fail
     AttributeError: 'Pet' object has no attribute 'age'
@@ -213,7 +213,7 @@ Now everything works as expected:
 .. code-block:: pycon
     >>> p = example.Pet()
-    >>> p.name = 'Charly'  # OK, overwrite value in C++
+    >>> p.name = "Charly"  # OK, overwrite value in C++
     >>> p.age = 2  # OK, dynamically add a new attribute
     >>> p.__dict__  # just like a native Python class
     {'age': 2}
@@ -280,7 +280,7 @@ expose fields and methods of both types:
 .. code-block:: pycon
-    >>> p = example.Dog('Molly')
+    >>> p = example.Dog("Molly")
     >>> p.name
     >>> p.bark()
@@ -446,8 +446,7 @@ you can use ``py::detail::overload_cast_impl`` with an additional set of parenth
 Enumerations and internal types
-Let's now suppose that the example class contains an internal enumeration type,
+Let's now suppose that the example class contains internal types like enumerations, e.g.:
 .. code-block:: cpp
@@ -457,10 +456,15 @@ e.g.:
+        struct Attributes {
+            float age = 0;
+        };
         Pet(const std::string &name, Kind type) : name(name), type(type) { }
         std::string name;
         Kind type;
+        Attributes attr;
 The binding code for this example looks as follows:
@@ -471,22 +475,28 @@ The binding code for this example looks as follows:
     pet.def(py::init<const std::string &, Pet::Kind>())
         .def_readwrite("name", &Pet::name)
-        .def_readwrite("type", &Pet::type);
+        .def_readwrite("type", &Pet::type)
+        .def_readwrite("attr", &Pet::attr);
     py::enum_<Pet::Kind>(pet, "Kind")
         .value("Dog", Pet::Kind::Dog)
         .value("Cat", Pet::Kind::Cat)
-To ensure that the ``Kind`` type is created within the scope of ``Pet``, the
-``pet`` :class:`class_` instance must be supplied to the :class:`enum_`.
+    py::class_<Pet::Attributes> attributes(pet, "Attributes")
+        .def(py::init<>())
+        .def_readwrite("age", &Pet::Attributes::age);
+To ensure that the nested types ``Kind`` and ``Attributes`` are created within the scope of ``Pet``, the
+``pet`` :class:`class_` instance must be supplied to the :class:`enum_` and :class:`class_`
 constructor. The :func:`enum_::export_values` function exports the enum entries
 into the parent scope, which should be skipped for newer C++11-style strongly
 typed enums.
 .. code-block:: pycon
-    >>> p = Pet('Lucy', Pet.Cat)
+    >>> p = Pet("Lucy", Pet.Cat)
     >>> p.type
     >>> int(p.type)
@@ -508,7 +518,7 @@ The ``name`` property returns the name of the enum value as a unicode string.
     .. code-block:: pycon
-        >>> p = Pet( "Lucy", Pet.Cat )
+        >>> p = Pet("Lucy", Pet.Cat)
         >>> pet_type = p.type
         >>> pet_type
diff --git a/wrap/pybind11/docs/cmake/index.rst b/wrap/pybind11/docs/cmake/index.rst
new file mode 100644
index 0000000000..eaf66d70f3
--- /dev/null
+++ b/wrap/pybind11/docs/cmake/index.rst
@@ -0,0 +1,8 @@
+CMake helpers
+Pybind11 can be used with ``add_subdirectory(extern/pybind11)``, or from an
+install with ``find_package(pybind11 CONFIG)``. The interface provided in
+either case is functionally identical.
+.. cmake-module:: ../../tools/pybind11Config.cmake.in
diff --git a/wrap/pybind11/docs/compiling.rst b/wrap/pybind11/docs/compiling.rst
index cbf14a466b..75608bd576 100644
--- a/wrap/pybind11/docs/compiling.rst
+++ b/wrap/pybind11/docs/compiling.rst
@@ -31,20 +31,18 @@ An example of a ``setup.py`` using pybind11's helpers:
 .. code-block:: python
+    from glob import glob
     from setuptools import setup
     from pybind11.setup_helpers import Pybind11Extension
     ext_modules = [
-            ["src/main.cpp"],
+            sorted(glob("src/*.cpp")),  # Sort source files for reproducibility
-    setup(
-        ...,
-        ext_modules=ext_modules
-    )
+    setup(..., ext_modules=ext_modules)
 If you want to do an automatic search for the highest supported C++ standard,
 that is supported via a ``build_ext`` command override; it will only affect
@@ -52,21 +50,81 @@ that is supported via a ``build_ext`` command override; it will only affect
 .. code-block:: python
+    from glob import glob
     from setuptools import setup
     from pybind11.setup_helpers import Pybind11Extension, build_ext
     ext_modules = [
-            ["src/main.cpp"],
+            sorted(glob("src/*.cpp")),
-    setup(
-        ...,
-        cmdclass={"build_ext": build_ext},
-        ext_modules=ext_modules
-    )
+    setup(..., cmdclass={"build_ext": build_ext}, ext_modules=ext_modules)
+If you have single-file extension modules that are directly stored in the
+Python source tree (``foo.cpp`` in the same directory as where a ``foo.py``
+would be located), you can also generate ``Pybind11Extensions`` using
+``setup_helpers.intree_extensions``: ``intree_extensions(["path/to/foo.cpp",
+...])`` returns a list of ``Pybind11Extensions`` which can be passed to
+``ext_modules``, possibly after further customizing their attributes
+(``libraries``, ``include_dirs``, etc.).  By doing so, a ``foo.*.so`` extension
+module will be generated and made available upon installation.
+``intree_extension`` will automatically detect if you are using a ``src``-style
+layout (as long as no namespace packages are involved), but you can also
+explicitly pass ``package_dir`` to it (as in ``setuptools.setup``).
+Since pybind11 does not require NumPy when building, a light-weight replacement
+for NumPy's parallel compilation distutils tool is included. Use it like this:
+.. code-block:: python
+    from pybind11.setup_helpers import ParallelCompile
+    # Optional multithreaded build
+    ParallelCompile("NPY_NUM_BUILD_JOBS").install()
+    setup(...)
+The argument is the name of an environment variable to control the number of
+threads, such as ``NPY_NUM_BUILD_JOBS`` (as used by NumPy), though you can set
+something different if you want; ``CMAKE_BUILD_PARALLEL_LEVEL`` is another choice
+a user might expect. You can also pass ``default=N`` to set the default number
+of threads (0 will take the number of threads available) and ``max=N``, the
+maximum number of threads; if you have a large extension you may want set this
+to a memory dependent number.
+If you are developing rapidly and have a lot of C++ files, you may want to
+avoid rebuilding files that have not changed. For simple cases were you are
+using ``pip install -e .`` and do not have local headers, you can skip the
+rebuild if an object file is newer than its source (headers are not checked!)
+with the following:
+.. code-block:: python
+    from pybind11.setup_helpers import ParallelCompile, naive_recompile
+    ParallelCompile("NPY_NUM_BUILD_JOBS", needs_recompile=naive_recompile).install()
+If you have a more complex build, you can implement a smarter function and pass
+it to ``needs_recompile``, or you can use [Ccache]_ instead. ``CXX="cache g++"
+pip install -e .`` would be the way to use it with GCC, for example. Unlike the
+simple solution, this even works even when not compiling in editable mode, but
+it does require Ccache to be installed.
+Keep in mind that Pip will not even attempt to rebuild if it thinks it has
+already built a copy of your code, which it deduces from the version number.
+One way to avoid this is to use [setuptools_scm]_, which will generate a
+version number that includes the number of commits since your last tag and a
+hash for a dirty directory. Another way to force a rebuild is purge your cache
+or use Pip's ``--no-cache-dir`` option.
+.. [Ccache] https://ccache.dev
+.. [setuptools_scm] https://github.com/pypa/setuptools_scm
 .. _setup_helpers-pep518:
@@ -85,7 +143,7 @@ Your ``pyproject.toml`` file will likely look something like this:
 .. code-block:: toml
-    requires = ["setuptools", "wheel", "pybind11==2.6.0"]
+    requires = ["setuptools>=42", "wheel", "pybind11~=2.6.1"]
     build-backend = "setuptools.build_meta"
 .. note::
@@ -96,10 +154,12 @@ Your ``pyproject.toml`` file will likely look something like this:
     in Python) using something like `cibuildwheel`_, remember that ``setup.py``
     and ``pyproject.toml`` are not even contained in the wheel, so this high
     Pip requirement is only for source builds, and will not affect users of
-    your binary wheels.
+    your binary wheels. If you are building SDists and wheels, then
+    `pypa-build`_ is the recommended official tool.
 .. _PEP 517: https://www.python.org/dev/peps/pep-0517/
 .. _cibuildwheel: https://cibuildwheel.readthedocs.io
+.. _pypa-build: https://pypa-build.readthedocs.io/en/latest/
 .. _setup_helpers-setup_requires:
@@ -140,6 +200,23 @@ this, you will need to import from a local file in ``setup.py`` and ensure the
 helper file is part of your MANIFEST.
+Closely related, if you include pybind11 as a subproject, you can run the
+``setup_helpers.py`` inplace. If loaded correctly, this should even pick up
+the correct include for pybind11, though you can turn it off as shown above if
+you want to input it manually.
+Suggested usage if you have pybind11 as a submodule in ``extern/pybind11``:
+.. code-block:: python
+    DIR = os.path.abspath(os.path.dirname(__file__))
+    sys.path.append(os.path.join(DIR, "extern", "pybind11"))
+    from pybind11.setup_helpers import Pybind11Extension  # noqa: E402
+    del sys.path[-1]
 .. versionchanged:: 2.6
     Added ``setup_helpers`` file.
@@ -184,6 +261,8 @@ PyPI integration, can be found in the [cmake_example]_  repository.
 .. versionchanged:: 2.6
    CMake 3.4+ is required.
+Further information can be found at :doc:`cmake/index`.
@@ -224,8 +303,15 @@ As stated above, LTO is enabled by default. Some newer compilers also support
 different flavors of LTO such as `ThinLTO`_. Setting ``THIN_LTO`` will cause
 the function to prefer this flavor if available. The function falls back to
 regular LTO if ``-flto=thin`` is not available. If
-``CMAKE_INTERPROCEDURAL_OPTIMIZATION`` is set (either ON or OFF), then that
-will be respected instead of the built-in flag search.
+``CMAKE_INTERPROCEDURAL_OPTIMIZATION`` is set (either ``ON`` or ``OFF``), then
+that will be respected instead of the built-in flag search.
+.. note::
+   If you want to set the property form on targets or the
+   ``CMAKE_INTERPROCEDURAL_OPTIMIZATION_<CONFIG>`` versions of this, you should
+   still use ``set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF)`` (otherwise a
+   no-op) to disable pybind11's ipo flags.
 The ``OPT_SIZE`` flag enables size-based optimization equivalent to the
 standard ``/Os`` or ``-Os`` compiler flags and the ``MinSizeRel`` build type,
@@ -252,10 +338,9 @@ standard explicitly with
 .. code-block:: cmake
-    set(CMAKE_CXX_STANDARD 14)  # or 11, 14, 17, 20
+    set(CMAKE_CXX_STANDARD 14 CACHE STRING "C++ version selection")  # or 11, 14, 17, 20
     set(CMAKE_CXX_STANDARD_REQUIRED ON)  # optional, ensure standard is supported
-    set(CMAKE_CXX_EXTENSIONS OFF)  # optional, keep compiler extensionsn off
+    set(CMAKE_CXX_EXTENSIONS OFF)  # optional, keep compiler extensions off
 The variables can also be set when calling CMake from the command line using
 the ``-D<variable>=<value>`` flag. You can also manually set ``CXX_STANDARD``
@@ -325,13 +410,14 @@ can refer to the same [cmake_example]_ repository for a full sample project
 FindPython mode
-CMake 3.12+ (3.15+ recommended) added a new module called FindPython that had a
-highly improved search algorithm and modern targets and tools. If you use
-FindPython, pybind11 will detect this and use the existing targets instead:
+CMake 3.12+ (3.15+ recommended, 3.18.2+ ideal) added a new module called
+FindPython that had a highly improved search algorithm and modern targets
+and tools. If you use FindPython, pybind11 will detect this and use the
+existing targets instead:
 .. code-block:: cmake
-    cmake_minumum_required(VERSION 3.15...3.18)
+    cmake_minimum_required(VERSION 3.15...3.19)
     project(example LANGUAGES CXX)
     find_package(Python COMPONENTS Interpreter Development REQUIRED)
@@ -357,6 +443,14 @@ setting ``Python_ROOT_DIR`` may be the most common one (though with
 virtualenv/venv support, and Conda support, this tends to find the correct
 Python version more often than the old system did).
+.. warning::
+    When the Python libraries (i.e. ``libpythonXX.a`` and ``libpythonXX.so``
+    on Unix) are not available, as is the case on a manylinux image, the
+    ``Development`` component will not be resolved by ``FindPython``. When not
+    using the embedding functionality, CMake 3.18+ allows you to specify
+    ``Development.Module`` instead of ``Development`` to resolve this issue.
 .. versionadded:: 2.6
 Advanced: interface library targets
@@ -428,7 +522,7 @@ Instead of setting properties, you can set ``CMAKE_*`` variables to initialize t
     compiler flags are provided to ensure high quality code generation. In
     contrast to the ``pybind11_add_module()`` command, the CMake interface
     provides a *composable* set of targets to ensure that you retain flexibility.
-    It can be expecially important to provide or set these properties; the
+    It can be especially important to provide or set these properties; the
     :ref:`FAQ <faq:symhidden>` contains an explanation on why these are needed.
 .. versionadded:: 2.6
@@ -481,7 +575,7 @@ On Linux, you can compile an example such as the one given in
 .. code-block:: bash
-    $ c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+    $ c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) example.cpp -o example$(python3-config --extension-suffix)
 The flags given here assume that you're using Python 3. For Python 2, just
 change the executable appropriately (to ``python`` or ``python2``).
@@ -493,7 +587,7 @@ using ``pip`` or ``conda``. If it hasn't, you can also manually specify
 ``python3-config --includes``.
 Note that Python 2.7 modules don't use a special suffix, so you should simply
-use ``example.so`` instead of ``example`python3-config --extension-suffix```.
+use ``example.so`` instead of ``example$(python3-config --extension-suffix)``.
 Besides, the ``--extension-suffix`` option may or may not be available, depending
 on the distribution; in the latter case, the module extension can be manually
 set to ``.so``.
@@ -504,7 +598,7 @@ building the module:
 .. code-block:: bash
-    $ c++ -O3 -Wall -shared -std=c++11 -undefined dynamic_lookup `python3 -m pybind11 --includes` example.cpp -o example`python3-config --extension-suffix`
+    $ c++ -O3 -Wall -shared -std=c++11 -undefined dynamic_lookup $(python3 -m pybind11 --includes) example.cpp -o example$(python3-config --extension-suffix)
 In general, it is advisable to include several additional build parameters
 that can considerably reduce the size of the created binary. Refer to section
@@ -523,23 +617,11 @@ build system that works on all platforms including Windows.
     contains one (which will lead to a segfault).
-Building with vcpkg
+Building with Bazel
-You can download and install pybind11 using the Microsoft `vcpkg
-<https://github.com/Microsoft/vcpkg/>`_ dependency manager:
-.. code-block:: bash
-    git clone https://github.com/Microsoft/vcpkg.git
-    cd vcpkg
-    ./bootstrap-vcpkg.sh
-    ./vcpkg integrate install
-    vcpkg install pybind11
-The pybind11 port in vcpkg is kept up to date by Microsoft team members and
-community contributors. If the version is out of date, please `create an issue
-or pull request <https://github.com/Microsoft/vcpkg/>`_ on the vcpkg
+You can build with the Bazel build system using the `pybind11_bazel
+<https://github.com/pybind/pybind11_bazel>`_ repository.
 Generating binding code automatically
diff --git a/wrap/pybind11/docs/conf.py b/wrap/pybind11/docs/conf.py
index 0946f30e2e..092e274e09 100644
--- a/wrap/pybind11/docs/conf.py
+++ b/wrap/pybind11/docs/conf.py
@@ -13,57 +13,68 @@
 # All configuration values have a default; values that are commented out
 # serve to show the default.
-import sys
 import os
-import shlex
+import re
 import subprocess
+import sys
+from pathlib import Path
+DIR = Path(__file__).parent.resolve()
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
+# sys.path.insert(0, os.path.abspath('.'))
 # -- General configuration ------------------------------------------------
 # If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
+# needs_sphinx = '1.0'
 # Add any Sphinx extension module names here, as strings. They can be
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
-extensions = ['breathe']
+extensions = [
+    "breathe",
+    "sphinxcontrib.rsvgconverter",
+    "sphinxcontrib.moderncmakedomain",
-breathe_projects = {'pybind11': '.build/doxygenxml/'}
-breathe_default_project = 'pybind11'
-breathe_domain_by_extension = {'h': 'cpp'}
+breathe_projects = {"pybind11": ".build/doxygenxml/"}
+breathe_default_project = "pybind11"
+breathe_domain_by_extension = {"h": "cpp"}
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['.templates']
+templates_path = [".templates"]
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 # source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
+source_suffix = ".rst"
 # The encoding of source files.
-#source_encoding = 'utf-8-sig'
+# source_encoding = 'utf-8-sig'
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 # General information about the project.
-project = 'pybind11'
-copyright = '2017, Wenzel Jakob'
-author = 'Wenzel Jakob'
+project = "pybind11"
+copyright = "2017, Wenzel Jakob"
+author = "Wenzel Jakob"
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
-# The short X.Y version.
-version = '2.5'
+# Read the listed version
+with open("../pybind11/_version.py") as f:
+    code = compile(f.read(), "../pybind11/_version.py", "exec")
+loc = {}
+exec(code, loc)
 # The full version, including alpha/beta/rc tags.
-release = '2.5.dev1'
+version = loc["__version__"]
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -74,37 +85,37 @@
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
-#today = ''
+# today = ''
 # Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# today_fmt = '%B %d, %Y'
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['.build', 'release.rst']
+exclude_patterns = [".build", "release.rst"]
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
-default_role = 'any'
+default_role = "any"
 # If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+# add_function_parentheses = True
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
-#add_module_names = True
+# add_module_names = True
 # If true, sectionauthor and moduleauthor directives will be shown in the
 # output. They are ignored by default.
-#show_authors = False
+# show_authors = False
 # The name of the Pygments (syntax highlighting) style to use.
-#pygments_style = 'monokai'
+# pygments_style = 'monokai'
 # A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
 # If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
@@ -115,141 +126,150 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+on_rtd = os.environ.get("READTHEDOCS", None) == "True"
 if not on_rtd:  # only import and set the theme if we're building docs locally
     import sphinx_rtd_theme
-    html_theme = 'sphinx_rtd_theme'
+    html_theme = "sphinx_rtd_theme"
     html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-    html_context = {
-        'css_files': [
-            '_static/theme_overrides.css'
-        ]
-    }
+    html_context = {"css_files": ["_static/theme_overrides.css"]}
     html_context = {
-        'css_files': [
-            '//media.readthedocs.org/css/sphinx_rtd_theme.css',
-            '//media.readthedocs.org/css/readthedocs-doc-embed.css',
-            '_static/theme_overrides.css'
+        "css_files": [
+            "//media.readthedocs.org/css/sphinx_rtd_theme.css",
+            "//media.readthedocs.org/css/readthedocs-doc-embed.css",
+            "_static/theme_overrides.css",
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
+# html_theme_options = {}
 # Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
 # The name for this set of Sphinx documents.  If None, it defaults to
-# "<project> v<release> documentation".
-#html_title = None
+# "<project> v<version> documentation".
+# html_title = None
 # A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
-#html_logo = None
+# html_logo = None
 # The name of an image file (within the static path) to use as favicon of the
 # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-#html_favicon = None
+# html_favicon = None
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
 # directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
 # If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
 # If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
 # If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
 # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
 # If true, an OpenSearch description file will be output, and all pages will
 # contain a <link> tag referring to it.  The value of this option must be the
 # base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
 # This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
 # Language to be used for generating the HTML full-text search index.
 # Sphinx supports the following languages:
 #   'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja'
 #   'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr'
-#html_search_language = 'en'
+# html_search_language = 'en'
 # A dictionary with options for the search language support, empty by default.
 # Now only 'ja' uses this config value
-#html_search_options = {'type': 'default'}
+# html_search_options = {'type': 'default'}
 # The name of a javascript file (relative to the configuration directory) that
 # implements a search results scorer. If empty, the default will be used.
-#html_search_scorer = 'scorer.js'
+# html_search_scorer = 'scorer.js'
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'pybind11doc'
+htmlhelp_basename = "pybind11doc"
 # -- Options for LaTeX output ---------------------------------------------
-latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
+latex_engine = "pdflatex"
-# Additional stuff for the LaTeX preamble.
-'preamble': r'\DeclareUnicodeCharacter{00A0}{}',
-# Latex figure (float) alignment
-#'figure_align': 'htbp',
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    # 'papersize': 'letterpaper',
+    #
+    # The font size ('10pt', '11pt' or '12pt').
+    # 'pointsize': '10pt',
+    #
+    # Additional stuff for the LaTeX preamble.
+    # remove blank pages (between the title page and the TOC, etc.)
+    "classoptions": ",openany,oneside",
+    "preamble": r"""
+    # Latex figure (float) alignment
+    # 'figure_align': 'htbp',
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-  (master_doc, 'pybind11.tex', 'pybind11 Documentation',
-   'Wenzel Jakob', 'manual'),
+    (master_doc, "pybind11.tex", "pybind11 Documentation", "Wenzel Jakob", "manual"),
 # The name of an image file (relative to this directory) to place at the top of
@@ -258,32 +278,29 @@
 # For "manual" documents, if this is true, then toplevel headings are parts,
 # not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
 # If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
 # If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
 # Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
 # If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
 # -- Options for manual page output ---------------------------------------
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'pybind11', 'pybind11 Documentation',
-     [author], 1)
+man_pages = [(master_doc, "pybind11", "pybind11 Documentation", [author], 1)]
 # If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
 # -- Options for Texinfo output -------------------------------------------
@@ -292,41 +309,73 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-  (master_doc, 'pybind11', 'pybind11 Documentation',
-   author, 'pybind11', 'One line description of project.',
-   'Miscellaneous'),
+    (
+        master_doc,
+        "pybind11",
+        "pybind11 Documentation",
+        author,
+        "pybind11",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 # Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
 # If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
 # How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
 # If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
-primary_domain = 'cpp'
-highlight_language = 'cpp'
+primary_domain = "cpp"
+highlight_language = "cpp"
 def generate_doxygen_xml(app):
-    build_dir = os.path.join(app.confdir, '.build')
+    build_dir = os.path.join(app.confdir, ".build")
     if not os.path.exists(build_dir):
-        subprocess.call(['doxygen', '--version'])
-        retcode = subprocess.call(['doxygen'], cwd=app.confdir)
+        subprocess.call(["doxygen", "--version"])
+        retcode = subprocess.call(["doxygen"], cwd=app.confdir)
         if retcode < 0:
             sys.stderr.write("doxygen error code: {}\n".format(-retcode))
     except OSError as e:
         sys.stderr.write("doxygen execution failed: {}\n".format(e))
+def prepare(app):
+    with open(DIR.parent / "README.rst") as f:
+        contents = f.read()
+    if app.builder.name == "latex":
+        # Remove badges and stuff from start
+        contents = contents[contents.find(r".. start") :]
+        # Filter out section titles for index.rst for LaTeX
+        contents = re.sub(r"^(.*)\n[-~]{3,}$", r"**\1**", contents, flags=re.MULTILINE)
+    with open(DIR / "readme.rst", "w") as f:
+        f.write(contents)
+def clean_up(app, exception):
+    (DIR / "readme.rst").unlink()
 def setup(app):
-    """Add hook for building doxygen xml when needed"""
+    # Add hook for building doxygen xml when needed
     app.connect("builder-inited", generate_doxygen_xml)
+    # Copy the readme in
+    app.connect("builder-inited", prepare)
+    # Clean up the generated readme
+    app.connect("build-finished", clean_up)
diff --git a/wrap/pybind11/docs/faq.rst b/wrap/pybind11/docs/faq.rst
index 5f7866fa76..e2f477b1f5 100644
--- a/wrap/pybind11/docs/faq.rst
+++ b/wrap/pybind11/docs/faq.rst
@@ -5,7 +5,7 @@ Frequently asked questions
 1. Make sure that the name specified in PYBIND11_MODULE is identical to the
-filename of the extension library (without suffixes such as .so)
+filename of the extension library (without suffixes such as ``.so``).
 2. If the above did not fix the issue, you are likely using an incompatible
 version of Python (for instance, the extension library was compiled against
@@ -27,18 +27,6 @@ The Python interpreter immediately crashes when importing my module
 See the first answer.
-CMake doesn't detect the right Python version
-The CMake-based build system will try to automatically detect the installed
-version of Python and link against that. When this fails, or when there are
-multiple versions of Python and it finds the wrong one, delete
-``CMakeCache.txt`` and then invoke CMake as follows:
-.. code-block:: bash
-    cmake -DPYTHON_EXECUTABLE:FILEPATH=<path-to-python-executable> .
 .. _faq_reference_arguments:
 Limitations involving reference arguments
@@ -66,7 +54,7 @@ provided by the caller -- in fact, it does nothing at all.
 .. code-block:: python
     def increment(i):
-        i += 1 # nope..
+        i += 1  # nope..
 pybind11 is also affected by such language-level conventions, which means that
 binding ``increment`` or ``increment_ptr`` will also create Python functions
@@ -100,8 +88,8 @@ following example:
 .. code-block:: cpp
-    void init_ex1(py::module &);
-    void init_ex2(py::module &);
+    void init_ex1(py::module_ &);
+    void init_ex2(py::module_ &);
     /* ... */
     PYBIND11_MODULE(example, m) {
@@ -114,7 +102,7 @@ following example:
 .. code-block:: cpp
-    void init_ex1(py::module &m) {
+    void init_ex1(py::module_ &m) {
         m.def("add", [](int a, int b) { return a + b; });
@@ -122,7 +110,7 @@ following example:
 .. code-block:: cpp
-    void init_ex2(py::module &m) {
+    void init_ex2(py::module_ &m) {
         m.def("sub", [](int a, int b) { return a - b; });
@@ -181,8 +169,8 @@ can be changed, but even if it isn't it is not always enough to guarantee
 complete independence of the symbols involved when not using
-Additionally, ``-fvisiblity=hidden`` can deliver considerably binary size
-savings.  (See the following section for more details).
+Additionally, ``-fvisibility=hidden`` can deliver considerably binary size
+savings. (See the following section for more details.)
 .. _`faq:symhidden`:
@@ -192,7 +180,7 @@ How can I create smaller binaries?
 To do its job, pybind11 extensively relies on a programming technique known as
 *template metaprogramming*, which is a way of performing computation at compile
-time using type information. Template metaprogamming usually instantiates code
+time using type information. Template metaprogramming usually instantiates code
 involving significant numbers of deeply nested types that are either completely
 removed or reduced to just a few instructions during the compiler's optimization
 phase. However, due to the nested nature of these types, the resulting symbol
@@ -275,17 +263,34 @@ been received, you must either explicitly interrupt execution by throwing
+CMake doesn't detect the right Python version
+The CMake-based build system will try to automatically detect the installed
+version of Python and link against that. When this fails, or when there are
+multiple versions of Python and it finds the wrong one, delete
+``CMakeCache.txt`` and then add ``-DPYTHON_EXECUTABLE=$(which python)`` to your
+CMake configure line. (Replace ``$(which python)`` with a path to python if
+your prefer.)
+You can alternatively try ``-DPYBIND11_FINDPYTHON=ON``, which will activate the
+new CMake FindPython support instead of pybind11's custom search. Requires
+CMake 3.12+, and 3.15+ or 3.18.2+ are even better. You can set this in your
+``CMakeLists.txt`` before adding or finding pybind11, as well.
 Inconsistent detection of Python version in CMake and pybind11
-The functions ``find_package(PythonInterp)`` and ``find_package(PythonLibs)`` provided by CMake
-for Python version detection are not used by pybind11 due to unreliability and limitations that make
-them unsuitable for pybind11's needs. Instead pybind provides its own, more reliable Python detection
-CMake code. Conflicts can arise, however, when using pybind11 in a project that *also* uses the CMake
-Python detection in a system with several Python versions installed.
+The functions ``find_package(PythonInterp)`` and ``find_package(PythonLibs)``
+provided by CMake for Python version detection are modified by pybind11 due to
+unreliability and limitations that make them unsuitable for pybind11's needs.
+Instead pybind11 provides its own, more reliable Python detection CMake code.
+Conflicts can arise, however, when using pybind11 in a project that *also* uses
+the CMake Python detection in a system with several Python versions installed.
-This difference may cause inconsistencies and errors if *both* mechanisms are used in the same project. Consider the following
-CMake code executed in a system with Python 2.7 and 3.x installed:
+This difference may cause inconsistencies and errors if *both* mechanisms are
+used in the same project. Consider the following CMake code executed in a
+system with Python 2.7 and 3.x installed:
 .. code-block:: cmake
@@ -303,10 +308,24 @@ In contrast this code:
-will detect Python 3.x for pybind11 and may crash on ``find_package(PythonLibs)`` afterwards.
-It is advised to avoid using ``find_package(PythonInterp)`` and ``find_package(PythonLibs)`` from CMake and rely
-on pybind11 in detecting Python version. If this is not possible CMake machinery should be called *before* including pybind11.
+will detect Python 3.x for pybind11 and may crash on
+``find_package(PythonLibs)`` afterwards.
+There are three possible solutions:
+1. Avoid using ``find_package(PythonInterp)`` and ``find_package(PythonLibs)``
+   from CMake and rely on pybind11 in detecting Python version. If this is not
+   possible, the CMake machinery should be called *before* including pybind11.
+2. Set ``PYBIND11_FINDPYTHON`` to ``True`` or use ``find_package(Python
+   COMPONENTS Interpreter Development)`` on modern CMake (3.12+, 3.15+ better,
+   3.18.2+ best). Pybind11 in these cases uses the new CMake FindPython instead
+   of the old, deprecated search tools, and these modules are much better at
+   finding the correct Python.
+3. Set ``PYBIND11_NOPYTHON`` to ``TRUE``. Pybind11 will not search for Python.
+   However, you will have to use the target-based system, and do more setup
+   yourself, because it does not know about or include things that depend on
+   Python, like ``pybind11_add_module``. This might be ideal for integrating
+   into an existing system, like scikit-build's Python helpers.
 How to cite this project?
diff --git a/wrap/pybind11/docs/index.rst b/wrap/pybind11/docs/index.rst
index d236611b72..4e2e8ca3a0 100644
--- a/wrap/pybind11/docs/index.rst
+++ b/wrap/pybind11/docs/index.rst
@@ -1,18 +1,17 @@
-.. only: not latex
+.. only:: latex
-    .. image:: pybind11-logo.png
+   Intro
+   =====
-pybind11 --- Seamless operability between C++11 and Python
+.. include:: readme.rst
-.. only: not latex
+.. only:: not latex
 .. toctree::
    :maxdepth: 1
-   intro
@@ -20,6 +19,7 @@ pybind11 --- Seamless operability between C++11 and Python
    :caption: The Basics
    :maxdepth: 2
+   installing
@@ -45,3 +45,4 @@ pybind11 --- Seamless operability between C++11 and Python
+   cmake/index
diff --git a/wrap/pybind11/docs/intro.rst b/wrap/pybind11/docs/intro.rst
deleted file mode 100644
index 10e1799a19..0000000000
--- a/wrap/pybind11/docs/intro.rst
+++ /dev/null
@@ -1,93 +0,0 @@
-.. image:: pybind11-logo.png
-About this project
-**pybind11** is a lightweight header-only library that exposes C++ types in Python
-and vice versa, mainly to create Python bindings of existing C++ code. Its
-goals and syntax are similar to the excellent `Boost.Python`_ library by David
-Abrahams: to minimize boilerplate code in traditional extension modules by
-inferring type information using compile-time introspection.
-.. _Boost.Python: http://www.boost.org/doc/libs/release/libs/python/doc/index.html
-The main issue with Boost.Python—and the reason for creating such a similar
-project—is Boost. Boost is an enormously large and complex suite of utility
-libraries that works with almost every C++ compiler in existence. This
-compatibility has its cost: arcane template tricks and workarounds are
-necessary to support the oldest and buggiest of compiler specimens. Now that
-C++11-compatible compilers are widely available, this heavy machinery has
-become an excessively large and unnecessary dependency.
-Think of this library as a tiny self-contained version of Boost.Python with
-everything stripped away that isn't relevant for binding generation. Without
-comments, the core header files only require ~4K lines of code and depend on
-Python (2.7 or 3.x, or PyPy2.7 >= 5.7) and the C++ standard library. This
-compact implementation was possible thanks to some of the new C++11 language
-features (specifically: tuples, lambda functions and variadic templates). Since
-its creation, this library has grown beyond Boost.Python in many ways, leading
-to dramatically simpler binding code in many common situations.
-Core features
-The following core C++ features can be mapped to Python
-- Functions accepting and returning custom data structures per value, reference, or pointer
-- Instance methods and static methods
-- Overloaded functions
-- Instance attributes and static attributes
-- Arbitrary exception types
-- Enumerations
-- Callbacks
-- Iterators and ranges
-- Custom operators
-- Single and multiple inheritance
-- STL data structures
-- Smart pointers with reference counting like ``std::shared_ptr``
-- Internal references with correct reference counting
-- C++ classes with virtual (and pure virtual) methods can be extended in Python
-In addition to the core functionality, pybind11 provides some extra goodies:
-- Python 2.7, 3.x, and PyPy (PyPy2.7 >= 5.7) are supported with an
-  implementation-agnostic interface.
-- It is possible to bind C++11 lambda functions with captured variables. The
-  lambda capture data is stored inside the resulting Python function object.
-- pybind11 uses C++11 move constructors and move assignment operators whenever
-  possible to efficiently transfer custom data types.
-- It's easy to expose the internal storage of custom data types through
-  Pythons' buffer protocols. This is handy e.g. for fast conversion between
-  C++ matrix classes like Eigen and NumPy without expensive copy operations.
-- pybind11 can automatically vectorize functions so that they are transparently
-  applied to all entries of one or more NumPy array arguments.
-- Python's slice-based access and assignment operations can be supported with
-  just a few lines of code.
-- Everything is contained in just a few header files; there is no need to link
-  against any additional libraries.
-- Binaries are generally smaller by a factor of at least 2 compared to
-  equivalent bindings generated by Boost.Python. A recent pybind11 conversion
-  of `PyRosetta`_, an enormous Boost.Python binding project, reported a binary
-  size reduction of **5.4x** and compile time reduction by **5.8x**.
-- Function signatures are precomputed at compile time (using ``constexpr``),
-  leading to smaller binaries.
-- With little extra effort, C++ types can be pickled and unpickled similar to
-  regular Python objects.
-.. _PyRosetta: http://graylab.jhu.edu/RosettaCon2016/PyRosetta-4.pdf
-Supported compilers
-1. Clang/LLVM (any non-ancient version with C++11 support)
-2. GCC 4.8 or newer
-3. Microsoft Visual Studio 2015 or newer
-4. Intel C++ compiler v17 or newer (v16 with pybind11 v2.0 and v15 with pybind11 v2.0 and a `workaround <https://github.com/pybind/pybind11/issues/276>`_ )
diff --git a/wrap/pybind11/docs/limitations.rst b/wrap/pybind11/docs/limitations.rst
index 59474f82fd..def5ad659c 100644
--- a/wrap/pybind11/docs/limitations.rst
+++ b/wrap/pybind11/docs/limitations.rst
@@ -1,6 +1,9 @@
+Design choices
 pybind11 strives to be a general solution to binding generation, but it also has
 certain limitations:
@@ -11,9 +14,59 @@ certain limitations:
 - The NumPy interface ``pybind11::array`` greatly simplifies accessing
   numerical data from C++ (and vice versa), but it's not a full-blown array
-  class like ``Eigen::Array`` or ``boost.multi_array``.
+  class like ``Eigen::Array`` or ``boost.multi_array``. ``Eigen`` objects are
+  directly supported, however, with ``pybind11/eigen.h``.
+Large but useful features could be implemented in pybind11 but would lead to a
+significant increase in complexity. Pybind11 strives to be simple and compact.
+Users who require large new features are encouraged to write an extension to
+pybind11; see `pybind11_json <https://github.com/pybind/pybind11_json>`_ for an
+Known bugs
+These are issues that hopefully will one day be fixed, but currently are
+unsolved. If you know how to help with one of these issues, contributions
+are welcome!
+- Intel 20.2 is currently having an issue with the test suite.
+  `#2573 <https://github.com/pybind/pybind11/pull/2573>`_
+- Debug mode Python does not support 1-5 tests in the test suite currently.
+  `#2422 <https://github.com/pybind/pybind11/pull/2422>`_
+- PyPy3 7.3.1 and 7.3.2 have issues with several tests on 32-bit Windows.
+Known limitations
+These are issues that are probably solvable, but have not been fixed yet. A
+clean, well written patch would likely be accepted to solve them.
+- Type casters are not kept alive recursively.
+  `#2527 <https://github.com/pybind/pybind11/issues/2527>`_
+  One consequence is that containers of ``char *`` are currently not supported.
+  `#2245 <https://github.com/pybind/pybind11/issues/2245>`_
+- The ``cpptest`` does not run on Windows with Python 3.8 or newer, due to DLL
+  loader changes. User code that is correctly installed should not be affected.
+  `#2560 <https://github.com/pybind/pybind11/issue/2560>`_
+Python 3.9.0 warning
+Combining older versions of pybind11 (< 2.6.0) with Python on exactly 3.9.0
+will trigger undefined behavior that typically manifests as crashes during
+interpreter shutdown (but could also destroy your data. **You have been
-These features could be implemented but would lead to a significant increase in
-complexity. I've decided to draw the line here to keep this project simple and
-compact. Users who absolutely require these features are encouraged to fork
+This issue was `fixed in Python <https://github.com/python/cpython/pull/22670>`_.
+As a mitigation for this bug, pybind11 2.6.0 or newer includes a workaround
+specifically when Python 3.9.0 is detected at runtime, leaking about 50 bytes
+of memory when a callback function is garbage collected.  For reference, the
+pybind11 test suite has about 2,000 such callbacks, but only 49 are garbage
+collected before the end-of-process. Wheels (even if built with Python 3.9.0)
+will correctly avoid the leak when run in Python 3.9.1, and this does not
+affect other 3.X versions.
diff --git a/wrap/pybind11/docs/reference.rst b/wrap/pybind11/docs/reference.rst
index e3a61afb6b..e64a03519d 100644
--- a/wrap/pybind11/docs/reference.rst
+++ b/wrap/pybind11/docs/reference.rst
@@ -52,6 +52,20 @@ Convenience classes for specific Python types
 .. doxygengroup:: pytypes
+Convenience functions converting to Python types
+.. doxygenfunction:: make_tuple(Args&&...)
+.. doxygenfunction:: make_iterator(Iterator, Sentinel, Extra &&...)
+.. doxygenfunction:: make_iterator(Type &, Extra&&...)
+.. doxygenfunction:: make_key_iterator(Iterator, Sentinel, Extra &&...)
+.. doxygenfunction:: make_key_iterator(Type &, Extra&&...)
+.. doxygenfunction:: make_value_iterator(Iterator, Sentinel, Extra &&...)
+.. doxygenfunction:: make_value_iterator(Type &, Extra&&...)
 .. _extras:
 Passing extra arguments to ``def`` or ``class_``
@@ -110,7 +124,6 @@ Exceptions
 .. doxygenclass:: builtin_exception
diff --git a/wrap/pybind11/docs/release.rst b/wrap/pybind11/docs/release.rst
index 9846f971a6..e761cdf7a6 100644
--- a/wrap/pybind11/docs/release.rst
+++ b/wrap/pybind11/docs/release.rst
@@ -1,21 +1,97 @@
+On version numbers
+The two version numbers (C++ and Python) must match when combined (checked when
+you build the PyPI package), and must be a valid `PEP 440
+<https://www.python.org/dev/peps/pep-0440>`_ version when combined.
+For example:
+.. code-block:: C++
+    #define PYBIND11_VERSION_PATCH Z.dev1
+For beta, ``PYBIND11_VERSION_PATCH`` should be ``Z.b1``. RC's can be ``Z.rc1``.
+Always include the dot (even though PEP 440 allows it to be dropped). For a
+final release, this must be a simple integer. There is also a HEX version of
+the version just below.
 To release a new version of pybind11:
+If you don't have nox, you should either use ``pipx run nox`` instead, or use
+``pipx install nox`` or ``brew install nox`` (Unix).
-- Update the version number and push to pypi
-    - Update ``pybind11/_version.py`` (set release version, remove 'dev').
-    - Update ``PYBIND11_VERSION_MAJOR`` etc. in ``include/pybind11/detail/common.h``.
-    - Ensure that all the information in ``setup.py`` is up-to-date.
-    - Update version in ``docs/conf.py``.
-    - Tag release date in ``docs/changelog.rst``.
-    - ``git add`` and ``git commit``.
-    - if new minor version: ``git checkout -b vX.Y``, ``git push -u origin vX.Y``
+- Update the version number
+    - Update ``PYBIND11_VERSION_MAJOR`` etc. in
+      ``include/pybind11/detail/common.h``. PATCH should be a simple integer.
+    - Update the version HEX just below, as well.
+    - Update ``pybind11/_version.py`` (match above)
+    - Run ``nox -s tests_packaging`` to ensure this was done correctly.
+    - Ensure that all the information in ``setup.cfg`` is up-to-date, like
+      supported Python versions.
+    - Add release date in ``docs/changelog.rst``.
+          - Check to make sure
+            `needs-changelog <https://github.com/pybind/pybind11/pulls?q=is%3Apr+is%3Aclosed+label%3A%22needs+changelog%22>`_
+            issues are entered in the changelog (clear the label when done).
+    - ``git add`` and ``git commit``, ``git push``. **Ensure CI passes**. (If it
+      fails due to a known flake issue, either ignore or restart CI.)
+- Add a release branch if this is a new minor version, or update the existing release branch if it is a patch version
+    - New branch: ``git checkout -b vX.Y``, ``git push -u origin vX.Y``
+    - Update branch: ``git checkout vX.Y``, ``git merge <release branch>``, ``git push``
+- Update tags (optional; if you skip this, the GitHub release makes a
+    non-annotated tag for you)
     - ``git tag -a vX.Y.Z -m 'vX.Y.Z release'``.
-    - ``git push``
     - ``git push --tags``.
-    - ``python setup.py sdist upload``.
-    - ``python setup.py bdist_wheel upload``.
+- Update stable
+    - ``git checkout stable``
+    - ``git merge master``
+    - ``git push``
+- Make a GitHub release (this shows up in the UI, sends new release
+  notifications to users watching releases, and also uploads PyPI packages).
+  (Note: if you do not use an existing tag, this creates a new lightweight tag
+  for you, so you could skip the above step.)
+    - GUI method: Under `releases <https://github.com/pybind/pybind11/releases>`_
+      click "Draft a new release" on the far right, fill in the tag name
+      (if you didn't tag above, it will be made here), fill in a release name
+      like "Version X.Y.Z", and copy-and-paste the markdown-formatted (!) changelog
+      into the description (usually ``cat docs/changelog.rst | pandoc -f rst -t gfm``).
+      Check "pre-release" if this is a beta/RC.
+    - CLI method: with ``gh`` installed, run ``gh release create vX.Y.Z -t "Version X.Y.Z"``
+      If this is a pre-release, add ``-p``.
 - Get back to work
-    - Update ``_version.py`` (add 'dev' and increment minor).
-    - Update version in ``docs/conf.py``
-    - Update version macros in ``include/pybind11/common.h``
-    - ``git add`` and ``git commit``.
-      ``git push``
+    - Make sure you are on master, not somewhere else: ``git checkout master``
+    - Update version macros in ``include/pybind11/detail/common.h`` (set PATCH to
+      ``0.dev1`` and increment MINOR).
+    - Update ``_version.py`` to match
+    - Run ``nox -s tests_packaging`` to ensure this was done correctly.
+    - Add a spot for in-development updates in ``docs/changelog.rst``.
+    - ``git add``, ``git commit``, ``git push``
+If a version branch is updated, remember to set PATCH to ``1.dev1``.
+If you'd like to bump homebrew, run:
+.. code-block:: console
+    brew bump-formula-pr --url https://github.com/pybind/pybind11/archive/vX.Y.Z.tar.gz
+Conda-forge should automatically make a PR in a few hours, and automatically
+merge it if there are no issues.
+Manual packaging
+If you need to manually upload releases, you can download the releases from the job artifacts and upload them with twine. You can also make the files locally (not recommended in general, as your local directory is more likely to be "dirty" and SDists love picking up random unrelated/hidden files); this is the procedure:
+.. code-block:: bash
+    nox -s build
+    twine upload dist/*
+This makes SDists and wheels, and the final line uploads them.
diff --git a/wrap/pybind11/docs/requirements.txt b/wrap/pybind11/docs/requirements.txt
index f4c3dc2e0b..b2801b1f0d 100644
--- a/wrap/pybind11/docs/requirements.txt
+++ b/wrap/pybind11/docs/requirements.txt
@@ -1,5 +1,5 @@
diff --git a/wrap/pybind11/docs/upgrade.rst b/wrap/pybind11/docs/upgrade.rst
index 62e2312e94..d91d51e6f2 100644
--- a/wrap/pybind11/docs/upgrade.rst
+++ b/wrap/pybind11/docs/upgrade.rst
@@ -8,31 +8,90 @@ to a new version. But it goes into more detail. This includes things like
 deprecated APIs and their replacements, build system changes, general code
 modernization and other useful information.
+.. _upgrade-guide-2.9:
+* Any usage of the recently added ``py::make_simple_namespace`` should be
+  converted to using ``py::module_::import("types").attr("SimpleNamespace")``
+  instead.
+* The use of ``_`` in custom type casters can now be replaced with the more
+  readable ``const_name`` instead. The old ``_`` shortcut has been retained
+  unless it is being used as a macro (like for gettext).
+.. _upgrade-guide-2.7:
+*Before* v2.7, ``py::str`` can hold ``PyUnicodeObject`` or ``PyBytesObject``,
+and ``py::isinstance<str>()`` is ``true`` for both ``py::str`` and
+``py::bytes``. Starting with v2.7, ``py::str`` exclusively holds
+``PyUnicodeObject`` (`#2409 <https://github.com/pybind/pybind11/pull/2409>`_),
+and ``py::isinstance<str>()`` is ``true`` only for ``py::str``. To help in
+the transition of user code, the ``PYBIND11_STR_LEGACY_PERMISSIVE`` macro
+is provided as an escape hatch to go back to the legacy behavior. This macro
+will be removed in future releases. Two types of required fixes are expected
+to be common:
+* Accidental use of ``py::str`` instead of ``py::bytes``, masked by the legacy
+  behavior. These are probably very easy to fix, by changing from
+  ``py::str`` to ``py::bytes``.
+* Reliance on py::isinstance<str>(obj) being ``true`` for
+  ``py::bytes``. This is likely to be easy to fix in most cases by adding
+  ``|| py::isinstance<bytes>(obj)``, but a fix may be more involved, e.g. if
+  ``py::isinstance<T>`` appears in a template. Such situations will require
+  careful review and custom fixes.
 .. _upgrade-guide-2.6:
-The ``tools/clang`` submodule and ``tools/mkdoc.py`` have been moved to a
-standalone package, `pybind11-mkdoc`_. If you were using those tools, please
-use them via a pip install from the new location.
+Usage of the ``PYBIND11_OVERLOAD*`` macros and ``get_overload`` function should
+be replaced by ``PYBIND11_OVERRIDE*`` and ``get_override``. In the future, the
+old macros may be deprecated and removed.
-.. _pybind11-mkdoc: https://github.com/pybind/pybind11-mkdoc
+``py::module`` has been renamed ``py::module_``, but a backward compatible
+typedef has been included. This change was to avoid a language change in C++20
+that requires unqualified ``module`` not be placed at the start of a logical
+line. Qualified usage is unaffected and the typedef will remain unless the
+C++ language rules change again.
+The public constructors of ``py::module_`` have been deprecated. Use
+``PYBIND11_MODULE`` or ``module_::create_extension_module`` instead.
 An error is now thrown when ``__init__`` is forgotten on subclasses. This was
 incorrect before, but was not checked. Add a call to ``__init__`` if it is
+A ``py::type_error`` is now thrown when casting to a subclass (like
+``py::bytes`` from ``py::object``) if the conversion is not valid. Make a valid
+conversion instead.
 The undocumented ``h.get_type()`` method has been deprecated and replaced by
+Enums now have a ``__str__`` method pre-defined; if you want to override it,
+the simplest fix is to add the new ``py::prepend()`` tag when defining
 If ``__eq__`` defined but not ``__hash__``, ``__hash__`` is now set to
 ``None``, as in normal CPython. You should add ``__hash__`` if you intended the
 class to be hashable, possibly using the new ``py::hash`` shortcut.
-Usage of the ``PYBIND11_OVERLOAD*`` macros and ``get_overload`` function should
-be replaced by ``PYBIND11_OVERRIDE*`` and ``get_override``. In the future, the
-old macros may be deprecated and removed.
+The constructors for ``py::array`` now always take signed integers for size,
+for consistency. This may lead to compiler warnings on some systems. Cast to
+``py::ssize_t`` instead of ``std::size_t``.
+The ``tools/clang`` submodule and ``tools/mkdoc.py`` have been moved to a
+standalone package, `pybind11-mkdoc`_. If you were using those tools, please
+use them via a pip install from the new location.
 The ``pybind11`` package on PyPI no longer fills the wheel "headers" slot - if
 you were using the headers from this slot, they are available by requesting the
@@ -41,6 +100,8 @@ be unaffected, as the ``pybind11/include`` location is reported by ``python -m
 pybind11 --includes`` and ``pybind11.get_include()`` is still correct and has
 not changed since 2.5).
+.. _pybind11-mkdoc: https://github.com/pybind/pybind11-mkdoc
 CMake support:
@@ -54,7 +115,7 @@ something. The changes are:
 * If you do not request a standard, pybind11 targets will compile with the
   compiler default, but not less than C++11, instead of forcing C++14 always.
-  If you depend on the old behavior, please use ``set(CMAKE_CXX_STANDARD 14)``
+  If you depend on the old behavior, please use ``set(CMAKE_CXX_STANDARD 14 CACHE STRING "")``
 * Direct ``pybind11::module`` usage should always be accompanied by at least
@@ -80,7 +141,8 @@ In addition, the following changes may be of interest:
 * Using ``find_package(Python COMPONENTS Interpreter Development)`` before
   pybind11 will cause pybind11 to use the new Python mechanisms instead of its
   own custom search, based on a patched version of classic ``FindPythonInterp``
-  / ``FindPythonLibs``. In the future, this may become the default.
+  / ``FindPythonLibs``. In the future, this may become the default. A recent
+  (3.15+ or 3.18.2+) version of CMake is recommended.
@@ -170,7 +232,7 @@ way to get and set object state. See :ref:`pickling` for details.
             [](const Foo &self) { // __getstate__
-                return py::make_tuple(f.value1(), f.value2(), ...); // unchanged
+                return py::make_tuple(self.value1(), self.value2(), ...); // unchanged
             [](py::tuple t) { // __setstate__, note: no `self` argument
                 return new Foo(t[0].cast<std::string>(), ...);
@@ -234,7 +296,7 @@ Within pybind11's CMake build system, ``pybind11_add_module`` has always been
 setting the ``-fvisibility=hidden`` flag in release mode. From now on, it's
 being applied unconditionally, even in debug mode and it can no longer be opted
 out of with the ``NO_EXTRAS`` option. The ``pybind11::module`` target now also
-adds this flag to it's interface. The ``pybind11::embed`` target is unchanged.
+adds this flag to its interface. The ``pybind11::embed`` target is unchanged.
 The most significant change here is for the ``pybind11::module`` target. If you
 were previously relying on default visibility, i.e. if your Python module was
diff --git a/wrap/pybind11/include/pybind11/attr.h b/wrap/pybind11/include/pybind11/attr.h
index d0a8b34b8f..f1b66fb80c 100644
--- a/wrap/pybind11/include/pybind11/attr.h
+++ b/wrap/pybind11/include/pybind11/attr.h
@@ -12,13 +12,17 @@
 #include "cast.h"
+#include <functional>
 /// \addtogroup annotations
 /// @{
 /// Annotation for methods
-struct is_method { handle class_; is_method(const handle &c) : class_(c) { } };
+struct is_method { handle class_;
+    explicit is_method(const handle &c) : class_(c) {}
 /// Annotation for operators
 struct is_operator { };
@@ -27,16 +31,24 @@ struct is_operator { };
 struct is_final { };
 /// Annotation for parent scope
-struct scope { handle value; scope(const handle &s) : value(s) { } };
+struct scope { handle value;
+    explicit scope(const handle &s) : value(s) {}
 /// Annotation for documentation
-struct doc { const char *value; doc(const char *value) : value(value) { } };
+struct doc { const char *value;
+    explicit doc(const char *value) : value(value) {}
 /// Annotation for function names
-struct name { const char *value; name(const char *value) : value(value) { } };
+struct name { const char *value;
+    explicit name(const char *value) : value(value) {}
 /// Annotation indicating that a function is an overload associated with a given "sibling"
-struct sibling { handle value; sibling(const handle &value) : value(value.ptr()) { } };
+struct sibling { handle value;
+    explicit sibling(const handle &value) : value(value.ptr()) {}
 /// Annotation indicating that a class derives from another given type
 template <typename T> struct base {
@@ -62,18 +74,41 @@ struct metaclass {
     handle value;
     PYBIND11_DEPRECATED("py::metaclass() is no longer required. It's turned on by default now.")
-    metaclass() { } // NOLINT(modernize-use-equals-default): breaks MSVC 2015 when adding an attribute
+    // NOLINTNEXTLINE(modernize-use-equals-default): breaks MSVC 2015 when adding an attribute
+    metaclass() {}
     /// Override pybind11's default metaclass
     explicit metaclass(handle value) : value(value) { }
+/// Specifies a custom callback with signature `void (PyHeapTypeObject*)` that
+/// may be used to customize the Python type.
+/// The callback is invoked immediately before `PyType_Ready`.
+/// Note: This is an advanced interface, and uses of it may require changes to
+/// work with later versions of pybind11.  You may wish to consult the
+/// implementation of `make_new_python_type` in `detail/classes.h` to understand
+/// the context in which the callback will be run.
+struct custom_type_setup {
+    using callback = std::function<void(PyHeapTypeObject *heap_type)>;
+    explicit custom_type_setup(callback value) : value(std::move(value)) {}
+    callback value;
 /// Annotation that marks a class as local to the module:
-struct module_local { const bool value; constexpr module_local(bool v = true) : value(v) { } };
+struct module_local { const bool value;
+    constexpr explicit module_local(bool v = true) : value(v) {}
 /// Annotation to mark enums as an arithmetic type
 struct arithmetic { };
+/// Mark a function for addition at the beginning of the existing overload chain instead of the end
+struct prepend { };
 /** \rst
     A call policy which places one or more guard variables (``Ts...``) around the function call.
@@ -120,7 +155,7 @@ enum op_id : int;
 enum op_type : int;
 struct undefined_t;
 template <op_id id, op_type ot, typename L = undefined_t, typename R = undefined_t> struct op_;
-inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
+void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
 /// Internal data structure which holds metadata about a keyword argument
 struct argument_record {
@@ -138,8 +173,8 @@ struct argument_record {
 struct function_record {
         : is_constructor(false), is_new_style_constructor(false), is_stateless(false),
-          is_operator(false), is_method(false),
-          has_args(false), has_kwargs(false), has_kw_only_args(false) { }
+          is_operator(false), is_method(false), has_args(false),
+          has_kwargs(false), prepend(false) { }
     /// Function name
     char *name = nullptr; /* why no C++ strings? They generate heavier code.. */
@@ -186,14 +221,15 @@ struct function_record {
     /// True if the function has a '**kwargs' argument
     bool has_kwargs : 1;
-    /// True once a 'py::kw_only' is encountered (any following args are keyword-only)
-    bool has_kw_only_args : 1;
+    /// True if this function is to be inserted at the beginning of the overload resolution chain
+    bool prepend : 1;
     /// Number of arguments (including py::args and/or py::kwargs, if present)
     std::uint16_t nargs;
-    /// Number of trailing arguments (counted in `nargs`) that are keyword-only
-    std::uint16_t nargs_kw_only = 0;
+    /// Number of leading positional arguments, which are terminated by a py::args or py::kwargs
+    /// argument or by a py::kw_only annotation.
+    std::uint16_t nargs_pos = 0;
     /// Number of leading arguments (counted in `nargs`) that are positional-only
     std::uint16_t nargs_pos_only = 0;
@@ -253,6 +289,9 @@ struct type_record {
     /// Custom metaclass (optional)
     handle metaclass;
+    /// Custom type setup.
+    custom_type_setup::callback custom_type_setup_callback;
     /// Multiple inheritance marker
     bool multiple_inheritance : 1;
@@ -370,20 +409,23 @@ template <> struct process_attribute<is_new_style_constructor> : process_attribu
     static void init(const is_new_style_constructor &, function_record *r) { r->is_new_style_constructor = true; }
-inline void process_kw_only_arg(const arg &a, function_record *r) {
-    if (!a.name || strlen(a.name) == 0)
-        pybind11_fail("arg(): cannot specify an unnamed argument after an kw_only() annotation");
-    ++r->nargs_kw_only;
+inline void check_kw_only_arg(const arg &a, function_record *r) {
+    if (r->args.size() > r->nargs_pos && (!a.name || a.name[0] == '\0'))
+        pybind11_fail("arg(): cannot specify an unnamed argument after a kw_only() annotation or args() argument");
+inline void append_self_arg_if_needed(function_record *r) {
+    if (r->is_method && r->args.empty())
+        r->args.emplace_back("self", nullptr, handle(), /*convert=*/ true, /*none=*/ false);
 /// Process a keyword argument attribute (*without* a default value)
 template <> struct process_attribute<arg> : process_attribute_default<arg> {
     static void init(const arg &a, function_record *r) {
-        if (r->is_method && r->args.empty())
-            r->args.emplace_back("self", nullptr, handle(), true /*convert*/, false /*none not allowed*/);
+        append_self_arg_if_needed(r);
         r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert, a.flag_none);
-        if (r->has_kw_only_args) process_kw_only_arg(a, r);
+        check_kw_only_arg(a, r);
@@ -391,7 +433,7 @@ template <> struct process_attribute<arg> : process_attribute_default<arg> {
 template <> struct process_attribute<arg_v> : process_attribute_default<arg_v> {
     static void init(const arg_v &a, function_record *r) {
         if (r->is_method && r->args.empty())
-            r->args.emplace_back("self", nullptr /*descr*/, handle() /*parent*/, true /*convert*/, false /*none not allowed*/);
+            r->args.emplace_back("self", /*descr=*/ nullptr, /*parent=*/ handle(), /*convert=*/ true, /*none=*/ false);
         if (!a.value) {
 #if !defined(NDEBUG)
@@ -416,21 +458,28 @@ template <> struct process_attribute<arg_v> : process_attribute_default<arg_v> {
         r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert, a.flag_none);
-        if (r->has_kw_only_args) process_kw_only_arg(a, r);
+        check_kw_only_arg(a, r);
 /// Process a keyword-only-arguments-follow pseudo argument
 template <> struct process_attribute<kw_only> : process_attribute_default<kw_only> {
     static void init(const kw_only &, function_record *r) {
-        r->has_kw_only_args = true;
+        append_self_arg_if_needed(r);
+        if (r->has_args && r->nargs_pos != static_cast<std::uint16_t>(r->args.size()))
+            pybind11_fail("Mismatched args() and kw_only(): they must occur at the same relative argument location (or omit kw_only() entirely)");
+        r->nargs_pos = static_cast<std::uint16_t>(r->args.size());
 /// Process a positional-only-argument maker
 template <> struct process_attribute<pos_only> : process_attribute_default<pos_only> {
     static void init(const pos_only &, function_record *r) {
+        append_self_arg_if_needed(r);
         r->nargs_pos_only = static_cast<std::uint16_t>(r->args.size());
+        if (r->nargs_pos_only > r->nargs_pos)
+            pybind11_fail("pos_only(): cannot follow a py::args() argument");
+            // It also can't follow a kw_only, but a static_assert in pybind11.h checks that
@@ -457,6 +506,13 @@ struct process_attribute<dynamic_attr> : process_attribute_default<dynamic_attr>
     static void init(const dynamic_attr &, type_record *r) { r->dynamic_attr = true; }
+template <>
+struct process_attribute<custom_type_setup> {
+    static void init(const custom_type_setup &value, type_record *r) {
+        r->custom_type_setup_callback = value.value;
+    }
 template <>
 struct process_attribute<is_final> : process_attribute_default<is_final> {
     static void init(const is_final &, type_record *r) { r->is_final = true; }
@@ -477,6 +533,12 @@ struct process_attribute<module_local> : process_attribute_default<module_local>
     static void init(const module_local &l, type_record *r) { r->module_local = l.value; }
+/// Process a 'prepend' attribute, putting this at the beginning of the overload chain
+template <>
+struct process_attribute<prepend> : process_attribute_default<prepend> {
+    static void init(const prepend &, function_record *r) { r->prepend = true; }
 /// Process an 'arithmetic' attribute for enums (does nothing here)
 template <>
 struct process_attribute<arithmetic> : process_attribute_default<arithmetic> {};
@@ -503,20 +565,31 @@ template <size_t Nurse, size_t Patient> struct process_attribute<keep_alive<Nurs
 /// Recursively iterate over variadic template arguments
 template <typename... Args> struct process_attributes {
     static void init(const Args&... args, function_record *r) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
-        ignore_unused(unused);
+        using expander = int[];
+        (void) expander{
+            0, ((void) process_attribute<typename std::decay<Args>::type>::init(args, r), 0)...};
     static void init(const Args&... args, type_record *r) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::init(args, r), 0) ... };
-        ignore_unused(unused);
+        using expander = int[];
+        (void) expander{0,
+                        (process_attribute<typename std::decay<Args>::type>::init(args, r), 0)...};
     static void precall(function_call &call) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::precall(call), 0) ... };
-        ignore_unused(unused);
+        using expander = int[];
+        (void) expander{0,
+                        (process_attribute<typename std::decay<Args>::type>::precall(call), 0)...};
     static void postcall(function_call &call, handle fn_ret) {
-        int unused[] = { 0, (process_attribute<typename std::decay<Args>::type>::postcall(call, fn_ret), 0) ... };
-        ignore_unused(unused);
+        PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(call, fn_ret);
+        using expander = int[];
+        (void) expander{
+            0, (process_attribute<typename std::decay<Args>::type>::postcall(call, fn_ret), 0)...};
@@ -532,7 +605,8 @@ template <typename... Extra,
           size_t named = constexpr_sum(std::is_base_of<arg, Extra>::value...),
           size_t self  = constexpr_sum(std::is_same<is_method, Extra>::value...)>
 constexpr bool expected_num_args(size_t nargs, bool has_args, bool has_kwargs) {
-    return named == 0 || (self + named + has_args + has_kwargs) == nargs;
+    PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(nargs, has_args, has_kwargs);
+    return named == 0 || (self + named + size_t(has_args) + size_t(has_kwargs)) == nargs;
diff --git a/wrap/pybind11/include/pybind11/buffer_info.h b/wrap/pybind11/include/pybind11/buffer_info.h
index 308be06a33..eba68d1aa1 100644
--- a/wrap/pybind11/include/pybind11/buffer_info.h
+++ b/wrap/pybind11/include/pybind11/buffer_info.h
@@ -13,6 +13,29 @@
+// Default, C-style strides
+inline std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+    auto ndim = shape.size();
+    std::vector<ssize_t> strides(ndim, itemsize);
+    if (ndim > 0)
+        for (size_t i = ndim - 1; i > 0; --i)
+            strides[i - 1] = strides[i] * shape[i];
+    return strides;
+// F-style strides; default when constructing an array_t with `ExtraFlags & f_style`
+inline std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+    auto ndim = shape.size();
+    std::vector<ssize_t> strides(ndim, itemsize);
+    for (size_t i = 1; i < ndim; ++i)
+        strides[i] = strides[i - 1] * shape[i - 1];
+    return strides;
 /// Information record describing a Python buffer object
 struct buffer_info {
     void *ptr = nullptr;          // Pointer to the underlying storage
@@ -53,7 +76,14 @@ struct buffer_info {
     explicit buffer_info(Py_buffer *view, bool ownview = true)
     : buffer_info(view->buf, view->itemsize, view->format, view->ndim,
-            {view->shape, view->shape + view->ndim}, {view->strides, view->strides + view->ndim}, view->readonly) {
+            {view->shape, view->shape + view->ndim},
+            /* Though buffer::request() requests PyBUF_STRIDES, ctypes objects
+             * ignore this flag and return a view with NULL strides.
+             * When strides are NULL, build them manually.  */
+            view->strides
+            ? std::vector<ssize_t>(view->strides, view->strides + view->ndim)
+            : detail::c_strides({view->shape, view->shape + view->ndim}, view->itemsize),
+            (view->readonly != 0)) {
         this->m_view = view;
         this->ownview = ownview;
@@ -61,11 +91,9 @@ struct buffer_info {
     buffer_info(const buffer_info &) = delete;
     buffer_info& operator=(const buffer_info &) = delete;
-    buffer_info(buffer_info &&other) {
-        (*this) = std::move(other);
-    }
+    buffer_info(buffer_info &&other) noexcept { (*this) = std::move(other); }
-    buffer_info& operator=(buffer_info &&rhs) {
+    buffer_info &operator=(buffer_info &&rhs) noexcept {
         ptr = rhs.ptr;
         itemsize = rhs.itemsize;
         size = rhs.size;
diff --git a/wrap/pybind11/include/pybind11/cast.h b/wrap/pybind11/include/pybind11/cast.h
index b071008e67..165102443c 100644
--- a/wrap/pybind11/include/pybind11/cast.h
+++ b/wrap/pybind11/include/pybind11/cast.h
@@ -11,938 +11,25 @@
 #pragma once
 #include "pytypes.h"
-#include "detail/typeid.h"
+#include "detail/common.h"
 #include "detail/descr.h"
-#include "detail/internals.h"
+#include "detail/type_caster_base.h"
+#include "detail/typeid.h"
 #include <array>
-#include <limits>
+#include <cstring>
+#include <functional>
+#include <iosfwd>
+#include <iterator>
+#include <memory>
+#include <string>
 #include <tuple>
 #include <type_traits>
-#if defined(PYBIND11_CPP17)
-#  if defined(__has_include)
-#    if __has_include(<string_view>)
-#      define PYBIND11_HAS_STRING_VIEW
-#    endif
-#  elif defined(_MSC_VER)
-#  endif
-#include <string_view>
-#if defined(__cpp_lib_char8_t) && __cpp_lib_char8_t >= 201811L
-#  define PYBIND11_HAS_U8STRING
+#include <utility>
+#include <vector>
-/// A life support system for temporary objects created by `type_caster::load()`.
-/// Adding a patient will keep it alive up until the enclosing function returns.
-class loader_life_support {
-    /// A new patient frame is created when a function is entered
-    loader_life_support() {
-        get_internals().loader_patient_stack.push_back(nullptr);
-    }
-    /// ... and destroyed after it returns
-    ~loader_life_support() {
-        auto &stack = get_internals().loader_patient_stack;
-        if (stack.empty())
-            pybind11_fail("loader_life_support: internal error");
-        auto ptr = stack.back();
-        stack.pop_back();
-        Py_CLEAR(ptr);
-        // A heuristic to reduce the stack's capacity (e.g. after long recursive calls)
-        if (stack.capacity() > 16 && !stack.empty() && stack.capacity() / stack.size() > 2)
-            stack.shrink_to_fit();
-    }
-    /// This can only be used inside a pybind11-bound function, either by `argument_loader`
-    /// at argument preparation time or by `py::cast()` at execution time.
-    PYBIND11_NOINLINE static void add_patient(handle h) {
-        auto &stack = get_internals().loader_patient_stack;
-        if (stack.empty())
-            throw cast_error("When called outside a bound function, py::cast() cannot "
-                             "do Python -> C++ conversions which require the creation "
-                             "of temporary values");
-        auto &list_ptr = stack.back();
-        if (list_ptr == nullptr) {
-            list_ptr = PyList_New(1);
-            if (!list_ptr)
-                pybind11_fail("loader_life_support: error allocating list");
-            PyList_SET_ITEM(list_ptr, 0, h.inc_ref().ptr());
-        } else {
-            auto result = PyList_Append(list_ptr, h.ptr());
-            if (result == -1)
-                pybind11_fail("loader_life_support: error adding patient");
-        }
-    }
-// Gets the cache entry for the given type, creating it if necessary.  The return value is the pair
-// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was
-// just created.
-inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type);
-// Populates a just-created cache entry.
-PYBIND11_NOINLINE inline void all_type_info_populate(PyTypeObject *t, std::vector<type_info *> &bases) {
-    std::vector<PyTypeObject *> check;
-    for (handle parent : reinterpret_borrow<tuple>(t->tp_bases))
-        check.push_back((PyTypeObject *) parent.ptr());
-    auto const &type_dict = get_internals().registered_types_py;
-    for (size_t i = 0; i < check.size(); i++) {
-        auto type = check[i];
-        // Ignore Python2 old-style class super type:
-        if (!PyType_Check((PyObject *) type)) continue;
-        // Check `type` in the current set of registered python types:
-        auto it = type_dict.find(type);
-        if (it != type_dict.end()) {
-            // We found a cache entry for it, so it's either pybind-registered or has pre-computed
-            // pybind bases, but we have to make sure we haven't already seen the type(s) before: we
-            // want to follow Python/virtual C++ rules that there should only be one instance of a
-            // common base.
-            for (auto *tinfo : it->second) {
-                // NB: Could use a second set here, rather than doing a linear search, but since
-                // having a large number of immediate pybind11-registered types seems fairly
-                // unlikely, that probably isn't worthwhile.
-                bool found = false;
-                for (auto *known : bases) {
-                    if (known == tinfo) { found = true; break; }
-                }
-                if (!found) bases.push_back(tinfo);
-            }
-        }
-        else if (type->tp_bases) {
-            // It's some python type, so keep follow its bases classes to look for one or more
-            // registered types
-            if (i + 1 == check.size()) {
-                // When we're at the end, we can pop off the current element to avoid growing
-                // `check` when adding just one base (which is typical--i.e. when there is no
-                // multiple inheritance)
-                check.pop_back();
-                i--;
-            }
-            for (handle parent : reinterpret_borrow<tuple>(type->tp_bases))
-                check.push_back((PyTypeObject *) parent.ptr());
-        }
-    }
- * Extracts vector of type_info pointers of pybind-registered roots of the given Python type.  Will
- * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side
- * derived class that uses single inheritance.  Will contain as many types as required for a Python
- * class that uses multiple inheritance to inherit (directly or indirectly) from multiple
- * pybind-registered classes.  Will be empty if neither the type nor any base classes are
- * pybind-registered.
- *
- * The value is cached for the lifetime of the Python type.
- */
-inline const std::vector<detail::type_info *> &all_type_info(PyTypeObject *type) {
-    auto ins = all_type_info_get_cache(type);
-    if (ins.second)
-        // New cache entry: populate it
-        all_type_info_populate(type, ins.first->second);
-    return ins.first->second;
- * Gets a single pybind11 type info for a python type.  Returns nullptr if neither the type nor any
- * ancestors are pybind11-registered.  Throws an exception if there are multiple bases--use
- * `all_type_info` instead if you want to support multiple bases.
- */
-PYBIND11_NOINLINE inline detail::type_info* get_type_info(PyTypeObject *type) {
-    auto &bases = all_type_info(type);
-    if (bases.empty())
-        return nullptr;
-    if (bases.size() > 1)
-        pybind11_fail("pybind11::detail::get_type_info: type has multiple pybind11-registered bases");
-    return bases.front();
-inline detail::type_info *get_local_type_info(const std::type_index &tp) {
-    auto &locals = registered_local_types_cpp();
-    auto it = locals.find(tp);
-    if (it != locals.end())
-        return it->second;
-    return nullptr;
-inline detail::type_info *get_global_type_info(const std::type_index &tp) {
-    auto &types = get_internals().registered_types_cpp;
-    auto it = types.find(tp);
-    if (it != types.end())
-        return it->second;
-    return nullptr;
-/// Return the type info for a given C++ type; on lookup failure can either throw or return nullptr.
-PYBIND11_NOINLINE inline detail::type_info *get_type_info(const std::type_index &tp,
-                                                          bool throw_if_missing = false) {
-    if (auto ltype = get_local_type_info(tp))
-        return ltype;
-    if (auto gtype = get_global_type_info(tp))
-        return gtype;
-    if (throw_if_missing) {
-        std::string tname = tp.name();
-        detail::clean_type_id(tname);
-        pybind11_fail("pybind11::detail::get_type_info: unable to find type info for \"" + tname + "\"");
-    }
-    return nullptr;
-PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp, bool throw_if_missing) {
-    detail::type_info *type_info = get_type_info(tp, throw_if_missing);
-    return handle(type_info ? ((PyObject *) type_info->type) : nullptr);
-struct value_and_holder {
-    instance *inst = nullptr;
-    size_t index = 0u;
-    const detail::type_info *type = nullptr;
-    void **vh = nullptr;
-    // Main constructor for a found value/holder:
-    value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) :
-        inst{i}, index{index}, type{type},
-        vh{inst->simple_layout ? inst->simple_value_holder : &inst->nonsimple.values_and_holders[vpos]}
-    {}
-    // Default constructor (used to signal a value-and-holder not found by get_value_and_holder())
-    value_and_holder() = default;
-    // Used for past-the-end iterator
-    value_and_holder(size_t index) : index{index} {}
-    template <typename V = void> V *&value_ptr() const {
-        return reinterpret_cast<V *&>(vh[0]);
-    }
-    // True if this `value_and_holder` has a non-null value pointer
-    explicit operator bool() const { return value_ptr(); }
-    template <typename H> H &holder() const {
-        return reinterpret_cast<H &>(vh[1]);
-    }
-    bool holder_constructed() const {
-        return inst->simple_layout
-            ? inst->simple_holder_constructed
-            : inst->nonsimple.status[index] & instance::status_holder_constructed;
-    }
-    void set_holder_constructed(bool v = true) {
-        if (inst->simple_layout)
-            inst->simple_holder_constructed = v;
-        else if (v)
-            inst->nonsimple.status[index] |= instance::status_holder_constructed;
-        else
-            inst->nonsimple.status[index] &= (uint8_t) ~instance::status_holder_constructed;
-    }
-    bool instance_registered() const {
-        return inst->simple_layout
-            ? inst->simple_instance_registered
-            : inst->nonsimple.status[index] & instance::status_instance_registered;
-    }
-    void set_instance_registered(bool v = true) {
-        if (inst->simple_layout)
-            inst->simple_instance_registered = v;
-        else if (v)
-            inst->nonsimple.status[index] |= instance::status_instance_registered;
-        else
-            inst->nonsimple.status[index] &= (uint8_t) ~instance::status_instance_registered;
-    }
-// Container for accessing and iterating over an instance's values/holders
-struct values_and_holders {
-    instance *inst;
-    using type_vec = std::vector<detail::type_info *>;
-    const type_vec &tinfo;
-    values_and_holders(instance *inst) : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {}
-    struct iterator {
-    private:
-        instance *inst = nullptr;
-        const type_vec *types = nullptr;
-        value_and_holder curr;
-        friend struct values_and_holders;
-        iterator(instance *inst, const type_vec *tinfo)
-            : inst{inst}, types{tinfo},
-            curr(inst /* instance */,
-                 types->empty() ? nullptr : (*types)[0] /* type info */,
-                 0, /* vpos: (non-simple types only): the first vptr comes first */
-                 0 /* index */)
-        {}
-        // Past-the-end iterator:
-        iterator(size_t end) : curr(end) {}
-    public:
-        bool operator==(const iterator &other) const { return curr.index == other.curr.index; }
-        bool operator!=(const iterator &other) const { return curr.index != other.curr.index; }
-        iterator &operator++() {
-            if (!inst->simple_layout)
-                curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs;
-            ++curr.index;
-            curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr;
-            return *this;
-        }
-        value_and_holder &operator*() { return curr; }
-        value_and_holder *operator->() { return &curr; }
-    };
-    iterator begin() { return iterator(inst, &tinfo); }
-    iterator end() { return iterator(tinfo.size()); }
-    iterator find(const type_info *find_type) {
-        auto it = begin(), endit = end();
-        while (it != endit && it->type != find_type) ++it;
-        return it;
-    }
-    size_t size() { return tinfo.size(); }
- * Extracts C++ value and holder pointer references from an instance (which may contain multiple
- * values/holders for python-side multiple inheritance) that match the given type.  Throws an error
- * if the given type (or ValueType, if omitted) is not a pybind11 base of the given instance.  If
- * `find_type` is omitted (or explicitly specified as nullptr) the first value/holder are returned,
- * regardless of type (and the resulting .type will be nullptr).
- *
- * The returned object should be short-lived: in particular, it must not outlive the called-upon
- * instance.
- */
-PYBIND11_NOINLINE inline value_and_holder instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/, bool throw_if_missing /*= true in common.h*/) {
-    // Optimize common case:
-    if (!find_type || Py_TYPE(this) == find_type->type)
-        return value_and_holder(this, find_type, 0, 0);
-    detail::values_and_holders vhs(this);
-    auto it = vhs.find(find_type);
-    if (it != vhs.end())
-        return *it;
-    if (!throw_if_missing)
-        return value_and_holder();
-#if defined(NDEBUG)
-    pybind11_fail("pybind11::detail::instance::get_value_and_holder: "
-            "type is not a pybind11 base of the given instance "
-            "(compile in debug mode for type details)");
-    pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" +
-            std::string(find_type->type->tp_name) + "' is not a pybind11 base of the given `" +
-            std::string(Py_TYPE(this)->tp_name) + "' instance");
-PYBIND11_NOINLINE inline void instance::allocate_layout() {
-    auto &tinfo = all_type_info(Py_TYPE(this));
-    const size_t n_types = tinfo.size();
-    if (n_types == 0)
-        pybind11_fail("instance allocation failed: new instance has no pybind11-registered base types");
-    simple_layout =
-        n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs();
-    // Simple path: no python-side multiple inheritance, and a small-enough holder
-    if (simple_layout) {
-        simple_value_holder[0] = nullptr;
-        simple_holder_constructed = false;
-        simple_instance_registered = false;
-    }
-    else { // multiple base types or a too-large holder
-        // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer,
-        // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of bool
-        // values that tracks whether each associated holder has been initialized.  Each [block] is
-        // padded, if necessary, to an integer multiple of sizeof(void *).
-        size_t space = 0;
-        for (auto t : tinfo) {
-            space += 1; // value pointer
-            space += t->holder_size_in_ptrs; // holder instance
-        }
-        size_t flags_at = space;
-        space += size_in_ptrs(n_types); // status bytes (holder_constructed and instance_registered)
-        // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values,
-        // in particular, need to be 0).  Use Python's memory allocation functions: in Python 3.6
-        // they default to using pymalloc, which is designed to be efficient for small allocations
-        // like the one we're doing here; in earlier versions (and for larger allocations) they are
-        // just wrappers around malloc.
-#if PY_VERSION_HEX >= 0x03050000
-        nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *));
-        if (!nonsimple.values_and_holders) throw std::bad_alloc();
-        nonsimple.values_and_holders = (void **) PyMem_New(void *, space);
-        if (!nonsimple.values_and_holders) throw std::bad_alloc();
-        std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *));
-        nonsimple.status = reinterpret_cast<uint8_t *>(&nonsimple.values_and_holders[flags_at]);
-    }
-    owned = true;
-PYBIND11_NOINLINE inline void instance::deallocate_layout() {
-    if (!simple_layout)
-        PyMem_Free(nonsimple.values_and_holders);
-PYBIND11_NOINLINE inline bool isinstance_generic(handle obj, const std::type_info &tp) {
-    handle type = detail::get_type_handle(tp, false);
-    if (!type)
-        return false;
-    return isinstance(obj, type);
-PYBIND11_NOINLINE inline std::string error_string() {
-    if (!PyErr_Occurred()) {
-        PyErr_SetString(PyExc_RuntimeError, "Unknown internal error occurred");
-        return "Unknown internal error occurred";
-    }
-    error_scope scope; // Preserve error state
-    std::string errorString;
-    if (scope.type) {
-        errorString += handle(scope.type).attr("__name__").cast<std::string>();
-        errorString += ": ";
-    }
-    if (scope.value)
-        errorString += (std::string) str(scope.value);
-    PyErr_NormalizeException(&scope.type, &scope.value, &scope.trace);
-    if (scope.trace != nullptr)
-        PyException_SetTraceback(scope.value, scope.trace);
-#if !defined(PYPY_VERSION)
-    if (scope.trace) {
-        auto *trace = (PyTracebackObject *) scope.trace;
-        /* Get the deepest trace possible */
-        while (trace->tb_next)
-            trace = trace->tb_next;
-        PyFrameObject *frame = trace->tb_frame;
-        errorString += "\n\nAt:\n";
-        while (frame) {
-            int lineno = PyFrame_GetLineNumber(frame);
-            errorString +=
-                "  " + handle(frame->f_code->co_filename).cast<std::string>() +
-                "(" + std::to_string(lineno) + "): " +
-                handle(frame->f_code->co_name).cast<std::string>() + "\n";
-            frame = frame->f_back;
-        }
-    }
-    return errorString;
-PYBIND11_NOINLINE inline handle get_object_handle(const void *ptr, const detail::type_info *type ) {
-    auto &instances = get_internals().registered_instances;
-    auto range = instances.equal_range(ptr);
-    for (auto it = range.first; it != range.second; ++it) {
-        for (const auto &vh : values_and_holders(it->second)) {
-            if (vh.type == type)
-                return handle((PyObject *) it->second);
-        }
-    }
-    return handle();
-inline PyThreadState *get_thread_state_unchecked() {
-#if defined(PYPY_VERSION)
-    return PyThreadState_GET();
-#elif PY_VERSION_HEX < 0x03000000
-    return _PyThreadState_Current;
-#elif PY_VERSION_HEX < 0x03050000
-    return (PyThreadState*) _Py_atomic_load_relaxed(&_PyThreadState_Current);
-#elif PY_VERSION_HEX < 0x03050200
-    return (PyThreadState*) _PyThreadState_Current.value;
-    return _PyThreadState_UncheckedGet();
-// Forward declarations
-inline void keep_alive_impl(handle nurse, handle patient);
-inline PyObject *make_new_instance(PyTypeObject *type);
-class type_caster_generic {
-    PYBIND11_NOINLINE type_caster_generic(const std::type_info &type_info)
-        : typeinfo(get_type_info(type_info)), cpptype(&type_info) { }
-    type_caster_generic(const type_info *typeinfo)
-        : typeinfo(typeinfo), cpptype(typeinfo ? typeinfo->cpptype : nullptr) { }
-    bool load(handle src, bool convert) {
-        return load_impl<type_caster_generic>(src, convert);
-    }
-    PYBIND11_NOINLINE static handle cast(const void *_src, return_value_policy policy, handle parent,
-                                         const detail::type_info *tinfo,
-                                         void *(*copy_constructor)(const void *),
-                                         void *(*move_constructor)(const void *),
-                                         const void *existing_holder = nullptr) {
-        if (!tinfo) // no type info: error will be set already
-            return handle();
-        void *src = const_cast<void *>(_src);
-        if (src == nullptr)
-            return none().release();
-        auto it_instances = get_internals().registered_instances.equal_range(src);
-        for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) {
-            for (auto instance_type : detail::all_type_info(Py_TYPE(it_i->second))) {
-                if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype))
-                    return handle((PyObject *) it_i->second).inc_ref();
-            }
-        }
-        auto inst = reinterpret_steal<object>(make_new_instance(tinfo->type));
-        auto wrapper = reinterpret_cast<instance *>(inst.ptr());
-        wrapper->owned = false;
-        void *&valueptr = values_and_holders(wrapper).begin()->value_ptr();
-        switch (policy) {
-            case return_value_policy::automatic:
-            case return_value_policy::take_ownership:
-                valueptr = src;
-                wrapper->owned = true;
-                break;
-            case return_value_policy::automatic_reference:
-            case return_value_policy::reference:
-                valueptr = src;
-                wrapper->owned = false;
-                break;
-            case return_value_policy::copy:
-                if (copy_constructor)
-                    valueptr = copy_constructor(src);
-                else {
-#if defined(NDEBUG)
-                    throw cast_error("return_value_policy = copy, but type is "
-                                     "non-copyable! (compile in debug mode for details)");
-                    std::string type_name(tinfo->cpptype->name());
-                    detail::clean_type_id(type_name);
-                    throw cast_error("return_value_policy = copy, but type " +
-                                     type_name + " is non-copyable!");
-                }
-                wrapper->owned = true;
-                break;
-            case return_value_policy::move:
-                if (move_constructor)
-                    valueptr = move_constructor(src);
-                else if (copy_constructor)
-                    valueptr = copy_constructor(src);
-                else {
-#if defined(NDEBUG)
-                    throw cast_error("return_value_policy = move, but type is neither "
-                                     "movable nor copyable! "
-                                     "(compile in debug mode for details)");
-                    std::string type_name(tinfo->cpptype->name());
-                    detail::clean_type_id(type_name);
-                    throw cast_error("return_value_policy = move, but type " +
-                                     type_name + " is neither movable nor copyable!");
-                }
-                wrapper->owned = true;
-                break;
-            case return_value_policy::reference_internal:
-                valueptr = src;
-                wrapper->owned = false;
-                keep_alive_impl(inst, parent);
-                break;
-            default:
-                throw cast_error("unhandled return_value_policy: should not happen!");
-        }
-        tinfo->init_instance(wrapper, existing_holder);
-        return inst.release();
-    }
-    // Base methods for generic caster; there are overridden in copyable_holder_caster
-    void load_value(value_and_holder &&v_h) {
-        auto *&vptr = v_h.value_ptr();
-        // Lazy allocation for unallocated values:
-        if (vptr == nullptr) {
-            auto *type = v_h.type ? v_h.type : typeinfo;
-            if (type->operator_new) {
-                vptr = type->operator_new(type->type_size);
-            } else {
-                #if defined(__cpp_aligned_new) && (!defined(_MSC_VER) || _MSC_VER >= 1912)
-                    if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
-                        vptr = ::operator new(type->type_size,
-                                              std::align_val_t(type->type_align));
-                    else
-                #endif
-                vptr = ::operator new(type->type_size);
-            }
-        }
-        value = vptr;
-    }
-    bool try_implicit_casts(handle src, bool convert) {
-        for (auto &cast : typeinfo->implicit_casts) {
-            type_caster_generic sub_caster(*cast.first);
-            if (sub_caster.load(src, convert)) {
-                value = cast.second(sub_caster.value);
-                return true;
-            }
-        }
-        return false;
-    }
-    bool try_direct_conversions(handle src) {
-        for (auto &converter : *typeinfo->direct_conversions) {
-            if (converter(src.ptr(), value))
-                return true;
-        }
-        return false;
-    }
-    void check_holder_compat() {}
-    PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) {
-        auto caster = type_caster_generic(ti);
-        if (caster.load(src, false))
-            return caster.value;
-        return nullptr;
-    }
-    /// Try to load with foreign typeinfo, if available. Used when there is no
-    /// native typeinfo, or when the native one wasn't able to produce a value.
-    PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) {
-        constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID;
-        const auto pytype = type::handle_of(src);
-        if (!hasattr(pytype, local_key))
-            return false;
-        type_info *foreign_typeinfo = reinterpret_borrow<capsule>(getattr(pytype, local_key));
-        // Only consider this foreign loader if actually foreign and is a loader of the correct cpp type
-        if (foreign_typeinfo->module_local_load == &local_load
-            || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype)))
-            return false;
-        if (auto result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) {
-            value = result;
-            return true;
-        }
-        return false;
-    }
-    // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant
-    // bits of code between here and copyable_holder_caster where the two classes need different
-    // logic (without having to resort to virtual inheritance).
-    template <typename ThisT>
-    PYBIND11_NOINLINE bool load_impl(handle src, bool convert) {
-        if (!src) return false;
-        if (!typeinfo) return try_load_foreign_module_local(src);
-        if (src.is_none()) {
-            // Defer accepting None to other overloads (if we aren't in convert mode):
-            if (!convert) return false;
-            value = nullptr;
-            return true;
-        }
-        auto &this_ = static_cast<ThisT &>(*this);
-        this_.check_holder_compat();
-        PyTypeObject *srctype = Py_TYPE(src.ptr());
-        // Case 1: If src is an exact type match for the target type then we can reinterpret_cast
-        // the instance's value pointer to the target type:
-        if (srctype == typeinfo->type) {
-            this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
-            return true;
-        }
-        // Case 2: We have a derived class
-        else if (PyType_IsSubtype(srctype, typeinfo->type)) {
-            auto &bases = all_type_info(srctype);
-            bool no_cpp_mi = typeinfo->simple_type;
-            // Case 2a: the python type is a Python-inherited derived class that inherits from just
-            // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of
-            // the right type and we can use reinterpret_cast.
-            // (This is essentially the same as case 2b, but because not using multiple inheritance
-            // is extremely common, we handle it specially to avoid the loop iterator and type
-            // pointer lookup overhead)
-            if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) {
-                this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
-                return true;
-            }
-            // Case 2b: the python type inherits from multiple C++ bases.  Check the bases to see if
-            // we can find an exact match (or, for a simple C++ type, an inherited match); if so, we
-            // can safely reinterpret_cast to the relevant pointer.
-            else if (bases.size() > 1) {
-                for (auto base : bases) {
-                    if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type) : base->type == typeinfo->type) {
-                        this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder(base));
-                        return true;
-                    }
-                }
-            }
-            // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type match
-            // in the registered bases, above, so try implicit casting (needed for proper C++ casting
-            // when MI is involved).
-            if (this_.try_implicit_casts(src, convert))
-                return true;
-        }
-        // Perform an implicit conversion
-        if (convert) {
-            for (auto &converter : typeinfo->implicit_conversions) {
-                auto temp = reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
-                if (load_impl<ThisT>(temp, false)) {
-                    loader_life_support::add_patient(temp);
-                    return true;
-                }
-            }
-            if (this_.try_direct_conversions(src))
-                return true;
-        }
-        // Failed to match local typeinfo. Try again with global.
-        if (typeinfo->module_local) {
-            if (auto gtype = get_global_type_info(*typeinfo->cpptype)) {
-                typeinfo = gtype;
-                return load(src, false);
-            }
-        }
-        // Global typeinfo has precedence over foreign module_local
-        return try_load_foreign_module_local(src);
-    }
-    // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast
-    // isn't needed or can't be used.  If the type is unknown, sets the error and returns a pair
-    // with .second = nullptr.  (p.first = nullptr is not an error: it becomes None).
-    PYBIND11_NOINLINE static std::pair<const void *, const type_info *> src_and_type(
-            const void *src, const std::type_info &cast_type, const std::type_info *rtti_type = nullptr) {
-        if (auto *tpi = get_type_info(cast_type))
-            return {src, const_cast<const type_info *>(tpi)};
-        // Not found, set error:
-        std::string tname = rtti_type ? rtti_type->name() : cast_type.name();
-        detail::clean_type_id(tname);
-        std::string msg = "Unregistered type : " + tname;
-        PyErr_SetString(PyExc_TypeError, msg.c_str());
-        return {nullptr, nullptr};
-    }
-    const type_info *typeinfo = nullptr;
-    const std::type_info *cpptype = nullptr;
-    void *value = nullptr;
- * Determine suitable casting operator for pointer-or-lvalue-casting type casters.  The type caster
- * needs to provide `operator T*()` and `operator T&()` operators.
- *
- * If the type supports moving the value away via an `operator T&&() &&` method, it should use
- * `movable_cast_op_type` instead.
- */
-template <typename T>
-using cast_op_type =
-    conditional_t<std::is_pointer<remove_reference_t<T>>::value,
-        typename std::add_pointer<intrinsic_t<T>>::type,
-        typename std::add_lvalue_reference<intrinsic_t<T>>::type>;
- * Determine suitable casting operator for a type caster with a movable value.  Such a type caster
- * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`.  The latter will be
- * called in appropriate contexts where the value can be moved rather than copied.
- *
- * These operator are automatically provided when using the PYBIND11_TYPE_CASTER macro.
- */
-template <typename T>
-using movable_cast_op_type =
-    conditional_t<std::is_pointer<typename std::remove_reference<T>::type>::value,
-        typename std::add_pointer<intrinsic_t<T>>::type,
-    conditional_t<std::is_rvalue_reference<T>::value,
-        typename std::add_rvalue_reference<intrinsic_t<T>>::type,
-        typename std::add_lvalue_reference<intrinsic_t<T>>::type>>;
-// std::is_copy_constructible isn't quite enough: it lets std::vector<T> (and similar) through when
-// T is non-copyable, but code containing such a copy constructor fails to actually compile.
-template <typename T, typename SFINAE = void> struct is_copy_constructible : std::is_copy_constructible<T> {};
-// Specialization for types that appear to be copy constructible but also look like stl containers
-// (we specifically check for: has `value_type` and `reference` with `reference = value_type&`): if
-// so, copy constructability depends on whether the value_type is copy constructible.
-template <typename Container> struct is_copy_constructible<Container, enable_if_t<all_of<
-        std::is_copy_constructible<Container>,
-        std::is_same<typename Container::value_type &, typename Container::reference>,
-        // Avoid infinite recursion
-        negation<std::is_same<Container, typename Container::value_type>>
-    >::value>> : is_copy_constructible<typename Container::value_type> {};
-// Likewise for std::pair
-// (after C++17 it is mandatory that the copy constructor not exist when the two types aren't themselves
-// copy constructible, but this can not be relied upon when T1 or T2 are themselves containers).
-template <typename T1, typename T2> struct is_copy_constructible<std::pair<T1, T2>>
-    : all_of<is_copy_constructible<T1>, is_copy_constructible<T2>> {};
-// The same problems arise with std::is_copy_assignable, so we use the same workaround.
-template <typename T, typename SFINAE = void> struct is_copy_assignable : std::is_copy_assignable<T> {};
-template <typename Container> struct is_copy_assignable<Container, enable_if_t<all_of<
-        std::is_copy_assignable<Container>,
-        std::is_same<typename Container::value_type &, typename Container::reference>
-    >::value>> : is_copy_assignable<typename Container::value_type> {};
-template <typename T1, typename T2> struct is_copy_assignable<std::pair<T1, T2>>
-    : all_of<is_copy_assignable<T1>, is_copy_assignable<T2>> {};
-// polymorphic_type_hook<itype>::get(src, tinfo) determines whether the object pointed
-// to by `src` actually is an instance of some class derived from `itype`.
-// If so, it sets `tinfo` to point to the std::type_info representing that derived
-// type, and returns a pointer to the start of the most-derived object of that type
-// (in which `src` is a subobject; this will be the same address as `src` in most
-// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src`
-// and leaves `tinfo` at its default value of nullptr.
-// The default polymorphic_type_hook just returns src. A specialization for polymorphic
-// types determines the runtime type of the passed object and adjusts the this-pointer
-// appropriately via dynamic_cast<void*>. This is what enables a C++ Animal* to appear
-// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is
-// registered with pybind11, and this Animal is in fact a Dog).
-// You may specialize polymorphic_type_hook yourself for types that want to appear
-// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern
-// in performance-sensitive applications, used most notably in LLVM.)
-// polymorphic_type_hook_base allows users to specialize polymorphic_type_hook with
-// std::enable_if. User provided specializations will always have higher priority than
-// the default implementation and specialization provided in polymorphic_type_hook_base.
-template <typename itype, typename SFINAE = void>
-struct polymorphic_type_hook_base
-    static const void *get(const itype *src, const std::type_info*&) { return src; }
-template <typename itype>
-struct polymorphic_type_hook_base<itype, detail::enable_if_t<std::is_polymorphic<itype>::value>>
-    static const void *get(const itype *src, const std::type_info*& type) {
-        type = src ? &typeid(*src) : nullptr;
-        return dynamic_cast<const void*>(src);
-    }
-template <typename itype, typename SFINAE = void>
-struct polymorphic_type_hook : public polymorphic_type_hook_base<itype> {};
-/// Generic type caster for objects stored on the heap
-template <typename type> class type_caster_base : public type_caster_generic {
-    using itype = intrinsic_t<type>;
-    static constexpr auto name = _<type>();
-    type_caster_base() : type_caster_base(typeid(type)) { }
-    explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) { }
-    static handle cast(const itype &src, return_value_policy policy, handle parent) {
-        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
-            policy = return_value_policy::copy;
-        return cast(&src, policy, parent);
-    }
-    static handle cast(itype &&src, return_value_policy, handle parent) {
-        return cast(&src, return_value_policy::move, parent);
-    }
-    // Returns a (pointer, type_info) pair taking care of necessary type lookup for a
-    // polymorphic type (using RTTI by default, but can be overridden by specializing
-    // polymorphic_type_hook). If the instance isn't derived, returns the base version.
-    static std::pair<const void *, const type_info *> src_and_type(const itype *src) {
-        auto &cast_type = typeid(itype);
-        const std::type_info *instance_type = nullptr;
-        const void *vsrc = polymorphic_type_hook<itype>::get(src, instance_type);
-        if (instance_type && !same_type(cast_type, *instance_type)) {
-            // This is a base pointer to a derived type. If the derived type is registered
-            // with pybind11, we want to make the full derived object available.
-            // In the typical case where itype is polymorphic, we get the correct
-            // derived pointer (which may be != base pointer) by a dynamic_cast to
-            // most derived type. If itype is not polymorphic, we won't get here
-            // except via a user-provided specialization of polymorphic_type_hook,
-            // and the user has promised that no this-pointer adjustment is
-            // required in that case, so it's OK to use static_cast.
-            if (const auto *tpi = get_type_info(*instance_type))
-                return {vsrc, tpi};
-        }
-        // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so
-        // don't do a cast
-        return type_caster_generic::src_and_type(src, cast_type, instance_type);
-    }
-    static handle cast(const itype *src, return_value_policy policy, handle parent) {
-        auto st = src_and_type(src);
-        return type_caster_generic::cast(
-            st.first, policy, parent, st.second,
-            make_copy_constructor(src), make_move_constructor(src));
-    }
-    static handle cast_holder(const itype *src, const void *holder) {
-        auto st = src_and_type(src);
-        return type_caster_generic::cast(
-            st.first, return_value_policy::take_ownership, {}, st.second,
-            nullptr, nullptr, holder);
-    }
-    template <typename T> using cast_op_type = detail::cast_op_type<T>;
-    operator itype*() { return (type *) value; }
-    operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); }
-    using Constructor = void *(*)(const void *);
-    /* Only enabled when the types are {copy,move}-constructible *and* when the type
-       does not have a private operator new implementation. */
-    template <typename T, typename = enable_if_t<is_copy_constructible<T>::value>>
-    static auto make_copy_constructor(const T *x) -> decltype(new T(*x), Constructor{}) {
-        return [](const void *arg) -> void * {
-            return new T(*reinterpret_cast<const T *>(arg));
-        };
-    }
-    template <typename T, typename = enable_if_t<std::is_move_constructible<T>::value>>
-    static auto make_move_constructor(const T *x) -> decltype(new T(std::move(*const_cast<T *>(x))), Constructor{}) {
-        return [](const void *arg) -> void * {
-            return new T(std::move(*const_cast<T *>(reinterpret_cast<const T *>(arg))));
-        };
-    }
-    static Constructor make_copy_constructor(...) { return nullptr; }
-    static Constructor make_move_constructor(...) { return nullptr; }
 template <typename type, typename SFINAE = void> class type_caster : public type_caster_base<type> { };
 template <typename type> using make_caster = type_caster<intrinsic_t<type>>;
@@ -960,9 +47,14 @@ template <typename type> class type_caster<std::reference_wrapper<type>> {
     using caster_t = make_caster<type>;
     caster_t subcaster;
-    using subcaster_cast_op_type = typename caster_t::template cast_op_type<type>;
-    static_assert(std::is_same<typename std::remove_const<type>::type &, subcaster_cast_op_type>::value,
-            "std::reference_wrapper<T> caster requires T to have a caster with an `T &` operator");
+    using reference_t = type&;
+    using subcaster_cast_op_type =
+        typename caster_t::template cast_op_type<reference_t>;
+    static_assert(std::is_same<typename std::remove_const<type>::type &, subcaster_cast_op_type>::value ||
+                  std::is_same<reference_t, subcaster_cast_op_type>::value,
+                  "std::reference_wrapper<T> caster requires T to have a caster with an "
+                  "`operator T &()` or `operator const T &()`");
     bool load(handle src, bool convert) { return subcaster.load(src, convert); }
     static constexpr auto name = caster_t::name;
@@ -973,28 +65,31 @@ template <typename type> class type_caster<std::reference_wrapper<type>> {
         return caster_t::cast(&src.get(), policy, parent);
     template <typename T> using cast_op_type = std::reference_wrapper<type>;
-    operator std::reference_wrapper<type>() { return subcaster.operator subcaster_cast_op_type&(); }
+    explicit operator std::reference_wrapper<type>() { return cast_op<type &>(subcaster); }
-#define PYBIND11_TYPE_CASTER(type, py_name) \
-    protected: \
-        type value; \
-    public: \
-        static constexpr auto name = py_name; \
-        template <typename T_, enable_if_t<std::is_same<type, remove_cv_t<T_>>::value, int> = 0> \
-        static handle cast(T_ *src, return_value_policy policy, handle parent) { \
-            if (!src) return none().release(); \
-            if (policy == return_value_policy::take_ownership) { \
-                auto h = cast(std::move(*src), policy, parent); delete src; return h; \
-            } else { \
-                return cast(*src, policy, parent); \
-            } \
-        } \
-        operator type*() { return &value; } \
-        operator type&() { return value; } \
-        operator type&&() && { return std::move(value); } \
-        template <typename T_> using cast_op_type = pybind11::detail::movable_cast_op_type<T_>
+#define PYBIND11_TYPE_CASTER(type, py_name)                                                       \
+protected:                                                                                        \
+    type value;                                                                                   \
+                                                                                                  \
+public:                                                                                           \
+    static constexpr auto name = py_name;                                                         \
+    template <typename T_, enable_if_t<std::is_same<type, remove_cv_t<T_>>::value, int> = 0>      \
+    static handle cast(T_ *src, return_value_policy policy, handle parent) {                      \
+        if (!src)                                                                                 \
+            return none().release();                                                              \
+        if (policy == return_value_policy::take_ownership) {                                      \
+            auto h = cast(std::move(*src), policy, parent);                                       \
+            delete src;                                                                           \
+            return h;                                                                             \
+        }                                                                                         \
+        return cast(*src, policy, parent);                                                        \
+    }                                                                                             \
+    operator type *() { return &value; }               /* NOLINT(bugprone-macro-parentheses) */   \
+    operator type &() { return value; }                /* NOLINT(bugprone-macro-parentheses) */   \
+    operator type &&() && { return std::move(value); } /* NOLINT(bugprone-macro-parentheses) */   \
+    template <typename T_>                                                                        \
+    using cast_op_type = pybind11::detail::movable_cast_op_type<T_>
 template <typename CharT> using is_std_char_type = any_of<
     std::is_same<CharT, char>, /* std::string */
@@ -1020,19 +115,46 @@ struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_t
         if (!src)
             return false;
+#if !defined(PYPY_VERSION)
+        auto index_check = [](PyObject *o) { return PyIndex_Check(o); };
+        // In PyPy 7.3.3, `PyIndex_Check` is implemented by calling `__index__`,
+        // while CPython only considers the existence of `nb_index`/`__index__`.
+        auto index_check = [](PyObject *o) { return hasattr(o, "__index__"); };
         if (std::is_floating_point<T>::value) {
             if (convert || PyFloat_Check(src.ptr()))
                 py_value = (py_type) PyFloat_AsDouble(src.ptr());
                 return false;
-        } else if (PyFloat_Check(src.ptr())) {
+        } else if (PyFloat_Check(src.ptr())
+                   || (!convert && !PYBIND11_LONG_CHECK(src.ptr()) && !index_check(src.ptr()))) {
             return false;
-        } else if (std::is_unsigned<py_type>::value) {
-            py_value = as_unsigned<py_type>(src.ptr());
-        } else { // signed integer:
-            py_value = sizeof(T) <= sizeof(long)
-                ? (py_type) PyLong_AsLong(src.ptr())
-                : (py_type) PYBIND11_LONG_AS_LONGLONG(src.ptr());
+        } else {
+            handle src_or_index = src;
+            // PyPy: 7.3.7's 3.8 does not implement PyLong_*'s __index__ calls.
+#if PY_VERSION_HEX < 0x03080000 || defined(PYPY_VERSION)
+            object index;
+            if (!PYBIND11_LONG_CHECK(src.ptr())) {  // So: index_check(src.ptr())
+                index = reinterpret_steal<object>(PyNumber_Index(src.ptr()));
+                if (!index) {
+                    PyErr_Clear();
+                    if (!convert)
+                        return false;
+                }
+                else {
+                    src_or_index = index;
+                }
+            }
+            if (std::is_unsigned<py_type>::value) {
+                py_value = as_unsigned<py_type>(src_or_index.ptr());
+            } else { // signed integer:
+                py_value = sizeof(T) <= sizeof(long)
+                    ? (py_type) PyLong_AsLong(src_or_index.ptr())
+                    : (py_type) PYBIND11_LONG_AS_LONGLONG(src_or_index.ptr());
+            }
         // Python API reported an error
@@ -1041,15 +163,8 @@ struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_t
         // Check to see if the conversion is valid (integers should match exactly)
         // Signed/unsigned checks happen elsewhere
         if (py_err || (std::is_integral<T>::value && sizeof(py_type) != sizeof(T) && py_value != (py_type) (T) py_value)) {
-            bool type_error = py_err && PyErr_ExceptionMatches(
-#if PY_VERSION_HEX < 0x03000000 && !defined(PYPY_VERSION)
-                PyExc_SystemError
-                PyExc_TypeError
-            );
-            if (type_error && convert && PyNumber_Check(src.ptr())) {
+            if (py_err && convert && (PyNumber_Check(src.ptr()) != 0)) {
                 auto tmp = reinterpret_steal<object>(std::is_floating_point<T>::value
                                                      ? PyNumber_Float(src.ptr())
                                                      : PyNumber_Long(src.ptr()));
@@ -1093,7 +208,7 @@ struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_t
         return PyLong_FromUnsignedLongLong((unsigned long long) src);
-    PYBIND11_TYPE_CASTER(T, _<std::is_integral<T>::value>("int", "float"));
+    PYBIND11_TYPE_CASTER(T, const_name<std::is_integral<T>::value>("int", "float"));
 template<typename T> struct void_caster {
@@ -1106,7 +221,7 @@ template<typename T> struct void_caster {
     static handle cast(T, return_value_policy /* policy */, handle /* parent */) {
         return none().inc_ref();
-    PYBIND11_TYPE_CASTER(T, _("None"));
+    PYBIND11_TYPE_CASTER(T, const_name("None"));
 template <> class type_caster<void_type> : public void_caster<void_type> {};
@@ -1118,7 +233,8 @@ template <> class type_caster<void> : public type_caster<void_type> {
     bool load(handle h, bool) {
         if (!h) {
             return false;
-        } else if (h.is_none()) {
+        }
+        if (h.is_none()) {
             value = nullptr;
             return true;
@@ -1143,13 +259,12 @@ template <> class type_caster<void> : public type_caster<void_type> {
     static handle cast(const void *ptr, return_value_policy /* policy */, handle /* parent */) {
         if (ptr)
             return capsule(ptr).release();
-        else
-            return none().inc_ref();
+        return none().inc_ref();
     template <typename T> using cast_op_type = void*&;
-    operator void *&() { return value; }
-    static constexpr auto name = _("capsule");
+    explicit operator void *&() { return value; }
+    static constexpr auto name = const_name("capsule");
     void *value = nullptr;
@@ -1160,9 +275,15 @@ template <> class type_caster<bool> {
     bool load(handle src, bool convert) {
         if (!src) return false;
-        else if (src.ptr() == Py_True) { value = true; return true; }
-        else if (src.ptr() == Py_False) { value = false; return true; }
-        else if (convert || !strcmp("numpy.bool_", Py_TYPE(src.ptr())->tp_name)) {
+        if (src.ptr() == Py_True) {
+            value = true;
+            return true;
+        }
+        if (src.ptr() == Py_False) {
+            value = false;
+            return true;
+        }
+        if (convert || (std::strcmp("numpy.bool_", Py_TYPE(src.ptr())->tp_name) == 0)) {
             // (allow non-implicit conversion for numpy booleans)
             Py_ssize_t res = -1;
@@ -1184,18 +305,17 @@ template <> class type_caster<bool> {
             if (res == 0 || res == 1) {
-                value = (bool) res;
+                value = (res != 0);
                 return true;
-            } else {
-                PyErr_Clear();
+            PyErr_Clear();
         return false;
     static handle cast(bool src, return_value_policy /* policy */, handle /* parent */) {
         return handle(src ? Py_True : Py_False).inc_ref();
-    PYBIND11_TYPE_CASTER(bool, _("bool"));
+    PYBIND11_TYPE_CASTER(bool, const_name("bool"));
 // Helper class for UTF-{8,16,32} C++ stl strings:
@@ -1222,7 +342,8 @@ template <typename StringType, bool IsView = false> struct string_caster {
         handle load_src = src;
         if (!src) {
             return false;
-        } else if (!PyUnicode_Check(load_src.ptr())) {
+        }
+        if (!PyUnicode_Check(load_src.ptr())) {
             return load_bytes(load_src);
@@ -1240,13 +361,33 @@ template <typename StringType, bool IsView = false> struct string_caster {
-        object utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
+#if PY_VERSION_HEX >= 0x03030000
+        // On Python >= 3.3, for UTF-8 we avoid the need for a temporary `bytes`
+        // object by using `PyUnicode_AsUTF8AndSize`.
+        if (PYBIND11_SILENCE_MSVC_C4127(UTF_N == 8)) {
+            Py_ssize_t size = -1;
+            const auto *buffer
+                = reinterpret_cast<const CharT *>(PyUnicode_AsUTF8AndSize(load_src.ptr(), &size));
+            if (!buffer) {
+                PyErr_Clear();
+                return false;
+            }
+            value = StringType(buffer, static_cast<size_t>(size));
+            return true;
+        }
+        auto utfNbytes = reinterpret_steal<object>(PyUnicode_AsEncodedString(
             load_src.ptr(), UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr));
         if (!utfNbytes) { PyErr_Clear(); return false; }
         const auto *buffer = reinterpret_cast<const CharT *>(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr()));
         size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT);
-        if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32
+        // Skip BOM for UTF-16/32
+        if (PYBIND11_SILENCE_MSVC_C4127(UTF_N > 8)) {
+            buffer++;
+            length--;
+        }
         value = StringType(buffer, length);
         // If we're loading a string_view we need to keep the encoded Python object alive:
@@ -1264,7 +405,7 @@ template <typename StringType, bool IsView = false> struct string_caster {
         return s;
+    PYBIND11_TYPE_CASTER(StringType, const_name(PYBIND11_STRING_NAME));
     static handle decode_utfN(const char *buffer, ssize_t nbytes) {
@@ -1274,10 +415,8 @@ template <typename StringType, bool IsView = false> struct string_caster {
             UTF_N == 16 ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr) :
                           PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr);
-        // PyPy seems to have multiple problems related to PyUnicode_UTF*: the UTF8 version
-        // sometimes segfaults for unknown reasons, while the UTF16 and 32 versions require a
-        // non-const char * arguments, which is also a nuisance, so bypass the whole thing by just
-        // passing the encoding as a string value, which works properly:
+        // PyPy segfaults when on PyUnicode_DecodeUTF16 (and possibly on PyUnicode_DecodeUTF32 as well),
+        // so bypass the whole thing by just passing the encoding as a string value, which works properly:
         return PyUnicode_Decode(buffer, nbytes, UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr);
@@ -1348,8 +487,10 @@ template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type
         return StringCaster::cast(StringType(1, src), policy, parent);
-    operator CharT*() { return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str()); }
-    operator CharT&() {
+    explicit operator CharT *() {
+        return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str());
+    }
+    explicit operator CharT &() {
         if (none)
             throw value_error("Cannot convert None to a character");
@@ -1363,12 +504,16 @@ template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type
         // out how long the first encoded character is in bytes to distinguish between these two
         // errors.  We also allow want to allow unicode characters U+0080 through U+00FF, as those
         // can fit into a single char value.
-        if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
+        if (PYBIND11_SILENCE_MSVC_C4127(StringCaster::UTF_N == 8) && str_len > 1 && str_len <= 4) {
             auto v0 = static_cast<unsigned char>(value[0]);
-            size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127
-                (v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
-                (v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence
-                4; // 0b11110xxx - start of 4-byte sequence
+            // low bits only: 0-127
+            // 0b110xxxxx - start of 2-byte sequence
+            // 0b1110xxxx - start of 3-byte sequence
+            // 0b11110xxx - start of 4-byte sequence
+            size_t char0_bytes = (v0 & 0x80) == 0      ? 1
+                                 : (v0 & 0xE0) == 0xC0 ? 2
+                                 : (v0 & 0xF0) == 0xE0 ? 3
+                                                       : 4;
             if (char0_bytes == str_len) {
                 // If we have a 128-255 value, we can decode it into a single char:
@@ -1384,7 +529,7 @@ template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type
         // UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
         // surrogate pair with total length 2 instantly indicates a range error (but not a "your
         // string was too long" error).
-        else if (StringCaster::UTF_N == 16 && str_len == 2) {
+        else if (PYBIND11_SILENCE_MSVC_C4127(StringCaster::UTF_N == 16) && str_len == 2) {
             one_char = static_cast<CharT>(value[0]);
             if (one_char >= 0xD800 && one_char < 0xE000)
                 throw value_error("Character code point not in range(0x10000)");
@@ -1397,7 +542,7 @@ template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type
         return one_char;
-    static constexpr auto name = _(PYBIND11_STRING_NAME);
+    static constexpr auto name = const_name(PYBIND11_STRING_NAME);
     template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
@@ -1427,18 +572,19 @@ template <template<typename...> class Tuple, typename... Ts> class tuple_caster
     static handle cast(T *src, return_value_policy policy, handle parent) {
         if (!src) return none().release();
         if (policy == return_value_policy::take_ownership) {
-            auto h = cast(std::move(*src), policy, parent); delete src; return h;
-        } else {
-            return cast(*src, policy, parent);
+            auto h = cast(std::move(*src), policy, parent);
+            delete src;
+            return h;
+        return cast(*src, policy, parent);
-    static constexpr auto name = _("Tuple[") + concat(make_caster<Ts>::name...) + _("]");
+    static constexpr auto name = const_name("Tuple[") + concat(make_caster<Ts>::name...) + const_name("]");
     template <typename T> using cast_op_type = type;
-    operator type() & { return implicit_cast(indices{}); }
-    operator type() && { return std::move(*this).implicit_cast(indices{}); }
+    explicit operator type() & { return implicit_cast(indices{}); }
+    explicit operator type() && { return std::move(*this).implicit_cast(indices{}); }
     template <size_t... Is>
@@ -1464,6 +610,8 @@ template <template<typename...> class Tuple, typename... Ts> class tuple_caster
     /* Implementation: Convert a C++ tuple into a Python tuple */
     template <typename T, size_t... Is>
     static handle cast_impl(T &&src, return_value_policy policy, handle parent, index_sequence<Is...>) {
+        PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(src, policy, parent);
         std::array<object, size> entries{{
             reinterpret_steal<object>(make_caster<Ts>::cast(std::get<Is>(std::forward<T>(src)), policy, parent))...
@@ -1494,7 +642,11 @@ struct holder_helper {
 /// Type caster for holder types like std::shared_ptr, etc.
-template <typename type, typename holder_type>
+/// The SFINAE hook is provided to help work around the current lack of support
+/// for smart-pointer interoperability. Please consider it an implementation
+/// detail that may change in the future, as formal support for smart-pointer
+/// interoperability is added into pybind11.
+template <typename type, typename holder_type, typename SFINAE = void>
 struct copyable_holder_caster : public type_caster_base<type> {
     using base = type_caster_base<type>;
@@ -1514,14 +666,7 @@ struct copyable_holder_caster : public type_caster_base<type> {
     // see issue #2180
     explicit operator type&() { return *(static_cast<type *>(this->value)); }
     explicit operator holder_type*() { return std::addressof(holder); }
-    // Workaround for Intel compiler bug
-    // see pybind11 issue 94
-    #if defined(__ICC) || defined(__INTEL_COMPILER)
-    operator holder_type&() { return holder; }
-    #else
     explicit operator holder_type&() { return holder; }
-    #endif
     static handle cast(const holder_type &src, return_value_policy, handle) {
         const auto *ptr = holder_helper<holder_type>::get(src);
@@ -1540,14 +685,14 @@ struct copyable_holder_caster : public type_caster_base<type> {
             value = v_h.value_ptr();
             holder = v_h.template holder<holder_type>();
             return true;
-        } else {
-            throw cast_error("Unable to cast from non-held to held instance (T& to Holder<T>) "
+        }
+        throw cast_error("Unable to cast from non-held to held instance (T& to Holder<T>) "
 #if defined(NDEBUG)
-                             "(compile in debug mode for type information)");
+                         "(compile in debug mode for type information)");
-                             "of type '" + type_id<holder_type>() + "''");
+                         "of type '"
+                         + type_id<holder_type>() + "''");
-        }
     template <typename T = holder_type, detail::enable_if_t<!std::is_constructible<T, const T &, type*>::value, int> = 0>
@@ -1576,7 +721,10 @@ struct copyable_holder_caster : public type_caster_base<type> {
 template <typename T>
 class type_caster<std::shared_ptr<T>> : public copyable_holder_caster<T, std::shared_ptr<T>> { };
-template <typename type, typename holder_type>
+/// Type caster for holder types like std::unique_ptr.
+/// Please consider the SFINAE hook an implementation detail, as explained
+/// in the comment for the copyable_holder_caster.
+template <typename type, typename holder_type, typename SFINAE = void>
 struct move_only_holder_caster {
     static_assert(std::is_base_of<type_caster_base<type>, type_caster<type>>::value,
             "Holder classes are only supported for custom types");
@@ -1616,14 +764,16 @@ template <typename base, typename holder> struct is_holder_type :
 template <typename base, typename deleter> struct is_holder_type<base, std::unique_ptr<base, deleter>> :
     std::true_type {};
-template <typename T> struct handle_type_name { static constexpr auto name = _<T>(); };
-template <> struct handle_type_name<bytes> { static constexpr auto name = _(PYBIND11_BYTES_NAME); };
-template <> struct handle_type_name<int_> { static constexpr auto name = _("int"); };
-template <> struct handle_type_name<iterable> { static constexpr auto name = _("Iterable"); };
-template <> struct handle_type_name<iterator> { static constexpr auto name = _("Iterator"); };
-template <> struct handle_type_name<none> { static constexpr auto name = _("None"); };
-template <> struct handle_type_name<args> { static constexpr auto name = _("*args"); };
-template <> struct handle_type_name<kwargs> { static constexpr auto name = _("**kwargs"); };
+template <typename T> struct handle_type_name { static constexpr auto name = const_name<T>(); };
+template <> struct handle_type_name<bool_> { static constexpr auto name = const_name("bool"); };
+template <> struct handle_type_name<bytes> { static constexpr auto name = const_name(PYBIND11_BYTES_NAME); };
+template <> struct handle_type_name<int_> { static constexpr auto name = const_name("int"); };
+template <> struct handle_type_name<iterable> { static constexpr auto name = const_name("Iterable"); };
+template <> struct handle_type_name<iterator> { static constexpr auto name = const_name("Iterator"); };
+template <> struct handle_type_name<float_> { static constexpr auto name = const_name("float"); };
+template <> struct handle_type_name<none> { static constexpr auto name = const_name("None"); };
+template <> struct handle_type_name<args> { static constexpr auto name = const_name("*args"); };
+template <> struct handle_type_name<kwargs> { static constexpr auto name = const_name("**kwargs"); };
 template <typename type>
 struct pyobject_caster {
@@ -1632,6 +782,17 @@ struct pyobject_caster {
     template <typename T = type, enable_if_t<std::is_base_of<object, T>::value, int> = 0>
     bool load(handle src, bool /* convert */) {
+        // For Python 2, without this implicit conversion, Python code would
+        // need to be cluttered with six.ensure_text() or similar, only to be
+        // un-cluttered later after Python 2 support is dropped.
+        if (PYBIND11_SILENCE_MSVC_C4127(std::is_same<T, str>::value) && isinstance<bytes>(src)) {
+            PyObject *str_from_bytes = PyUnicode_FromEncodedObject(src.ptr(), "utf-8", nullptr);
+            if (!str_from_bytes) throw error_already_set();
+            value = reinterpret_steal<type>(str_from_bytes);
+            return true;
+        }
         if (!isinstance<type>(src))
             return false;
         value = reinterpret_borrow<type>(src);
@@ -1779,8 +940,7 @@ template <typename T> detail::enable_if_t<detail::move_always<T>::value, T> cast
 template <typename T> detail::enable_if_t<detail::move_if_unreferenced<T>::value, T> cast(object &&object) {
     if (object.ref_count() > 1)
         return cast<T>(object);
-    else
-        return move<T>(std::move(object));
+    return move<T>(std::move(object));
 template <typename T> detail::enable_if_t<detail::move_never<T>::value, T> cast(object &&object) {
     return cast<T>(object);
@@ -1820,6 +980,21 @@ template <> inline void cast_safe<void>(object &&) {}
+// The overloads could coexist, i.e. the #if is not strictly speaking needed,
+// but it is an easy minor optimization.
+#if defined(NDEBUG)
+inline cast_error cast_error_unable_to_convert_call_arg() {
+    return cast_error(
+        "Unable to convert call argument to Python object (compile in debug mode for details)");
+inline cast_error cast_error_unable_to_convert_call_arg(const std::string &name,
+                                                        const std::string &type) {
+    return cast_error("Unable to convert call argument '" + name + "' of type '" + type
+                      + "' to Python object");
 template <return_value_policy policy = return_value_policy::automatic_reference>
 tuple make_tuple() { return tuple(0); }
@@ -1833,11 +1008,10 @@ template <return_value_policy policy = return_value_policy::automatic_reference,
     for (size_t i = 0; i < args.size(); i++) {
         if (!args[i]) {
 #if defined(NDEBUG)
-            throw cast_error("make_tuple(): unable to convert arguments to Python object (compile in debug mode for details)");
+            throw cast_error_unable_to_convert_call_arg();
             std::array<std::string, size> argtypes { {type_id<Args>()...} };
-            throw cast_error("make_tuple(): unable to convert argument of type '" +
-                argtypes[i] + "' to Python object");
+            throw cast_error_unable_to_convert_call_arg(std::to_string(i), argtypes[i]);
@@ -1879,7 +1053,14 @@ struct arg_v : arg {
 #if !defined(NDEBUG)
         , type(type_id<T>())
-    { }
+    {
+        // Workaround! See:
+        // https://github.com/pybind/pybind11/issues/2336
+        // https://github.com/pybind/pybind11/pull/2685#issuecomment-731286700
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+        }
+    }
     /// Direct construction with name, default, and description
@@ -1919,7 +1100,9 @@ struct kw_only {};
 struct pos_only {};
 template <typename T>
-arg_v arg::operator=(T &&value) const { return {std::move(*this), std::forward<T>(value)}; }
+arg_v arg::operator=(T &&value) const {
+    return {*this, std::forward<T>(value)};
 /// Alias for backward compatibility -- to be removed in version 2.0
 template <typename /*unused*/> using arg_t = arg_v;
@@ -1933,6 +1116,9 @@ constexpr arg operator"" _a(const char *name, size_t) { return arg(name); }
+template <typename T> using is_kw_only = std::is_same<intrinsic_t<T>, kw_only>;
+template <typename T> using is_pos_only = std::is_same<intrinsic_t<T>, pos_only>;
 // forward declaration (definition in attr.h)
 struct function_record;
@@ -1968,17 +1154,18 @@ class argument_loader {
     template <typename Arg> using argument_is_args   = std::is_same<intrinsic_t<Arg>, args>;
     template <typename Arg> using argument_is_kwargs = std::is_same<intrinsic_t<Arg>, kwargs>;
-    // Get args/kwargs argument positions relative to the end of the argument list:
-    static constexpr auto args_pos = constexpr_first<argument_is_args, Args...>() - (int) sizeof...(Args),
-                        kwargs_pos = constexpr_first<argument_is_kwargs, Args...>() - (int) sizeof...(Args);
+    // Get kwargs argument position, or -1 if not present:
+    static constexpr auto kwargs_pos = constexpr_last<argument_is_kwargs, Args...>();
-    static constexpr bool args_kwargs_are_last = kwargs_pos >= - 1 && args_pos >= kwargs_pos - 1;
-    static_assert(args_kwargs_are_last, "py::args/py::kwargs are only permitted as the last argument(s) of a function");
+    static_assert(kwargs_pos == -1 || kwargs_pos == (int) sizeof...(Args) - 1, "py::kwargs is only permitted as the last argument of a function");
-    static constexpr bool has_kwargs = kwargs_pos < 0;
-    static constexpr bool has_args = args_pos < 0;
+    static constexpr bool has_kwargs = kwargs_pos != -1;
+    // py::args argument position; -1 if not present.
+    static constexpr int args_pos = constexpr_last<argument_is_args, Args...>();
+    static_assert(args_pos == -1 || args_pos == constexpr_first<argument_is_args, Args...>(), "py::args cannot be specified more than once");
     static constexpr auto arg_names = concat(type_descr(make_caster<Args>::name)...);
@@ -1987,13 +1174,14 @@ class argument_loader {
     template <typename Return, typename Guard, typename Func>
+    // NOLINTNEXTLINE(readability-const-return-type)
     enable_if_t<!std::is_void<Return>::value, Return> call(Func &&f) && {
-        return std::move(*this).template call_impl<Return>(std::forward<Func>(f), indices{}, Guard{});
+        return std::move(*this).template call_impl<remove_cv_t<Return>>(std::forward<Func>(f), indices{}, Guard{});
     template <typename Return, typename Guard, typename Func>
     enable_if_t<std::is_void<Return>::value, void_type> call(Func &&f) && {
-        std::move(*this).template call_impl<Return>(std::forward<Func>(f), indices{}, Guard{});
+        std::move(*this).template call_impl<remove_cv_t<Return>>(std::forward<Func>(f), indices{}, Guard{});
         return void_type();
@@ -2057,8 +1245,8 @@ class unpacking_collector {
         // Tuples aren't (easily) resizable so a list is needed for collection,
         // but the actual function call strictly requires a tuple.
         auto args_list = list();
-        int _[] = { 0, (process(args_list, std::forward<Ts>(values)), 0)... };
-        ignore_unused(_);
+        using expander = int[];
+        (void) expander{0, (process(args_list, std::forward<Ts>(values)), 0)...};
         m_args = std::move(args_list);
@@ -2083,16 +1271,17 @@ class unpacking_collector {
         auto o = reinterpret_steal<object>(detail::make_caster<T>::cast(std::forward<T>(x), policy, {}));
         if (!o) {
 #if defined(NDEBUG)
-            argument_cast_error();
+            throw cast_error_unable_to_convert_call_arg();
-            argument_cast_error(std::to_string(args_list.size()), type_id<T>());
+            throw cast_error_unable_to_convert_call_arg(
+                std::to_string(args_list.size()), type_id<T>());
     void process(list &args_list, detail::args_proxy ap) {
-        for (const auto &a : ap)
+        for (auto a : ap)
@@ -2113,9 +1302,9 @@ class unpacking_collector {
         if (!a.value) {
 #if defined(NDEBUG)
-            argument_cast_error();
+            throw cast_error_unable_to_convert_call_arg();
-            argument_cast_error(a.name, a.type);
+            throw cast_error_unable_to_convert_call_arg(a.name, a.type);
         m_kwargs[a.name] = a.value;
@@ -2124,7 +1313,7 @@ class unpacking_collector {
     void process(list &/*args_list*/, detail::kwargs_proxy kp) {
         if (!kp)
-        for (const auto &k : reinterpret_borrow<dict>(kp)) {
+        for (auto k : reinterpret_borrow<dict>(kp)) {
             if (m_kwargs.contains(k.first)) {
 #if defined(NDEBUG)
@@ -2141,7 +1330,7 @@ class unpacking_collector {
                          "may be passed via py::arg() to a python function call. "
                          "(compile in debug mode for details)");
-    [[noreturn]] static void nameless_argument_error(std::string type) {
+    [[noreturn]] static void nameless_argument_error(const std::string &type) {
         throw type_error("Got kwargs without a name of type '" + type + "'; only named "
                          "arguments may be passed via py::arg() to a python function call. ");
@@ -2150,35 +1339,35 @@ class unpacking_collector {
                          "(compile in debug mode for details)");
-    [[noreturn]] static void multiple_values_error(std::string name) {
+    [[noreturn]] static void multiple_values_error(const std::string &name) {
         throw type_error("Got multiple values for keyword argument '" + name + "'");
-    [[noreturn]] static void argument_cast_error() {
-        throw cast_error("Unable to convert call argument to Python object "
-                         "(compile in debug mode for details)");
-    }
-    [[noreturn]] static void argument_cast_error(std::string name, std::string type) {
-        throw cast_error("Unable to convert call argument '" + name
-                         + "' of type '" + type + "' to Python object");
-    }
     tuple m_args;
     dict m_kwargs;
+// [workaround(intel)] Separate function required here
+// We need to put this into a separate function because the Intel compiler
+// fails to compile enable_if_t<!all_of<is_positional<Args>...>::value>
+// (tested with ICC 2021.1 Beta 20200827).
+template <typename... Args>
+constexpr bool args_are_all_positional()
+  return all_of<is_positional<Args>...>::value;
 /// Collect only positional arguments for a Python function call
 template <return_value_policy policy, typename... Args,
-          typename = enable_if_t<all_of<is_positional<Args>...>::value>>
+          typename = enable_if_t<args_are_all_positional<Args...>()>>
 simple_collector<policy> collect_arguments(Args &&...args) {
     return simple_collector<policy>(std::forward<Args>(args)...);
 /// Collect all arguments, including keywords and unpacking (only instantiated when needed)
 template <return_value_policy policy, typename... Args,
-          typename = enable_if_t<!all_of<is_positional<Args>...>::value>>
+          typename = enable_if_t<!args_are_all_positional<Args...>()>>
 unpacking_collector<policy> collect_arguments(Args &&...args) {
     // Following argument order rules for generalized unpacking according to PEP 448
@@ -2193,6 +1382,11 @@ unpacking_collector<policy> collect_arguments(Args &&...args) {
 template <typename Derived>
 template <return_value_policy policy, typename... Args>
 object object_api<Derived>::operator()(Args &&...args) const {
+#if !defined(NDEBUG) && PY_VERSION_HEX >= 0x03060000
+    if (!PyGILState_Check()) {
+        pybind11_fail("pybind11::object_api<>::operator() PyGILState_Check() failure.");
+    }
     return detail::collect_arguments<policy>(std::forward<Args>(args)...).call(derived().ptr());
diff --git a/wrap/pybind11/include/pybind11/chrono.h b/wrap/pybind11/include/pybind11/chrono.h
index cbe9acec35..460a28fa5d 100644
--- a/wrap/pybind11/include/pybind11/chrono.h
+++ b/wrap/pybind11/include/pybind11/chrono.h
@@ -11,9 +11,12 @@
 #pragma once
 #include "pybind11.h"
+#include <chrono>
 #include <cmath>
 #include <ctime>
-#include <chrono>
+#include <mutex>
 #include <datetime.h>
 // Backport the PyDateTime_DELTA functions from Python3.3 if required
@@ -32,10 +35,10 @@ PYBIND11_NAMESPACE_BEGIN(detail)
 template <typename type> class duration_caster {
-    typedef typename type::rep rep;
+    using rep = typename type::rep;
     using period = typename type::period;
-    using days = std::chrono::duration<uint_fast32_t, std::ratio<86400>>;
+    using days = std::chrono::duration<int_least32_t, std::ratio<86400>>; // signed 25 bits required by the standard.
     bool load(handle src, bool) {
         using namespace std::chrono;
@@ -53,11 +56,11 @@ template <typename type> class duration_caster {
             return true;
         // If invoked with a float we assume it is seconds and convert
-        else if (PyFloat_Check(src.ptr())) {
+        if (PyFloat_Check(src.ptr())) {
             value = type(duration_cast<duration<rep, period>>(duration<double>(PyFloat_AsDouble(src.ptr()))));
             return true;
-        else return false;
+        return false;
     // If this is a duration just return it back
@@ -92,9 +95,25 @@ template <typename type> class duration_caster {
         return PyDelta_FromDSU(dd.count(), ss.count(), us.count());
-    PYBIND11_TYPE_CASTER(type, _("datetime.timedelta"));
+    PYBIND11_TYPE_CASTER(type, const_name("datetime.timedelta"));
+inline std::tm *localtime_thread_safe(const std::time_t *time, std::tm *buf) {
+#if (defined(__STDC_LIB_EXT1__) && defined(__STDC_WANT_LIB_EXT1__)) || defined(_MSC_VER)
+    if (localtime_s(buf, time))
+        return nullptr;
+    return buf;
+    static std::mutex mtx;
+    std::lock_guard<std::mutex> lock(mtx);
+    std::tm *tm_ptr = std::localtime(time);
+    if (tm_ptr != nullptr) {
+        *buf = *tm_ptr;
+    }
+    return tm_ptr;
 // This is for casting times on the system clock into datetime.datetime instances
 template <typename Duration> class type_caster<std::chrono::time_point<std::chrono::system_clock, Duration>> {
@@ -161,10 +180,11 @@ template <typename Duration> class type_caster<std::chrono::time_point<std::chro
         // > If std::time_t has lower precision, it is implementation-defined whether the value is rounded or truncated.
         // (https://en.cppreference.com/w/cpp/chrono/system_clock/to_time_t)
         std::time_t tt = system_clock::to_time_t(time_point_cast<system_clock::duration>(src - us));
-        // this function uses static memory so it's best to copy it out asap just in case
-        // otherwise other code that is using localtime may break this (not just python code)
-        std::tm localtime = *std::localtime(&tt);
+        std::tm localtime;
+        std::tm *localtime_ptr = localtime_thread_safe(&tt, &localtime);
+        if (!localtime_ptr)
+            throw cast_error("Unable to represent system_clock in local time");
         return PyDateTime_FromDateAndTime(localtime.tm_year + 1900,
                                           localtime.tm_mon + 1,
@@ -173,7 +193,7 @@ template <typename Duration> class type_caster<std::chrono::time_point<std::chro
-    PYBIND11_TYPE_CASTER(type, _("datetime.datetime"));
+    PYBIND11_TYPE_CASTER(type, const_name("datetime.datetime"));
 // Other clocks that are not the system clock are not measured as datetime.datetime objects
diff --git a/wrap/pybind11/include/pybind11/complex.h b/wrap/pybind11/include/pybind11/complex.h
index f8327eb373..e1ecf43585 100644
--- a/wrap/pybind11/include/pybind11/complex.h
+++ b/wrap/pybind11/include/pybind11/complex.h
@@ -59,7 +59,7 @@ template <typename T> class type_caster<std::complex<T>> {
         return PyComplex_FromDoubles((double) src.real(), (double) src.imag());
-    PYBIND11_TYPE_CASTER(std::complex<T>, _("complex"));
+    PYBIND11_TYPE_CASTER(std::complex<T>, const_name("complex"));
diff --git a/wrap/pybind11/include/pybind11/detail/class.h b/wrap/pybind11/include/pybind11/detail/class.h
index b4a11c0a04..cc1e40ce7a 100644
--- a/wrap/pybind11/include/pybind11/detail/class.h
+++ b/wrap/pybind11/include/pybind11/detail/class.h
@@ -24,6 +24,18 @@ PYBIND11_NAMESPACE_BEGIN(detail)
 #  define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) setattr((PyObject *) obj, "__qualname__", nameobj)
+inline std::string get_fully_qualified_tp_name(PyTypeObject *type) {
+#if !defined(PYPY_VERSION)
+    return type->tp_name;
+    auto module_name = handle((PyObject *) type).attr("__module__").cast<std::string>();
+    if (module_name == PYBIND11_BUILTINS_MODULE)
+        return type->tp_name;
+    else
+        return std::move(module_name) + "." + type->tp_name;
 inline PyTypeObject *type_incref(PyTypeObject *type) {
     return type;
@@ -117,8 +129,9 @@ extern "C" inline int pybind11_meta_setattro(PyObject* obj, PyObject* name, PyOb
     //   2. `Type.static_prop = other_static_prop` --> setattro:  replace existing `static_prop`
     //   3. `Type.regular_attribute = value`       --> setattro:  regular attribute assignment
     const auto static_prop = (PyObject *) get_internals().static_property_type;
-    const auto call_descr_set = descr && PyObject_IsInstance(descr, static_prop)
-                                && !PyObject_IsInstance(value, static_prop);
+    const auto call_descr_set = (descr != nullptr) && (value != nullptr)
+                                && (PyObject_IsInstance(descr, static_prop) != 0)
+                                && (PyObject_IsInstance(value, static_prop) == 0);
     if (call_descr_set) {
         // Call `static_property.__set__()` instead of replacing the `static_property`.
 #if !defined(PYPY_VERSION)
@@ -150,9 +163,7 @@ extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name
         return descr;
-    else {
-        return PyType_Type.tp_getattro(obj, name);
-    }
+    return PyType_Type.tp_getattro(obj, name);
@@ -172,7 +183,7 @@ extern "C" inline PyObject *pybind11_meta_call(PyObject *type, PyObject *args, P
     for (const auto &vh : values_and_holders(instance)) {
         if (!vh.holder_constructed()) {
             PyErr_Format(PyExc_TypeError, "%.200s.__init__() must be called when overriding __init__",
-                         vh.type->type->tp_name);
+                         get_fully_qualified_tp_name(vh.type->type).c_str());
             return nullptr;
@@ -181,6 +192,44 @@ extern "C" inline PyObject *pybind11_meta_call(PyObject *type, PyObject *args, P
     return self;
+/// Cleanup the type-info for a pybind11-registered type.
+extern "C" inline void pybind11_meta_dealloc(PyObject *obj) {
+    auto *type = (PyTypeObject *) obj;
+    auto &internals = get_internals();
+    // A pybind11-registered type will:
+    // 1) be found in internals.registered_types_py
+    // 2) have exactly one associated `detail::type_info`
+    auto found_type = internals.registered_types_py.find(type);
+    if (found_type != internals.registered_types_py.end() &&
+        found_type->second.size() == 1 &&
+        found_type->second[0]->type == type) {
+        auto *tinfo = found_type->second[0];
+        auto tindex = std::type_index(*tinfo->cpptype);
+        internals.direct_conversions.erase(tindex);
+        if (tinfo->module_local)
+            get_local_internals().registered_types_cpp.erase(tindex);
+        else
+            internals.registered_types_cpp.erase(tindex);
+        internals.registered_types_py.erase(tinfo->type);
+        // Actually just `std::erase_if`, but that's only available in C++20
+        auto &cache = internals.inactive_override_cache;
+        for (auto it = cache.begin(), last = cache.end(); it != last; ) {
+            if (it->first == (PyObject *) tinfo->type)
+                it = cache.erase(it);
+            else
+                ++it;
+        }
+        delete tinfo;
+    }
+    PyType_Type.tp_dealloc(obj);
 /** This metaclass is assigned by default to all pybind11 types and is required in order
     for static properties to function correctly. Users may override this using `py::metaclass`.
     Return value: New reference. */
@@ -213,6 +262,8 @@ inline PyTypeObject* make_default_metaclass() {
     type->tp_getattro = pybind11_meta_getattro;
+    type->tp_dealloc = pybind11_meta_dealloc;
     if (PyType_Ready(type) < 0)
         pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!");
@@ -250,7 +301,7 @@ inline bool deregister_instance_impl(void *ptr, instance *self) {
     auto &registered_instances = get_internals().registered_instances;
     auto range = registered_instances.equal_range(ptr);
     for (auto it = range.first; it != range.second; ++it) {
-        if (Py_TYPE(self) == Py_TYPE(it->second)) {
+        if (self == it->second) {
             return true;
@@ -277,7 +328,7 @@ inline bool deregister_instance(instance *self, void *valptr, const type_info *t
 inline PyObject *make_new_instance(PyTypeObject *type) {
 #if defined(PYPY_VERSION)
     // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first inherited
-    // object is a a plain Python type (i.e. not derived from an extension type).  Fix it.
+    // object is a plain Python type (i.e. not derived from an extension type).  Fix it.
     ssize_t instance_size = static_cast<ssize_t>(sizeof(instance));
     if (type->tp_basicsize < instance_size) {
         type->tp_basicsize = instance_size;
@@ -288,8 +339,6 @@ inline PyObject *make_new_instance(PyTypeObject *type) {
     // Allocate the value/holder internals:
-    inst->owned = true;
     return self;
@@ -304,12 +353,7 @@ extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *,
 /// following default function will be used which simply throws an exception.
 extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) {
     PyTypeObject *type = Py_TYPE(self);
-    std::string msg;
-#if defined(PYPY_VERSION)
-    msg += handle((PyObject *) type).attr("__module__").cast<std::string>() + ".";
-    msg += type->tp_name;
-    msg += ": No constructor defined!";
+    std::string msg = get_fully_qualified_tp_name(type) + ": No constructor defined!";
     PyErr_SetString(PyExc_TypeError, msg.c_str());
     return -1;
@@ -448,7 +492,7 @@ extern "C" inline PyObject *pybind11_get_dict(PyObject *self, void *) {
 extern "C" inline int pybind11_set_dict(PyObject *self, PyObject *new_dict, void *) {
     if (!PyDict_Check(new_dict)) {
         PyErr_Format(PyExc_TypeError, "__dict__ must be set to a dictionary, not a '%.200s'",
-                     Py_TYPE(new_dict)->tp_name);
+                     get_fully_qualified_tp_name(Py_TYPE(new_dict)).c_str());
         return -1;
     PyObject *&dict = *_PyObject_GetDictPtr(self);
@@ -475,11 +519,6 @@ extern "C" inline int pybind11_clear(PyObject *self) {
 /// Give instances of this type a `__dict__` and opt into garbage collection.
 inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) {
     auto type = &heap_type->ht_type;
-#if defined(PYPY_VERSION) && (PYPY_VERSION_NUM < 0x06000000)
-    pybind11_fail(std::string(type->tp_name) + ": dynamic attributes are "
-                                               "currently not supported in "
-                                               "conjunction with PyPy!");
     type->tp_flags |= Py_TPFLAGS_HAVE_GC;
     type->tp_dictoffset = type->tp_basicsize; // place dict at the end
     type->tp_basicsize += (ssize_t)sizeof(PyObject *); // and allocate enough space for it
@@ -510,6 +549,12 @@ extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int fla
     std::memset(view, 0, sizeof(Py_buffer));
     buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
+    if ((flags & PyBUF_WRITABLE) == PyBUF_WRITABLE && info->readonly) {
+        delete info;
+        // view->obj = nullptr;  // Was just memset to 0, so not necessary
+        PyErr_SetString(PyExc_BufferError, "Writable buffer requested for readonly storage");
+        return -1;
+    }
     view->obj = obj;
     view->ndim = 1;
     view->internal = info;
@@ -518,13 +563,7 @@ extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int fla
     view->len = view->itemsize;
     for (auto s : info->shape)
         view->len *= s;
-    view->readonly = info->readonly;
-    if ((flags & PyBUF_WRITABLE) == PyBUF_WRITABLE && info->readonly) {
-        if (view)
-            view->obj = nullptr;
-        PyErr_SetString(PyExc_BufferError, "Writable buffer requested for readonly storage");
-        return -1;
-    }
+    view->readonly = static_cast<int>(info->readonly);
     if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
         view->format = const_cast<char *>(info->format.c_str());
     if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
@@ -567,17 +606,17 @@ inline PyObject* make_new_python_type(const type_record &rec) {
-    object module;
+    object module_;
     if (rec.scope) {
         if (hasattr(rec.scope, "__module__"))
-            module = rec.scope.attr("__module__");
+            module_ = rec.scope.attr("__module__");
         else if (hasattr(rec.scope, "__name__"))
-            module = rec.scope.attr("__name__");
+            module_ = rec.scope.attr("__name__");
     auto full_name = c_str(
 #if !defined(PYPY_VERSION)
-        module ? str(module).cast<std::string>() + "." + rec.name :
+        module_ ? str(module_).cast<std::string>() + "." + rec.name :
@@ -585,9 +624,9 @@ inline PyObject* make_new_python_type(const type_record &rec) {
     if (rec.doc && options::show_user_defined_docstrings()) {
         /* Allocate memory for docstring (using PyObject_MALLOC, since
            Python will free this later on) */
-        size_t size = strlen(rec.doc) + 1;
+        size_t size = std::strlen(rec.doc) + 1;
         tp_doc = (char *) PyObject_MALLOC(size);
-        memcpy((void *) tp_doc, rec.doc, size);
+        std::memcpy((void *) tp_doc, rec.doc, size);
     auto &internals = get_internals();
@@ -644,11 +683,13 @@ inline PyObject* make_new_python_type(const type_record &rec) {
     if (rec.buffer_protocol)
+    if (rec.custom_type_setup_callback)
+        rec.custom_type_setup_callback(heap_type);
     if (PyType_Ready(type) < 0)
         pybind11_fail(std::string(rec.name) + ": PyType_Ready failed (" + error_string() + ")!");
-    assert(rec.dynamic_attr ? PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)
-                            : !PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+    assert(!rec.dynamic_attr || PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
     /* Register type with the parent scope */
     if (rec.scope)
@@ -656,8 +697,8 @@ inline PyObject* make_new_python_type(const type_record &rec) {
         Py_INCREF(type); // Keep it alive forever (reference leak)
-    if (module) // Needed by pydoc
-        setattr((PyObject *) type, "__module__", module);
+    if (module_) // Needed by pydoc
+        setattr((PyObject *) type, "__module__", module_);
     PYBIND11_SET_OLDPY_QUALNAME(type, qualname);
diff --git a/wrap/pybind11/include/pybind11/detail/common.h b/wrap/pybind11/include/pybind11/detail/common.h
index 1f8390fbab..5c59b41417 100644
--- a/wrap/pybind11/include/pybind11/detail/common.h
+++ b/wrap/pybind11/include/pybind11/detail/common.h
@@ -10,8 +10,12 @@
 #pragma once
-#define PYBIND11_VERSION_PATCH 0.dev1
+// Similar to Python's convention: https://docs.python.org/3/c-api/apiabiversion.html
+// Additional convention: 0xD = dev
+#define PYBIND11_VERSION_HEX 0x02090100
 #define PYBIND11_NAMESPACE_BEGIN(name) namespace name {
 #define PYBIND11_NAMESPACE_END(name) }
@@ -27,11 +31,14 @@
 #  endif
-#if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER)
+#if !(defined(_MSC_VER) && __cplusplus == 199711L)
 #  if __cplusplus >= 201402L
 #    define PYBIND11_CPP14
 #    if __cplusplus >= 201703L
 #      define PYBIND11_CPP17
+#      if __cplusplus >= 202002L
+#        define PYBIND11_CPP20
+#      endif
 #    endif
 #  endif
 #elif defined(_MSC_VER) && __cplusplus == 199711L
@@ -41,15 +48,23 @@
 #    define PYBIND11_CPP14
 #    if _MSVC_LANG > 201402L && _MSC_VER >= 1910
 #      define PYBIND11_CPP17
+#      if _MSVC_LANG >= 202002L
+#        define PYBIND11_CPP20
+#      endif
 #    endif
 #  endif
 // Compiler version assertions
 #if defined(__INTEL_COMPILER)
-#  if __INTEL_COMPILER < 1700
-#    error pybind11 requires Intel C++ compiler v17 or newer
+#  if __INTEL_COMPILER < 1800
+#    error pybind11 requires Intel C++ compiler v18 or newer
+#  elif __INTEL_COMPILER < 1900 && defined(PYBIND11_CPP14)
+#    error pybind11 supports only C++11 with Intel C++ compiler v18. Use v19 or newer for C++14.
 #  endif
+/* The following pragma cannot be pop'ed:
+   https://community.intel.com/t5/Intel-C-Compiler/Inline-and-no-inline-warning/td-p/1216764 */
+#  pragma warning disable 2196 // warning #2196: routine is both "inline" and "noinline"
 #elif defined(__clang__) && !defined(__apple_build_version__)
 #  if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 3)
 #    error pybind11 requires clang 3.3 or newer
@@ -80,13 +95,43 @@
 #  endif
-#if defined(_MSC_VER)
-#  define PYBIND11_NOINLINE __declspec(noinline)
+#  ifdef __MINGW32__
+// workaround for:
+// error: 'dllexport' implies default visibility, but xxx has already been declared with a different visibility
+#  else
+#  endif
+// For CUDA, GCC7, GCC8:
+// PYBIND11_NOINLINE_FORCED is incompatible with `-Wattributes -Werror`.
+// When defining PYBIND11_NOINLINE_FORCED, it is best to also use `-Wno-attributes`.
+// However, the measured shared-library size saving when using noinline are only
+// 1.7% for CUDA, -0.2% for GCC7, and 0.0% for GCC8 (using -DCMAKE_BUILD_TYPE=MinSizeRel,
+// the default under pybind11/tests).
+#if !defined(PYBIND11_NOINLINE_FORCED) && \
+    (defined(__CUDACC__) || (defined(__GNUC__) && (__GNUC__ == 7 || __GNUC__ == 8)))
+// The PYBIND11_NOINLINE macro is for function DEFINITIONS.
+// In contrast, FORWARD DECLARATIONS should never use this macro:
+// https://stackoverflow.com/questions/9317473/forward-declaration-of-inline-functions
+#if defined(PYBIND11_NOINLINE_DISABLED) // Option for maximum portability and experimentation.
+#  define PYBIND11_NOINLINE inline
+#elif defined(_MSC_VER)
+#  define PYBIND11_NOINLINE __declspec(noinline) inline
-#  define PYBIND11_NOINLINE __attribute__ ((noinline))
+#  define PYBIND11_NOINLINE __attribute__ ((noinline)) inline
-#if defined(PYBIND11_CPP14)
+#if defined(__MINGW32__)
+// For unknown reasons all PYBIND11_DEPRECATED member trigger a warning when declared
+// whether it is used or not
+#  define PYBIND11_DEPRECATED(reason)
+#elif defined(PYBIND11_CPP14)
 #  define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]]
 #  define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason)))
@@ -112,13 +157,61 @@
 #    define HAVE_ROUND 1
 #  endif
 #  pragma warning(push)
-#  pragma warning(disable: 4510 4610 4512 4005)
+// C4505: 'PySlice_GetIndicesEx': unreferenced local function has been removed (PyPy only)
+#  pragma warning(disable: 4505)
 #  if defined(_DEBUG) && !defined(Py_DEBUG)
+// Workaround for a VS 2022 issue.
+// NOTE: This workaround knowingly violates the Python.h include order requirement:
+// https://docs.python.org/3/c-api/intro.html#include-files
+// See https://github.com/pybind/pybind11/pull/3497 for full context.
+#    include <yvals.h>
+#    if _MSVC_STL_VERSION >= 143
+#      include <crtdefs.h>
+#    endif
 #    define PYBIND11_DEBUG_MARKER
 #    undef _DEBUG
 #  endif
+// https://en.cppreference.com/w/c/chrono/localtime
+#if defined(__STDC_LIB_EXT1__) && !defined(__STDC_WANT_LIB_EXT1__)
+#  define __STDC_WANT_LIB_EXT1__
+#ifdef __has_include
+// std::optional (but including it in c++14 mode isn't allowed)
+#  if defined(PYBIND11_CPP17) && __has_include(<optional>)
+#    define PYBIND11_HAS_OPTIONAL 1
+#  endif
+// std::experimental::optional (but not allowed in c++11 mode)
+#  if defined(PYBIND11_CPP14) && (__has_include(<experimental/optional>) && \
+                                 !__has_include(<optional>))
+#    define PYBIND11_HAS_EXP_OPTIONAL 1
+#  endif
+// std::variant
+#  if defined(PYBIND11_CPP17) && __has_include(<variant>)
+#    define PYBIND11_HAS_VARIANT 1
+#  endif
+#elif defined(_MSC_VER) && defined(PYBIND11_CPP17)
+#  define PYBIND11_HAS_OPTIONAL 1
+#  define PYBIND11_HAS_VARIANT 1
+#if defined(PYBIND11_CPP17)
+#  if defined(__has_include)
+#    if __has_include(<string_view>)
+#      define PYBIND11_HAS_STRING_VIEW
+#    endif
+#  elif defined(_MSC_VER)
+#  endif
+#if defined(__cpp_lib_char8_t) && __cpp_lib_char8_t >= 201811L
+#  define PYBIND11_HAS_U8STRING
 #include <Python.h>
 #include <frameobject.h>
 #include <pythread.h>
@@ -160,6 +253,24 @@
 #include <memory>
 #include <typeindex>
 #include <type_traits>
+#if defined(__has_include)
+#  if __has_include(<version>)
+#    include <version>
+#  endif
+// If DEFINED, pybind11::str can hold PyUnicodeObject or PyBytesObject
+//             (probably surprising and never documented, but this was the
+//             legacy behavior until and including v2.6.x). As a side-effect,
+//             pybind11::isinstance<str>() is true for both pybind11::str and
+//             pybind11::bytes.
+// If UNDEFINED, pybind11::str can only hold PyUnicodeObject, and
+//               pybind11::isinstance<str>() is true only for pybind11::str.
+//               However, for Python 2 only (!), the pybind11::str caster
+//               implicitly decodes bytes to PyUnicodeObject. This is to ease
+//               the transition from the legacy behavior to the non-permissive
+//               behavior.
 #if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
 #define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
@@ -173,8 +284,8 @@
 #define PYBIND11_BYTES_SIZE PyBytes_Size
 #define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
 #define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
-#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) o)
-#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) o)
+#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) (o))
+#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) (o))
 #define PYBIND11_BYTES_NAME "bytes"
 #define PYBIND11_STRING_NAME "str"
 #define PYBIND11_SLICE_OBJECT PyObject
@@ -182,6 +293,7 @@
 #define PYBIND11_STR_TYPE ::pybind11::str
 #define PYBIND11_BOOL_ATTR "__bool__"
 #define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_bool)
+#define PYBIND11_BUILTINS_MODULE "builtins"
 // Providing a separate declaration to make Clang's -Wmissing-prototypes happy.
 // See comment for PYBIND11_MODULE below for why this is marked "maybe unused".
 #define PYBIND11_PLUGIN_IMPL(name) \
@@ -209,6 +321,7 @@
 #define PYBIND11_STR_TYPE ::pybind11::bytes
 #define PYBIND11_BOOL_ATTR "__nonzero__"
 #define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_nonzero)
+#define PYBIND11_BUILTINS_MODULE "__builtin__"
 // Providing a separate PyInit decl to make Clang's -Wmissing-prototypes happy.
 // See comment for PYBIND11_MODULE below for why this is marked "maybe unused".
 #define PYBIND11_PLUGIN_IMPL(name) \
@@ -250,6 +363,19 @@ extern "C" {
         }                                                                      \
+#if PY_VERSION_HEX >= 0x03030000
+        catch (pybind11::error_already_set &e) {                                 \
+            pybind11::raise_from(e, PyExc_ImportError, "initialization failed"); \
+            return nullptr;                                                      \
+        } catch (const std::exception &e) {                                      \
+            PyErr_SetString(PyExc_ImportError, e.what());                        \
+            return nullptr;                                                      \
+        }                                                                        \
         catch (pybind11::error_already_set &e) {                               \
             PyErr_SetString(PyExc_ImportError, e.what());                      \
@@ -259,17 +385,19 @@ extern "C" {
             return nullptr;                                                    \
         }                                                                      \
 /** \rst
     ***Deprecated in favor of PYBIND11_MODULE***
     This macro creates the entry point that will be invoked when the Python interpreter
-    imports a plugin library. Please create a `module` in the function body and return
+    imports a plugin library. Please create a `module_` in the function body and return
     the pointer to its underlying Python object at the end.
     .. code-block:: cpp
         PYBIND11_PLUGIN(example) {
-            pybind11::module m("example", "pybind11 example plugin");
+            pybind11::module_ m("example", "pybind11 example plugin");
             /// Set up bindings here
             return m.ptr();
@@ -290,7 +418,7 @@ extern "C" {
     This macro creates the entry point that will be invoked when the Python interpreter
     imports an extension module. The module name is given as the fist argument and it
     should not be in quotes. The second macro argument defines a variable of type
-    `py::module` which can be used to initialize the module.
+    `py::module_` which can be used to initialize the module.
     The entry point is marked as "maybe unused" to aid dead-code detection analysis:
     since the entry point is typically only looked up at runtime and not referenced
@@ -307,26 +435,35 @@ extern "C" {
 \endrst */
-#define PYBIND11_MODULE(name, variable)                                        \
-    PYBIND11_MAYBE_UNUSED                                                      \
-    static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);     \
-    PYBIND11_PLUGIN_IMPL(name) {                                               \
-        PYBIND11_CHECK_PYTHON_VERSION                                          \
-        PYBIND11_ENSURE_INTERNALS_READY                                        \
-        auto m = pybind11::module(PYBIND11_TOSTRING(name));                    \
-        try {                                                                  \
-            PYBIND11_CONCAT(pybind11_init_, name)(m);                          \
-            return m.ptr();                                                    \
-        } PYBIND11_CATCH_INIT_EXCEPTIONS                                       \
-    }                                                                          \
-    void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
+#define PYBIND11_MODULE(name, variable)                                                           \
+    static ::pybind11::module_::module_def PYBIND11_CONCAT(pybind11_module_def_, name)            \
+        PYBIND11_MAYBE_UNUSED;                                                                    \
+    PYBIND11_MAYBE_UNUSED                                                                         \
+    static void PYBIND11_CONCAT(pybind11_init_, name)(::pybind11::module_ &);                     \
+    PYBIND11_PLUGIN_IMPL(name) {                                                                  \
+        PYBIND11_CHECK_PYTHON_VERSION                                                             \
+        PYBIND11_ENSURE_INTERNALS_READY                                                           \
+        auto m = ::pybind11::module_::create_extension_module(                                    \
+            PYBIND11_TOSTRING(name), nullptr, &PYBIND11_CONCAT(pybind11_module_def_, name));      \
+        try {                                                                                     \
+            PYBIND11_CONCAT(pybind11_init_, name)(m);                                             \
+            return m.ptr();                                                                       \
+        }                                                                                         \
+        PYBIND11_CATCH_INIT_EXCEPTIONS                                                            \
+    }                                                                                             \
+    void PYBIND11_CONCAT(pybind11_init_, name)(::pybind11::module_ & (variable))
 using ssize_t = Py_ssize_t;
 using size_t  = std::size_t;
+template <typename IntType>
+inline ssize_t ssize_t_cast(const IntType &val) {
+    static_assert(sizeof(IntType) <= sizeof(ssize_t), "Implicit narrowing is not permitted.");
+    return static_cast<ssize_t>(val);
 /// Approach used to cast a previously unknown C++ instance into a Python object
 enum class return_value_policy : uint8_t {
     /** This is the default return value policy, which falls back to the policy
@@ -481,6 +618,18 @@ template <typename T> using remove_cv_t = typename std::remove_cv<T>::type;
 template <typename T> using remove_reference_t = typename std::remove_reference<T>::type;
+#if defined(PYBIND11_CPP20)
+using std::remove_cvref;
+using std::remove_cvref_t;
+template <class T>
+struct remove_cvref {
+    using type = remove_cv_t<remove_reference_t<T>>;
+template <class T>
+using remove_cvref_t = typename remove_cvref<T>::type;
 /// Index sequences
 #if defined(PYBIND11_CPP14)
 using std::index_sequence;
@@ -488,7 +637,7 @@ using std::make_index_sequence;
 template<size_t ...> struct index_sequence  { };
 template<size_t N, size_t ...S> struct make_index_sequence_impl : make_index_sequence_impl <N - 1, N - 1, S...> { };
-template<size_t ...S> struct make_index_sequence_impl <0, S...> { typedef index_sequence<S...> type; };
+template<size_t ...S> struct make_index_sequence_impl <0, S...> { using type = index_sequence<S...>; };
 template<size_t N> using make_index_sequence = typename make_index_sequence_impl<N>::type;
@@ -502,10 +651,10 @@ template <bool... Bs> using select_indices = typename select_indices_impl<index_
 template <bool B> using bool_constant = std::integral_constant<bool, B>;
 template <typename T> struct negation : bool_constant<!T::value> { };
-// PGI cannot detect operator delete with the "compatible" void_t impl, so
+// PGI/Intel cannot detect operator delete with the "compatible" void_t impl, so
 // using the new one (C++14 defect, so generally works on newer compilers, even
 // if not in C++17 mode)
-#if defined(__PGIC__)
+#if defined(__PGIC__) || defined(__INTEL_COMPILER)
 template<typename... > using void_t = void;
 template <typename...> struct void_t_impl { using type = void; };
@@ -618,8 +767,9 @@ template <typename Base, typename Derived> using is_strict_base_of = bool_consta
 /// Like is_base_of, but also requires that the base type is accessible (i.e. that a Derived pointer
 /// can be converted to a Base pointer)
+/// For unions, `is_base_of<T, T>::value` is False, so we need to check `is_same` as well.
 template <typename Base, typename Derived> using is_accessible_base_of = bool_constant<
-    std::is_base_of<Base, Derived>::value && std::is_convertible<Derived *, Base *>::value>;
+    (std::is_same<Base, Derived>::value || std::is_base_of<Base, Derived>::value) && std::is_convertible<Derived *, Base *>::value>;
 template <template<typename...> class Base>
 struct is_template_base_of_impl {
@@ -656,6 +806,10 @@ template <typename T> using is_function_pointer = bool_constant<
     std::is_pointer<T>::value && std::is_function<typename std::remove_pointer<T>::type>::value>;
 template <typename F> struct strip_function_object {
+    // If you are encountering an
+    // 'error: name followed by "::" must be a class or namespace name'
+    // with the Intel compiler and a noexcept function here,
+    // try to use noexcept(true) instead of plain noexcept.
     using type = typename remove_class<decltype(&F::operator())>::type;
@@ -677,11 +831,10 @@ using function_signature_t = conditional_t<
 template <typename T> using is_lambda = satisfies_none_of<remove_reference_t<T>,
         std::is_function, std::is_pointer, std::is_member_pointer>;
-/// Ignore that a variable is unused in compiler warnings
-inline void ignore_unused(const int *) { }
+// [workaround(intel)] Internal error on fold expression
 /// Apply a function over each element of a parameter pack
-#ifdef __cpp_fold_expressions
+#if defined(__cpp_fold_expressions) && !defined(__INTEL_COMPILER)
+// Intel compiler produces an internal error on this fold expression (tested with ICC 19.0.2)
 using expand_side_effects = bool[];
@@ -690,16 +843,23 @@ using expand_side_effects = bool[];
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4275) // warning C4275: An exported class was derived from a class that wasn't exported. Can be ignored when derived from a STL class.
 /// C++ bindings of builtin Python exceptions
-class builtin_exception : public std::runtime_error {
+class PYBIND11_EXPORT_EXCEPTION builtin_exception : public std::runtime_error {
     using std::runtime_error::runtime_error;
     /// Set the error using the Python C API
     virtual void set_error() const = 0;
+#if defined(_MSC_VER)
+#  pragma warning(pop)
 #define PYBIND11_RUNTIME_EXCEPTION(name, type) \
-    class name : public builtin_exception { public: \
+    class PYBIND11_EXPORT_EXCEPTION name : public builtin_exception { public: \
         using builtin_exception::builtin_exception; \
         name() : name("") { } \
         void set_error() const override { PyErr_SetString(type, what()); } \
@@ -712,11 +872,12 @@ PYBIND11_RUNTIME_EXCEPTION(value_error, PyExc_ValueError)
 PYBIND11_RUNTIME_EXCEPTION(type_error, PyExc_TypeError)
 PYBIND11_RUNTIME_EXCEPTION(buffer_error, PyExc_BufferError)
 PYBIND11_RUNTIME_EXCEPTION(import_error, PyExc_ImportError)
+PYBIND11_RUNTIME_EXCEPTION(attribute_error, PyExc_AttributeError)
 PYBIND11_RUNTIME_EXCEPTION(cast_error, PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due to a type casting error
 PYBIND11_RUNTIME_EXCEPTION(reference_cast_error, PyExc_RuntimeError) /// Used internally
-[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) { throw std::runtime_error(reason); }
-[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); }
+[[noreturn]] PYBIND11_NOINLINE void pybind11_fail(const char *reason) { throw std::runtime_error(reason); }
+[[noreturn]] PYBIND11_NOINLINE void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); }
 template <typename T, typename SFINAE = void> struct format_descriptor { };
@@ -761,7 +922,8 @@ struct nodelete { template <typename T> void operator()(T*) { } };
 template <typename... Args>
 struct overload_cast_impl {
-    constexpr overload_cast_impl() {}; // NOLINT(modernize-use-equals-default):  MSVC 2015 needs this
+    // NOLINTNEXTLINE(modernize-use-equals-default):  MSVC 2015 needs this
+    constexpr overload_cast_impl() {}
     template <typename Return>
     constexpr auto operator()(Return (*pf)(Args...)) const noexcept
@@ -817,6 +979,7 @@ class any_container {
     // Implicit conversion constructor from any arbitrary container type with values convertible to T
     template <typename Container, typename = enable_if_t<std::is_convertible<decltype(*std::begin(std::declval<const Container &>())), T>::value>>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     any_container(const Container &c) : any_container(std::begin(c), std::end(c)) { }
     // initializer_list's aren't deducible, so don't get matched by the above template; we need this
@@ -825,9 +988,11 @@ class any_container {
     any_container(const std::initializer_list<TIn> &c) : any_container(c.begin(), c.end()) { }
     // Avoid copying if given an rvalue vector of the correct type.
+    // NOLINTNEXTLINE(google-explicit-constructor)
     any_container(std::vector<T> &&v) : v(std::move(v)) { }
     // Moves the vector out of an rvalue any_container
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator std::vector<T> &&() && { return std::move(v); }
     // Dereferencing obtains a reference to the underlying vector
@@ -839,8 +1004,60 @@ class any_container {
     const std::vector<T> *operator->() const { return &v; }
+// Forward-declaration; see detail/class.h
+std::string get_fully_qualified_tp_name(PyTypeObject*);
+template <typename T>
+inline static std::shared_ptr<T> try_get_shared_from_this(std::enable_shared_from_this<T> *holder_value_ptr) {
+// Pre C++17, this code path exploits undefined behavior, but is known to work on many platforms.
+// Use at your own risk!
+// See also https://en.cppreference.com/w/cpp/memory/enable_shared_from_this, and in particular
+// the `std::shared_ptr<Good> gp1 = not_so_good.getptr();` and `try`-`catch` parts of the example.
+#if defined(__cpp_lib_enable_shared_from_this) && (!defined(_MSC_VER) || _MSC_VER >= 1912)
+    return holder_value_ptr->weak_from_this().lock();
+    try {
+        return holder_value_ptr->shared_from_this();
+    }
+    catch (const std::bad_weak_ptr &) {
+        return nullptr;
+    }
+// For silencing "unused" compiler warnings in special situations.
+template <typename... Args>
+#if defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER < 1920 // MSVC 2017
+inline void silence_unused_warnings(Args &&...) {}
+// MSVC warning C4100: Unreferenced formal parameter
+#if defined(_MSC_VER) && _MSC_VER <= 1916
+#    define PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(...)                                         \
+        detail::silence_unused_warnings(__VA_ARGS__)
+// GCC -Wunused-but-set-parameter  All GCC versions (as of July 2021).
+#if defined(__GNUG__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+        detail::silence_unused_warnings(__VA_ARGS__)
+#if defined(_MSC_VER) // All versions (as of July 2021).
+// warning C4127: Conditional expression is constant
+constexpr inline bool silence_msvc_c4127(bool cond) { return cond; }
+#    define PYBIND11_SILENCE_MSVC_C4127(...) ::pybind11::detail::silence_msvc_c4127(__VA_ARGS__)
+#    define PYBIND11_SILENCE_MSVC_C4127(...) __VA_ARGS__
diff --git a/wrap/pybind11/include/pybind11/detail/descr.h b/wrap/pybind11/include/pybind11/detail/descr.h
index 92720cd562..0f93e06b21 100644
--- a/wrap/pybind11/include/pybind11/detail/descr.h
+++ b/wrap/pybind11/include/pybind11/detail/descr.h
@@ -23,15 +23,17 @@ PYBIND11_NAMESPACE_BEGIN(detail)
 /* Concatenate type signatures at compile time */
 template <size_t N, typename... Ts>
 struct descr {
-    char text[N + 1];
+    char text[N + 1]{'\0'};
-    constexpr descr() : text{'\0'} { }
+    constexpr descr() = default;
+    // NOLINTNEXTLINE(google-explicit-constructor)
     constexpr descr(char const (&s)[N+1]) : descr(s, make_index_sequence<N>()) { }
     template <size_t... Is>
     constexpr descr(char const (&s)[N+1], index_sequence<Is...>) : text{s[Is]..., '\0'} { }
     template <typename... Chars>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     constexpr descr(char c, Chars... cs) : text{c, static_cast<char>(cs)..., '\0'} { }
     static constexpr std::array<const std::type_info *, sizeof...(Ts) + 1> types() {
@@ -42,6 +44,7 @@ struct descr {
 template <size_t N1, size_t N2, typename... Ts1, typename... Ts2, size_t... Is1, size_t... Is2>
 constexpr descr<N1 + N2, Ts1..., Ts2...> plus_impl(const descr<N1, Ts1...> &a, const descr<N2, Ts2...> &b,
                                                    index_sequence<Is1...>, index_sequence<Is2...>) {
     return {a.text[Is1]..., b.text[Is2]...};
@@ -51,34 +54,64 @@ constexpr descr<N1 + N2, Ts1..., Ts2...> operator+(const descr<N1, Ts1...> &a, c
 template <size_t N>
-constexpr descr<N - 1> _(char const(&text)[N]) { return descr<N - 1>(text); }
-constexpr descr<0> _(char const(&)[1]) { return {}; }
+constexpr descr<N - 1> const_name(char const(&text)[N]) { return descr<N - 1>(text); }
+constexpr descr<0> const_name(char const(&)[1]) { return {}; }
 template <size_t Rem, size_t... Digits> struct int_to_str : int_to_str<Rem/10, Rem%10, Digits...> { };
 template <size_t...Digits> struct int_to_str<0, Digits...> {
+    // WARNING: This only works with C++17 or higher.
     static constexpr auto digits = descr<sizeof...(Digits)>(('0' + Digits)...);
 // Ternary description (like std::conditional)
 template <bool B, size_t N1, size_t N2>
-constexpr enable_if_t<B, descr<N1 - 1>> _(char const(&text1)[N1], char const(&)[N2]) {
-    return _(text1);
+constexpr enable_if_t<B, descr<N1 - 1>> const_name(char const(&text1)[N1], char const(&)[N2]) {
+    return const_name(text1);
 template <bool B, size_t N1, size_t N2>
-constexpr enable_if_t<!B, descr<N2 - 1>> _(char const(&)[N1], char const(&text2)[N2]) {
-    return _(text2);
+constexpr enable_if_t<!B, descr<N2 - 1>> const_name(char const(&)[N1], char const(&text2)[N2]) {
+    return const_name(text2);
 template <bool B, typename T1, typename T2>
-constexpr enable_if_t<B, T1> _(const T1 &d, const T2 &) { return d; }
+constexpr enable_if_t<B, T1> const_name(const T1 &d, const T2 &) { return d; }
 template <bool B, typename T1, typename T2>
-constexpr enable_if_t<!B, T2> _(const T1 &, const T2 &d) { return d; }
+constexpr enable_if_t<!B, T2> const_name(const T1 &, const T2 &d) { return d; }
-template <size_t Size> auto constexpr _() -> decltype(int_to_str<Size / 10, Size % 10>::digits) {
+template <size_t Size>
+auto constexpr const_name() -> remove_cv_t<decltype(int_to_str<Size / 10, Size % 10>::digits)> {
     return int_to_str<Size / 10, Size % 10>::digits;
-template <typename Type> constexpr descr<1, Type> _() { return {'%'}; }
+template <typename Type> constexpr descr<1, Type> const_name() { return {'%'}; }
+// If "_" is defined as a macro, py::detail::_ cannot be provided.
+// It is therefore best to use py::detail::const_name universally.
+// This block is for backward compatibility only.
+// (The const_name code is repeated to avoid introducing a "_" #define ourselves.)
+#ifndef _
+template <size_t N>
+constexpr descr<N-1> _(char const(&text)[N]) { return const_name<N>(text); }
+template <bool B, size_t N1, size_t N2>
+constexpr enable_if_t<B, descr<N1 - 1>> _(char const(&text1)[N1], char const(&text2)[N2]) {
+    return const_name<B,N1,N2>(text1, text2);
+template <bool B, size_t N1, size_t N2>
+constexpr enable_if_t<!B, descr<N2 - 1>> _(char const(&text1)[N1], char const(&text2)[N2]) {
+    return const_name<B,N1,N2>(text1, text2);
+template <bool B, typename T1, typename T2>
+constexpr enable_if_t<B, T1> _(const T1 &d1, const T2 &d2) { return const_name<B,T1,T2>(d1, d2); }
+template <bool B, typename T1, typename T2>
+constexpr enable_if_t<!B, T2> _(const T1 &d1, const T2 &d2) { return const_name<B,T1,T2>(d1, d2); }
+template <size_t Size>
+auto constexpr _() -> remove_cv_t<decltype(int_to_str<Size / 10, Size % 10>::digits)> {
+    return const_name<Size>();
+template <typename Type> constexpr descr<1, Type> _() { return const_name<Type>(); }
+#endif  // #ifndef _
 constexpr descr<0> concat() { return {}; }
@@ -88,12 +121,12 @@ constexpr descr<N, Ts...> concat(const descr<N, Ts...> &descr) { return descr; }
 template <size_t N, typename... Ts, typename... Args>
 constexpr auto concat(const descr<N, Ts...> &d, const Args &...args)
     -> decltype(std::declval<descr<N + 2, Ts...>>() + concat(args...)) {
-    return d + _(", ") + concat(args...);
+    return d + const_name(", ") + concat(args...);
 template <size_t N, typename... Ts>
 constexpr descr<N + 2, Ts...> type_descr(const descr<N, Ts...> &descr) {
-    return _("{") + descr + _("}");
+    return const_name("{") + descr + const_name("}");
diff --git a/wrap/pybind11/include/pybind11/detail/init.h b/wrap/pybind11/include/pybind11/detail/init.h
index 3ef78c1179..eaaad5a07a 100644
--- a/wrap/pybind11/include/pybind11/detail/init.h
+++ b/wrap/pybind11/include/pybind11/detail/init.h
@@ -23,8 +23,8 @@ class type_caster<value_and_holder> {
     template <typename> using cast_op_type = value_and_holder &;
-    operator value_and_holder &() { return *value; }
-    static constexpr auto name = _<value_and_holder>();
+    explicit operator value_and_holder &() { return *value; }
+    static constexpr auto name = const_name<value_and_holder>();
     value_and_holder *value = nullptr;
@@ -94,8 +94,9 @@ void construct(...) {
 // construct an Alias from the returned base instance.
 template <typename Class>
 void construct(value_and_holder &v_h, Cpp<Class> *ptr, bool need_alias) {
-    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+    if (PYBIND11_SILENCE_MSVC_C4127(Class::has_alias) && need_alias && !is_alias<Class>(ptr)) {
         // We're going to try to construct an alias by moving the cpp type.  Whether or not
         // that succeeds, we still need to destroy the original cpp pointer (either the
         // moved away leftover, if the alias construction works, or the value itself if we
@@ -131,10 +132,11 @@ void construct(value_and_holder &v_h, Alias<Class> *alias_ptr, bool) {
 // derived type (through those holder's implicit conversion from derived class holder constructors).
 template <typename Class>
 void construct(value_and_holder &v_h, Holder<Class> holder, bool need_alias) {
     auto *ptr = holder_helper<Holder<Class>>::get(holder);
     // If we need an alias, check that the held pointer is actually an alias instance
-    if (Class::has_alias && need_alias && !is_alias<Class>(ptr))
+    if (PYBIND11_SILENCE_MSVC_C4127(Class::has_alias) && need_alias && !is_alias<Class>(ptr))
         throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance "
                          "is not an alias instance");
@@ -148,9 +150,10 @@ void construct(value_and_holder &v_h, Holder<Class> holder, bool need_alias) {
 // need it, we simply move-construct the cpp value into a new instance.
 template <typename Class>
 void construct(value_and_holder &v_h, Cpp<Class> &&result, bool need_alias) {
         "pybind11::init() return-by-value factory function requires a movable class");
-    if (Class::has_alias && need_alias)
+    if (PYBIND11_SILENCE_MSVC_C4127(Class::has_alias) && need_alias)
         construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(result));
         v_h.value_ptr() = new Cpp<Class>(std::move(result));
@@ -219,7 +222,8 @@ template <typename Func, typename Return, typename... Args>
 struct factory<Func, void_type (*)(), Return(Args...)> {
     remove_reference_t<Func> class_factory;
-    factory(Func &&f) : class_factory(std::forward<Func>(f)) { }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    factory(Func &&f) : class_factory(std::forward<Func>(f)) {}
     // The given class either has no alias or has no separate alias factory;
     // this always constructs the class itself.  If the class is registered with an alias
@@ -293,7 +297,13 @@ template <typename Class, typename T, typename O,
           enable_if_t<std::is_convertible<O, handle>::value, int> = 0>
 void setstate(value_and_holder &v_h, std::pair<T, O> &&result, bool need_alias) {
     construct<Class>(v_h, std::move(result.first), need_alias);
-    setattr((PyObject *) v_h.inst, "__dict__", result.second);
+    auto d = handle(result.second);
+    if (PyDict_Check(d.ptr()) && PyDict_Size(d.ptr()) == 0) {
+        // Skipping setattr below, to not force use of py::dynamic_attr() for Class unnecessarily.
+        // See PR #2972 for details.
+        return;
+    }
+    setattr((PyObject *) v_h.inst, "__dict__", d);
 /// Implementation for py::pickle(GetState, SetState)
diff --git a/wrap/pybind11/include/pybind11/detail/internals.h b/wrap/pybind11/include/pybind11/detail/internals.h
index 133d2f4c83..9edb9492e2 100644
--- a/wrap/pybind11/include/pybind11/detail/internals.h
+++ b/wrap/pybind11/include/pybind11/detail/internals.h
@@ -10,9 +10,32 @@
 #pragma once
 #include "../pytypes.h"
+#include <exception>
+/// Tracks the `internals` and `type_info` ABI version independent of the main library version.
+/// Some portions of the code use an ABI that is conditional depending on this
+/// version number.  That allows ABI-breaking changes to be "pre-implemented".
+/// Once the default version number is incremented, the conditional logic that
+/// no longer applies can be removed.  Additionally, users that need not
+/// maintain ABI compatibility can increase the version number in order to take
+/// advantage of any functionality/efficiency improvements that depend on the
+/// newer ABI.
+/// WARNING: If you choose to manually increase the ABI version, note that
+/// pybind11 may not be tested as thoroughly with a non-default ABI version, and
+/// further ABI-incompatible changes may be made before the ABI is officially
+/// changed to the new version.
+using ExceptionTranslator = void (*)(std::exception_ptr);
 // Forward declarations
 inline PyTypeObject *make_static_property_type();
 inline PyTypeObject *make_default_metaclass();
@@ -21,30 +44,59 @@ inline PyObject *make_object_base_type(PyTypeObject *metaclass);
 // The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new
 // Thread Specific Storage (TSS) API.
 #if PY_VERSION_HEX >= 0x03070000
-#    define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr
-#    define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
-#    define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
-#    define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
-#    define PYBIND11_TLS_FREE(key) PyThread_tss_free(key)
+// Avoid unnecessary allocation of `Py_tss_t`, since we cannot use
+// `Py_LIMITED_API` anyway.
+#        define PYBIND11_TLS_KEY_REF Py_tss_t &
+#        ifdef __GNUC__
+// Clang on macOS warns due to `Py_tss_NEEDS_INIT` not specifying an initializer
+// for every field.
+#            define PYBIND11_TLS_KEY_INIT(var)                                                    \
+                _Pragma("GCC diagnostic push")                                         /**/       \
+                    _Pragma("GCC diagnostic ignored \"-Wmissing-field-initializers\"") /**/       \
+                    Py_tss_t var                                                                  \
+                    = Py_tss_NEEDS_INIT;                                                          \
+                _Pragma("GCC diagnostic pop")
+#        else
+#            define PYBIND11_TLS_KEY_INIT(var) Py_tss_t var = Py_tss_NEEDS_INIT;
+#        endif
+#        define PYBIND11_TLS_KEY_CREATE(var) (PyThread_tss_create(&(var)) == 0)
+#        define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get(&(key))
+#        define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set(&(key), (value))
+#        define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set(&(key), nullptr)
+#        define PYBIND11_TLS_FREE(key) PyThread_tss_delete(&(key))
+#    else
+#        define PYBIND11_TLS_KEY_REF Py_tss_t *
+#        define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr;
+#        define PYBIND11_TLS_KEY_CREATE(var)                                                      \
+            (((var) = PyThread_tss_alloc()) != nullptr && (PyThread_tss_create((var)) == 0))
+#        define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
+#        define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
+#        define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
+#        define PYBIND11_TLS_FREE(key) PyThread_tss_free(key)
+#    endif
-    // Usually an int but a long on Cygwin64 with Python 3.x
-#    define PYBIND11_TLS_KEY_INIT(var) decltype(PyThread_create_key()) var = 0
+// Usually an int but a long on Cygwin64 with Python 3.x
+#    define PYBIND11_TLS_KEY_REF decltype(PyThread_create_key())
+#    define PYBIND11_TLS_KEY_INIT(var) PYBIND11_TLS_KEY_REF var = 0;
+#    define PYBIND11_TLS_KEY_CREATE(var) (((var) = PyThread_create_key()) != -1)
 #    define PYBIND11_TLS_GET_VALUE(key) PyThread_get_key_value((key))
-#    if PY_MAJOR_VERSION < 3
-#        define PYBIND11_TLS_DELETE_VALUE(key)                               \
-             PyThread_delete_key_value(key)
-#        define PYBIND11_TLS_REPLACE_VALUE(key, value)                       \
-             do {                                                            \
-                 PyThread_delete_key_value((key));                           \
-                 PyThread_set_key_value((key), (value));                     \
-             } while (false)
+#    if PY_MAJOR_VERSION < 3 || defined(PYPY_VERSION)
+// On CPython < 3.4 and on PyPy, `PyThread_set_key_value` strangely does not set
+// the value if it has already been set.  Instead, it must first be deleted and
+// then set again.
+inline void tls_replace_value(PYBIND11_TLS_KEY_REF key, void *value) {
+    PyThread_delete_key_value(key);
+    PyThread_set_key_value(key, value);
+#        define PYBIND11_TLS_DELETE_VALUE(key) PyThread_delete_key_value(key)
+#        define PYBIND11_TLS_REPLACE_VALUE(key, value)                                            \
+            ::pybind11::detail::tls_replace_value((key), (value))
 #    else
-#        define PYBIND11_TLS_DELETE_VALUE(key)                               \
-             PyThread_set_key_value((key), nullptr)
-#        define PYBIND11_TLS_REPLACE_VALUE(key, value)                       \
-             PyThread_set_key_value((key), (value))
+#        define PYBIND11_TLS_DELETE_VALUE(key) PyThread_set_key_value((key), nullptr)
+#        define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_set_key_value((key), (value))
 #    endif
-#    define PYBIND11_TLS_FREE(key) (void)key
+#    define PYBIND11_TLS_FREE(key) (void) key
 // Python loads modules by default with dlopen with the RTLD_LOCAL flag; under libc++ and possibly
@@ -100,24 +152,33 @@ struct internals {
     std::unordered_set<std::pair<const PyObject *, const char *>, override_hash> inactive_override_cache;
     type_map<std::vector<bool (*)(PyObject *, void *&)>> direct_conversions;
     std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
-    std::forward_list<void (*) (std::exception_ptr)> registered_exception_translators;
+    std::forward_list<ExceptionTranslator> registered_exception_translators;
     std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
-    std::vector<PyObject *> loader_patient_stack; // Used by `loader_life_support`
+    std::vector<PyObject *> unused_loader_patient_stack_remove_at_v5;
     std::forward_list<std::string> static_strings; // Stores the std::strings backing detail::c_str()
     PyTypeObject *static_property_type;
     PyTypeObject *default_metaclass;
     PyObject *instance_base;
 #if defined(WITH_THREAD)
-    PYBIND11_TLS_KEY_INIT(tstate);
+    PYBIND11_TLS_KEY_INIT(tstate)
+    PYBIND11_TLS_KEY_INIT(loader_life_support_tls_key)
+#    endif // PYBIND11_INTERNALS_VERSION > 4
     PyInterpreterState *istate = nullptr;
     ~internals() {
+        PYBIND11_TLS_FREE(loader_life_support_tls_key);
+#    endif // PYBIND11_INTERNALS_VERSION > 4
         // This destructor is called *after* Py_Finalize() in finalize_interpreter().
-        // That *SHOULD BE* fine. The following details what happens whe PyThread_tss_free is called.
-        // PYBIND11_TLS_FREE is PyThread_tss_free on python 3.7+. On older python, it does nothing.
-        // PyThread_tss_free calls PyThread_tss_delete and PyMem_RawFree.
-        // PyThread_tss_delete just calls TlsFree (on Windows) or pthread_key_delete (on *NIX). Neither
-        // of those have anything to do with CPython internals.
-        // PyMem_RawFree *requires* that the `tstate` be allocated with the CPython allocator.
+        // That *SHOULD BE* fine. The following details what happens when PyThread_tss_free is
+        // called. PYBIND11_TLS_FREE is PyThread_tss_free on python 3.7+. On older python, it does
+        // nothing. PyThread_tss_free calls PyThread_tss_delete and PyMem_RawFree.
+        // PyThread_tss_delete just calls TlsFree (on Windows) or pthread_key_delete (on *NIX).
+        // Neither of those have anything to do with CPython internals. PyMem_RawFree *requires*
+        // that the `tstate` be allocated with the CPython allocator.
@@ -139,7 +200,9 @@ struct type_info {
     void *get_buffer_data = nullptr;
     void *(*module_local_load)(PyObject *, const type_info *) = nullptr;
     /* A simple type never occurs as a (direct or indirect) parent
-     * of a class that makes use of multiple inheritance */
+     * of a class that makes use of multiple inheritance.
+     * A type can be simple even if it has non-simple ancestors as long as it has no descendants.
+     */
     bool simple_type : 1;
     /* True if there is no multiple inheritance in this type's inheritance tree */
     bool simple_ancestors : 1;
@@ -149,54 +212,62 @@ struct type_info {
     bool module_local : 1;
-/// Tracks the `internals` and `type_info` ABI version independent of the main library version
 /// On MSVC, debug and release builds are not ABI-compatible!
 #if defined(_MSC_VER) && defined(_DEBUG)
-#   define PYBIND11_BUILD_TYPE "_debug"
+#  define PYBIND11_BUILD_TYPE "_debug"
-#   define PYBIND11_BUILD_TYPE ""
+#  define PYBIND11_BUILD_TYPE ""
 /// Let's assume that different compilers are ABI-incompatible.
-#if defined(_MSC_VER)
-#   define PYBIND11_COMPILER_TYPE "_msvc"
-#elif defined(__INTEL_COMPILER)
-#   define PYBIND11_COMPILER_TYPE "_icc"
-#elif defined(__clang__)
-#   define PYBIND11_COMPILER_TYPE "_clang"
-#elif defined(__PGI)
-#   define PYBIND11_COMPILER_TYPE "_pgi"
-#elif defined(__MINGW32__)
-#   define PYBIND11_COMPILER_TYPE "_mingw"
-#elif defined(__CYGWIN__)
-#   define PYBIND11_COMPILER_TYPE "_gcc_cygwin"
-#elif defined(__GNUC__)
-#   define PYBIND11_COMPILER_TYPE "_gcc"
-#   define PYBIND11_COMPILER_TYPE "_unknown"
+/// A user can manually set this string if they know their
+/// compiler is compatible.
+#  if defined(_MSC_VER)
+#    define PYBIND11_COMPILER_TYPE "_msvc"
+#  elif defined(__INTEL_COMPILER)
+#    define PYBIND11_COMPILER_TYPE "_icc"
+#  elif defined(__clang__)
+#    define PYBIND11_COMPILER_TYPE "_clang"
+#  elif defined(__PGI)
+#    define PYBIND11_COMPILER_TYPE "_pgi"
+#  elif defined(__MINGW32__)
+#    define PYBIND11_COMPILER_TYPE "_mingw"
+#  elif defined(__CYGWIN__)
+#    define PYBIND11_COMPILER_TYPE "_gcc_cygwin"
+#  elif defined(__GNUC__)
+#    define PYBIND11_COMPILER_TYPE "_gcc"
+#  else
+#    define PYBIND11_COMPILER_TYPE "_unknown"
+#  endif
-#if defined(_LIBCPP_VERSION)
-#  define PYBIND11_STDLIB "_libcpp"
-#elif defined(__GLIBCXX__) || defined(__GLIBCPP__)
-#  define PYBIND11_STDLIB "_libstdcpp"
-#  define PYBIND11_STDLIB ""
+/// Also standard libs
+#ifndef PYBIND11_STDLIB
+#  if defined(_LIBCPP_VERSION)
+#    define PYBIND11_STDLIB "_libcpp"
+#  elif defined(__GLIBCXX__) || defined(__GLIBCPP__)
+#    define PYBIND11_STDLIB "_libstdcpp"
+#  else
+#    define PYBIND11_STDLIB ""
+#  endif
 /// On Linux/OSX, changes in __GXX_ABI_VERSION__ indicate ABI incompatibility.
-#if defined(__GXX_ABI_VERSION)
-#  define PYBIND11_BUILD_ABI ""
+#  if defined(__GXX_ABI_VERSION)
+#  else
+#    define PYBIND11_BUILD_ABI ""
+#  endif
-#if defined(WITH_THREAD)
-#  define PYBIND11_INTERNALS_KIND "_without_thread"
+#  if defined(WITH_THREAD)
+#    define PYBIND11_INTERNALS_KIND ""
+#  else
+#    define PYBIND11_INTERNALS_KIND "_without_thread"
+#  endif
 #define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \
@@ -212,21 +283,104 @@ inline internals **&get_internals_pp() {
     return internals_pp;
+#if PY_VERSION_HEX >= 0x03030000
+// forward decl
+inline void translate_exception(std::exception_ptr);
+template <class T,
+          enable_if_t<std::is_same<std::nested_exception, remove_cvref_t<T>>::value, int> = 0>
+bool handle_nested_exception(const T &exc, const std::exception_ptr &p) {
+    std::exception_ptr nested = exc.nested_ptr();
+    if (nested != nullptr && nested != p) {
+        translate_exception(nested);
+        return true;
+    }
+    return false;
+template <class T,
+          enable_if_t<!std::is_same<std::nested_exception, remove_cvref_t<T>>::value, int> = 0>
+bool handle_nested_exception(const T &exc, const std::exception_ptr &p) {
+    if (auto *nep = dynamic_cast<const std::nested_exception *>(std::addressof(exc))) {
+        return handle_nested_exception(*nep, p);
+    }
+    return false;
+template <class T>
+bool handle_nested_exception(const T &, std::exception_ptr &) {
+    return false;
+inline bool raise_err(PyObject *exc_type, const char *msg) {
+#if PY_VERSION_HEX >= 0x03030000
+    if (PyErr_Occurred()) {
+        raise_from(exc_type, msg);
+        return true;
+    }
+    PyErr_SetString(exc_type, msg);
+    return false;
 inline void translate_exception(std::exception_ptr p) {
+    if (!p) {
+        return;
+    }
     try {
-        if (p) std::rethrow_exception(p);
-    } catch (error_already_set &e)           { e.restore();                                    return;
-    } catch (const builtin_exception &e)     { e.set_error();                                  return;
-    } catch (const std::bad_alloc &e)        { PyErr_SetString(PyExc_MemoryError,   e.what()); return;
-    } catch (const std::domain_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-    } catch (const std::invalid_argument &e) { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-    } catch (const std::length_error &e)     { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-    } catch (const std::out_of_range &e)     { PyErr_SetString(PyExc_IndexError,    e.what()); return;
-    } catch (const std::range_error &e)      { PyErr_SetString(PyExc_ValueError,    e.what()); return;
-    } catch (const std::overflow_error &e)   { PyErr_SetString(PyExc_OverflowError, e.what()); return;
-    } catch (const std::exception &e)        { PyErr_SetString(PyExc_RuntimeError,  e.what()); return;
+        std::rethrow_exception(p);
+    } catch (error_already_set &e) {
+        handle_nested_exception(e, p);
+        e.restore();
+        return;
+    } catch (const builtin_exception &e) {
+        // Could not use template since it's an abstract class.
+        if (auto *nep = dynamic_cast<const std::nested_exception *>(std::addressof(e))) {
+            handle_nested_exception(*nep, p);
+        }
+        e.set_error();
+        return;
+    } catch (const std::bad_alloc &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_MemoryError, e.what());
+        return;
+    } catch (const std::domain_error &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_ValueError, e.what());
+        return;
+    } catch (const std::invalid_argument &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_ValueError, e.what());
+        return;
+    } catch (const std::length_error &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_ValueError, e.what());
+        return;
+    } catch (const std::out_of_range &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_IndexError, e.what());
+        return;
+    } catch (const std::range_error &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_ValueError, e.what());
+        return;
+    } catch (const std::overflow_error &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_OverflowError, e.what());
+        return;
+    } catch (const std::exception &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_RuntimeError, e.what());
+        return;
+    } catch (const std::nested_exception &e) {
+        handle_nested_exception(e, p);
+        raise_err(PyExc_RuntimeError, "Caught an unknown nested exception!");
+        return;
     } catch (...) {
-        PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!");
+        raise_err(PyExc_RuntimeError, "Caught an unknown exception!");
@@ -242,7 +396,7 @@ inline void translate_local_exception(std::exception_ptr p) {
 /// Return a reference to the current `internals` data
-PYBIND11_NOINLINE inline internals &get_internals() {
+PYBIND11_NOINLINE internals &get_internals() {
     auto **&internals_pp = get_internals_pp();
     if (internals_pp && *internals_pp)
         return **internals_pp;
@@ -255,7 +409,7 @@ PYBIND11_NOINLINE inline internals &get_internals() {
         const PyGILState_STATE state;
     } gil;
-    constexpr auto *id = PYBIND11_INTERNALS_ID;
     auto builtins = handle(PyEval_GetBuiltins());
     if (builtins.contains(id) && isinstance<capsule>(builtins[id])) {
         internals_pp = static_cast<internals **>(capsule(builtins[id]));
@@ -265,6 +419,8 @@ PYBIND11_NOINLINE inline internals &get_internals() {
         // initial exception translator, below, so add another for our local exception classes.
         // libstdc++ doesn't require this (types there are identified only by name)
+        // libc++ with CPython doesn't require this (types are explicitly exported)
+        // libc++ with PyPy still need it, awaiting further investigation
 #if !defined(__GLIBCXX__)
@@ -274,21 +430,21 @@ PYBIND11_NOINLINE inline internals &get_internals() {
         internals_ptr = new internals();
 #if defined(WITH_THREAD)
-        #if PY_VERSION_HEX < 0x03090000
-                PyEval_InitThreads();
-        #endif
+#    if PY_VERSION_HEX < 0x03090000
+        PyEval_InitThreads();
+#    endif
         PyThreadState *tstate = PyThreadState_Get();
-        #if PY_VERSION_HEX >= 0x03070000
-            internals_ptr->tstate = PyThread_tss_alloc();
-            if (!internals_ptr->tstate || PyThread_tss_create(internals_ptr->tstate))
-                pybind11_fail("get_internals: could not successfully initialize the TSS key!");
-            PyThread_tss_set(internals_ptr->tstate, tstate);
-        #else
-            internals_ptr->tstate = PyThread_create_key();
-            if (internals_ptr->tstate == -1)
-                pybind11_fail("get_internals: could not successfully initialize the TLS key!");
-            PyThread_set_key_value(internals_ptr->tstate, tstate);
-        #endif
+        if (!PYBIND11_TLS_KEY_CREATE(internals_ptr->tstate)) {
+            pybind11_fail("get_internals: could not successfully initialize the tstate TSS key!");
+        }
+        PYBIND11_TLS_REPLACE_VALUE(internals_ptr->tstate, tstate);
+        if (!PYBIND11_TLS_KEY_CREATE(internals_ptr->loader_life_support_tls_key)) {
+            pybind11_fail("get_internals: could not successfully initialize the "
+                          "loader_life_support TSS key!");
+        }
+#    endif
         internals_ptr->istate = tstate->interp;
         builtins[id] = capsule(internals_pp);
@@ -300,12 +456,57 @@ PYBIND11_NOINLINE inline internals &get_internals() {
     return **internals_pp;
-/// Works like `internals.registered_types_cpp`, but for module-local registered types:
-inline type_map<type_info *> &registered_local_types_cpp() {
-    static type_map<type_info *> locals{};
-    return locals;
+// the internals struct (above) is shared between all the modules. local_internals are only
+// for a single module. Any changes made to internals may require an update to
+// PYBIND11_INTERNALS_VERSION, breaking backwards compatibility. local_internals is, by design,
+// restricted to a single module. Whether a module has local internals or not should not
+// impact any other modules, because the only things accessing the local internals is the
+// module that contains them.
+struct local_internals {
+    type_map<type_info *> registered_types_cpp;
+    std::forward_list<ExceptionTranslator> registered_exception_translators;
+    // For ABI compatibility, we can't store the loader_life_support TLS key in
+    // the `internals` struct directly.  Instead, we store it in `shared_data` and
+    // cache a copy in `local_internals`.  If we allocated a separate TLS key for
+    // each instance of `local_internals`, we could end up allocating hundreds of
+    // TLS keys if hundreds of different pybind11 modules are loaded (which is a
+    // plausible number).
+    PYBIND11_TLS_KEY_INIT(loader_life_support_tls_key)
+    // Holds the shared TLS key for the loader_life_support stack.
+    struct shared_loader_life_support_data {
+        PYBIND11_TLS_KEY_INIT(loader_life_support_tls_key)
+        shared_loader_life_support_data() {
+            if (!PYBIND11_TLS_KEY_CREATE(loader_life_support_tls_key)) {
+                pybind11_fail("local_internals: could not successfully initialize the "
+                              "loader_life_support TLS key!");
+            }
+        }
+        // We can't help but leak the TLS key, because Python never unloads extension modules.
+    };
+    local_internals() {
+        auto &internals = get_internals();
+        // Get or create the `loader_life_support_stack_key`.
+        auto &ptr = internals.shared_data["_life_support"];
+        if (!ptr) {
+            ptr = new shared_loader_life_support_data;
+        }
+        loader_life_support_tls_key
+            = static_cast<shared_loader_life_support_data *>(ptr)->loader_life_support_tls_key;
+    }
+#endif //  defined(WITH_THREAD) && PYBIND11_INTERNALS_VERSION == 4
+/// Works like `get_internals`, but for things which are locally registered.
+inline local_internals &get_local_internals() {
+  static local_internals locals;
+  return locals;
 /// Constructs a std::string with the given arguments, stores it in `internals`, and returns its
 /// `c_str()`.  Such strings objects have a long storage duration -- the internal strings are only
 /// cleared when the program exits or after interpreter shutdown (when embedding), and so are
@@ -322,14 +523,14 @@ PYBIND11_NAMESPACE_END(detail)
 /// Returns a named pointer that is shared among all extension modules (using the same
 /// pybind11 version) running in the current interpreter. Names starting with underscores
 /// are reserved for internal usage. Returns `nullptr` if no matching entry was found.
-inline PYBIND11_NOINLINE void *get_shared_data(const std::string &name) {
+PYBIND11_NOINLINE void *get_shared_data(const std::string &name) {
     auto &internals = detail::get_internals();
     auto it = internals.shared_data.find(name);
     return it != internals.shared_data.end() ? it->second : nullptr;
 /// Set the shared data that can be later recovered by `get_shared_data()`.
-inline PYBIND11_NOINLINE void *set_shared_data(const std::string &name, void *data) {
+PYBIND11_NOINLINE void *set_shared_data(const std::string &name, void *data) {
     detail::get_internals().shared_data[name] = data;
     return data;
diff --git a/wrap/pybind11/include/pybind11/detail/type_caster_base.h b/wrap/pybind11/include/pybind11/detail/type_caster_base.h
new file mode 100644
index 0000000000..48e218b2f3
--- /dev/null
+++ b/wrap/pybind11/include/pybind11/detail/type_caster_base.h
@@ -0,0 +1,985 @@
+    pybind11/detail/type_caster_base.h (originally first part of pybind11/cast.h)
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+#pragma once
+#include "../pytypes.h"
+#include "common.h"
+#include "descr.h"
+#include "internals.h"
+#include "typeid.h"
+#include <cstdint>
+#include <iterator>
+#include <new>
+#include <string>
+#include <type_traits>
+#include <typeindex>
+#include <typeinfo>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+/// A life support system for temporary objects created by `type_caster::load()`.
+/// Adding a patient will keep it alive up until the enclosing function returns.
+class loader_life_support {
+    loader_life_support* parent = nullptr;
+    std::unordered_set<PyObject *> keep_alive;
+#if defined(WITH_THREAD)
+    // Store stack pointer in thread-local storage.
+    static PYBIND11_TLS_KEY_REF get_stack_tls_key() {
+        return get_local_internals().loader_life_support_tls_key;
+#    else
+        return get_internals().loader_life_support_tls_key;
+#    endif
+    }
+    static loader_life_support *get_stack_top() {
+        return static_cast<loader_life_support *>(PYBIND11_TLS_GET_VALUE(get_stack_tls_key()));
+    }
+    static void set_stack_top(loader_life_support *value) {
+        PYBIND11_TLS_REPLACE_VALUE(get_stack_tls_key(), value);
+    }
+    // Use single global variable for stack.
+    static loader_life_support **get_stack_pp() {
+        static loader_life_support *global_stack = nullptr;
+        return global_stack;
+    }
+    static loader_life_support *get_stack_top() { return *get_stack_pp(); }
+    static void set_stack_top(loader_life_support *value) { *get_stack_pp() = value; }
+    /// A new patient frame is created when a function is entered
+    loader_life_support() {
+        parent = get_stack_top();
+        set_stack_top(this);
+    }
+    /// ... and destroyed after it returns
+    ~loader_life_support() {
+        if (get_stack_top() != this)
+            pybind11_fail("loader_life_support: internal error");
+        set_stack_top(parent);
+        for (auto* item : keep_alive)
+            Py_DECREF(item);
+    }
+    /// This can only be used inside a pybind11-bound function, either by `argument_loader`
+    /// at argument preparation time or by `py::cast()` at execution time.
+    PYBIND11_NOINLINE static void add_patient(handle h) {
+        loader_life_support *frame = get_stack_top();
+        if (!frame) {
+            // NOTE: It would be nice to include the stack frames here, as this indicates
+            // use of pybind11::cast<> outside the normal call framework, finding such
+            // a location is challenging. Developers could consider printing out
+            // stack frame addresses here using something like __builtin_frame_address(0)
+            throw cast_error("When called outside a bound function, py::cast() cannot "
+                             "do Python -> C++ conversions which require the creation "
+                             "of temporary values");
+        }
+        if (frame->keep_alive.insert(h.ptr()).second)
+            Py_INCREF(h.ptr());
+    }
+// Gets the cache entry for the given type, creating it if necessary.  The return value is the pair
+// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was
+// just created.
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type);
+// Populates a just-created cache entry.
+PYBIND11_NOINLINE void all_type_info_populate(PyTypeObject *t, std::vector<type_info *> &bases) {
+    std::vector<PyTypeObject *> check;
+    for (handle parent : reinterpret_borrow<tuple>(t->tp_bases))
+        check.push_back((PyTypeObject *) parent.ptr());
+    auto const &type_dict = get_internals().registered_types_py;
+    for (size_t i = 0; i < check.size(); i++) {
+        auto type = check[i];
+        // Ignore Python2 old-style class super type:
+        if (!PyType_Check((PyObject *) type)) continue;
+        // Check `type` in the current set of registered python types:
+        auto it = type_dict.find(type);
+        if (it != type_dict.end()) {
+            // We found a cache entry for it, so it's either pybind-registered or has pre-computed
+            // pybind bases, but we have to make sure we haven't already seen the type(s) before: we
+            // want to follow Python/virtual C++ rules that there should only be one instance of a
+            // common base.
+            for (auto *tinfo : it->second) {
+                // NB: Could use a second set here, rather than doing a linear search, but since
+                // having a large number of immediate pybind11-registered types seems fairly
+                // unlikely, that probably isn't worthwhile.
+                bool found = false;
+                for (auto *known : bases) {
+                    if (known == tinfo) { found = true; break; }
+                }
+                if (!found) bases.push_back(tinfo);
+            }
+        }
+        else if (type->tp_bases) {
+            // It's some python type, so keep follow its bases classes to look for one or more
+            // registered types
+            if (i + 1 == check.size()) {
+                // When we're at the end, we can pop off the current element to avoid growing
+                // `check` when adding just one base (which is typical--i.e. when there is no
+                // multiple inheritance)
+                check.pop_back();
+                i--;
+            }
+            for (handle parent : reinterpret_borrow<tuple>(type->tp_bases))
+                check.push_back((PyTypeObject *) parent.ptr());
+        }
+    }
+ * Extracts vector of type_info pointers of pybind-registered roots of the given Python type.  Will
+ * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side
+ * derived class that uses single inheritance.  Will contain as many types as required for a Python
+ * class that uses multiple inheritance to inherit (directly or indirectly) from multiple
+ * pybind-registered classes.  Will be empty if neither the type nor any base classes are
+ * pybind-registered.
+ *
+ * The value is cached for the lifetime of the Python type.
+ */
+inline const std::vector<detail::type_info *> &all_type_info(PyTypeObject *type) {
+    auto ins = all_type_info_get_cache(type);
+    if (ins.second)
+        // New cache entry: populate it
+        all_type_info_populate(type, ins.first->second);
+    return ins.first->second;
+ * Gets a single pybind11 type info for a python type.  Returns nullptr if neither the type nor any
+ * ancestors are pybind11-registered.  Throws an exception if there are multiple bases--use
+ * `all_type_info` instead if you want to support multiple bases.
+ */
+PYBIND11_NOINLINE detail::type_info* get_type_info(PyTypeObject *type) {
+    auto &bases = all_type_info(type);
+    if (bases.empty())
+        return nullptr;
+    if (bases.size() > 1)
+        pybind11_fail("pybind11::detail::get_type_info: type has multiple pybind11-registered bases");
+    return bases.front();
+inline detail::type_info *get_local_type_info(const std::type_index &tp) {
+    auto &locals = get_local_internals().registered_types_cpp;
+    auto it = locals.find(tp);
+    if (it != locals.end())
+        return it->second;
+    return nullptr;
+inline detail::type_info *get_global_type_info(const std::type_index &tp) {
+    auto &types = get_internals().registered_types_cpp;
+    auto it = types.find(tp);
+    if (it != types.end())
+        return it->second;
+    return nullptr;
+/// Return the type info for a given C++ type; on lookup failure can either throw or return nullptr.
+PYBIND11_NOINLINE detail::type_info *get_type_info(const std::type_index &tp,
+                                                          bool throw_if_missing = false) {
+    if (auto ltype = get_local_type_info(tp))
+        return ltype;
+    if (auto gtype = get_global_type_info(tp))
+        return gtype;
+    if (throw_if_missing) {
+        std::string tname = tp.name();
+        detail::clean_type_id(tname);
+        pybind11_fail("pybind11::detail::get_type_info: unable to find type info for \"" + tname + "\"");
+    }
+    return nullptr;
+PYBIND11_NOINLINE handle get_type_handle(const std::type_info &tp, bool throw_if_missing) {
+    detail::type_info *type_info = get_type_info(tp, throw_if_missing);
+    return handle(type_info ? ((PyObject *) type_info->type) : nullptr);
+// Searches the inheritance graph for a registered Python instance, using all_type_info().
+PYBIND11_NOINLINE handle find_registered_python_instance(void *src,
+                                                                const detail::type_info *tinfo) {
+    auto it_instances = get_internals().registered_instances.equal_range(src);
+    for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) {
+        for (auto instance_type : detail::all_type_info(Py_TYPE(it_i->second))) {
+            if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype))
+                return handle((PyObject *) it_i->second).inc_ref();
+        }
+    }
+    return handle();
+struct value_and_holder {
+    instance *inst = nullptr;
+    size_t index = 0u;
+    const detail::type_info *type = nullptr;
+    void **vh = nullptr;
+    // Main constructor for a found value/holder:
+    value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) :
+        inst{i}, index{index}, type{type},
+        vh{inst->simple_layout ? inst->simple_value_holder : &inst->nonsimple.values_and_holders[vpos]}
+    {}
+    // Default constructor (used to signal a value-and-holder not found by get_value_and_holder())
+    value_and_holder() = default;
+    // Used for past-the-end iterator
+    explicit value_and_holder(size_t index) : index{index} {}
+    template <typename V = void> V *&value_ptr() const {
+        return reinterpret_cast<V *&>(vh[0]);
+    }
+    // True if this `value_and_holder` has a non-null value pointer
+    explicit operator bool() const { return value_ptr() != nullptr; }
+    template <typename H> H &holder() const {
+        return reinterpret_cast<H &>(vh[1]);
+    }
+    bool holder_constructed() const {
+        return inst->simple_layout
+                   ? inst->simple_holder_constructed
+                   : (inst->nonsimple.status[index] & instance::status_holder_constructed) != 0u;
+    }
+    // NOLINTNEXTLINE(readability-make-member-function-const)
+    void set_holder_constructed(bool v = true) {
+        if (inst->simple_layout)
+            inst->simple_holder_constructed = v;
+        else if (v)
+            inst->nonsimple.status[index] |= instance::status_holder_constructed;
+        else
+            inst->nonsimple.status[index] &= (std::uint8_t) ~instance::status_holder_constructed;
+    }
+    bool instance_registered() const {
+        return inst->simple_layout
+            ? inst->simple_instance_registered
+            : ((inst->nonsimple.status[index] & instance::status_instance_registered) != 0);
+    }
+    // NOLINTNEXTLINE(readability-make-member-function-const)
+    void set_instance_registered(bool v = true) {
+        if (inst->simple_layout)
+            inst->simple_instance_registered = v;
+        else if (v)
+            inst->nonsimple.status[index] |= instance::status_instance_registered;
+        else
+            inst->nonsimple.status[index] &= (std::uint8_t) ~instance::status_instance_registered;
+    }
+// Container for accessing and iterating over an instance's values/holders
+struct values_and_holders {
+    instance *inst;
+    using type_vec = std::vector<detail::type_info *>;
+    const type_vec &tinfo;
+    explicit values_and_holders(instance *inst)
+        : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {}
+    struct iterator {
+    private:
+        instance *inst = nullptr;
+        const type_vec *types = nullptr;
+        value_and_holder curr;
+        friend struct values_and_holders;
+        iterator(instance *inst, const type_vec *tinfo)
+            : inst{inst}, types{tinfo},
+            curr(inst /* instance */,
+                 types->empty() ? nullptr : (*types)[0] /* type info */,
+                 0, /* vpos: (non-simple types only): the first vptr comes first */
+                 0 /* index */)
+        {}
+        // Past-the-end iterator:
+        explicit iterator(size_t end) : curr(end) {}
+    public:
+        bool operator==(const iterator &other) const { return curr.index == other.curr.index; }
+        bool operator!=(const iterator &other) const { return curr.index != other.curr.index; }
+        iterator &operator++() {
+            if (!inst->simple_layout)
+                curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs;
+            ++curr.index;
+            curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr;
+            return *this;
+        }
+        value_and_holder &operator*() { return curr; }
+        value_and_holder *operator->() { return &curr; }
+    };
+    iterator begin() { return iterator(inst, &tinfo); }
+    iterator end() { return iterator(tinfo.size()); }
+    iterator find(const type_info *find_type) {
+        auto it = begin(), endit = end();
+        while (it != endit && it->type != find_type) ++it;
+        return it;
+    }
+    size_t size() { return tinfo.size(); }
+ * Extracts C++ value and holder pointer references from an instance (which may contain multiple
+ * values/holders for python-side multiple inheritance) that match the given type.  Throws an error
+ * if the given type (or ValueType, if omitted) is not a pybind11 base of the given instance.  If
+ * `find_type` is omitted (or explicitly specified as nullptr) the first value/holder are returned,
+ * regardless of type (and the resulting .type will be nullptr).
+ *
+ * The returned object should be short-lived: in particular, it must not outlive the called-upon
+ * instance.
+ */
+PYBIND11_NOINLINE value_and_holder instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/, bool throw_if_missing /*= true in common.h*/) {
+    // Optimize common case:
+    if (!find_type || Py_TYPE(this) == find_type->type)
+        return value_and_holder(this, find_type, 0, 0);
+    detail::values_and_holders vhs(this);
+    auto it = vhs.find(find_type);
+    if (it != vhs.end())
+        return *it;
+    if (!throw_if_missing)
+        return value_and_holder();
+#if defined(NDEBUG)
+    pybind11_fail("pybind11::detail::instance::get_value_and_holder: "
+            "type is not a pybind11 base of the given instance "
+            "(compile in debug mode for type details)");
+    pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" +
+            get_fully_qualified_tp_name(find_type->type) + "' is not a pybind11 base of the given `" +
+            get_fully_qualified_tp_name(Py_TYPE(this)) + "' instance");
+PYBIND11_NOINLINE void instance::allocate_layout() {
+    auto &tinfo = all_type_info(Py_TYPE(this));
+    const size_t n_types = tinfo.size();
+    if (n_types == 0)
+        pybind11_fail("instance allocation failed: new instance has no pybind11-registered base types");
+    simple_layout =
+        n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs();
+    // Simple path: no python-side multiple inheritance, and a small-enough holder
+    if (simple_layout) {
+        simple_value_holder[0] = nullptr;
+        simple_holder_constructed = false;
+        simple_instance_registered = false;
+    }
+    else { // multiple base types or a too-large holder
+        // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer,
+        // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of bool
+        // values that tracks whether each associated holder has been initialized.  Each [block] is
+        // padded, if necessary, to an integer multiple of sizeof(void *).
+        size_t space = 0;
+        for (auto t : tinfo) {
+            space += 1; // value pointer
+            space += t->holder_size_in_ptrs; // holder instance
+        }
+        size_t flags_at = space;
+        space += size_in_ptrs(n_types); // status bytes (holder_constructed and instance_registered)
+        // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values,
+        // in particular, need to be 0).  Use Python's memory allocation functions: in Python 3.6
+        // they default to using pymalloc, which is designed to be efficient for small allocations
+        // like the one we're doing here; in earlier versions (and for larger allocations) they are
+        // just wrappers around malloc.
+#if PY_VERSION_HEX >= 0x03050000
+        nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *));
+        if (!nonsimple.values_and_holders) throw std::bad_alloc();
+        nonsimple.values_and_holders = (void **) PyMem_New(void *, space);
+        if (!nonsimple.values_and_holders) throw std::bad_alloc();
+        std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *));
+        nonsimple.status = reinterpret_cast<std::uint8_t *>(&nonsimple.values_and_holders[flags_at]);
+    }
+    owned = true;
+// NOLINTNEXTLINE(readability-make-member-function-const)
+PYBIND11_NOINLINE void instance::deallocate_layout() {
+    if (!simple_layout)
+        PyMem_Free(nonsimple.values_and_holders);
+PYBIND11_NOINLINE bool isinstance_generic(handle obj, const std::type_info &tp) {
+    handle type = detail::get_type_handle(tp, false);
+    if (!type)
+        return false;
+    return isinstance(obj, type);
+PYBIND11_NOINLINE std::string error_string() {
+    if (!PyErr_Occurred()) {
+        PyErr_SetString(PyExc_RuntimeError, "Unknown internal error occurred");
+        return "Unknown internal error occurred";
+    }
+    error_scope scope; // Preserve error state
+    std::string errorString;
+    if (scope.type) {
+        errorString += handle(scope.type).attr("__name__").cast<std::string>();
+        errorString += ": ";
+    }
+    if (scope.value)
+        errorString += (std::string) str(scope.value);
+    PyErr_NormalizeException(&scope.type, &scope.value, &scope.trace);
+    if (scope.trace != nullptr)
+        PyException_SetTraceback(scope.value, scope.trace);
+#if !defined(PYPY_VERSION)
+    if (scope.trace) {
+        auto *trace = (PyTracebackObject *) scope.trace;
+        /* Get the deepest trace possible */
+        while (trace->tb_next)
+            trace = trace->tb_next;
+        PyFrameObject *frame = trace->tb_frame;
+        errorString += "\n\nAt:\n";
+        while (frame) {
+#if PY_VERSION_HEX >= 0x03090000
+            PyCodeObject *f_code = PyFrame_GetCode(frame);
+            PyCodeObject *f_code = frame->f_code;
+            Py_INCREF(f_code);
+            int lineno = PyFrame_GetLineNumber(frame);
+            errorString +=
+                "  " + handle(f_code->co_filename).cast<std::string>() +
+                "(" + std::to_string(lineno) + "): " +
+                handle(f_code->co_name).cast<std::string>() + "\n";
+            frame = frame->f_back;
+            Py_DECREF(f_code);
+        }
+    }
+    return errorString;
+PYBIND11_NOINLINE handle get_object_handle(const void *ptr, const detail::type_info *type ) {
+    auto &instances = get_internals().registered_instances;
+    auto range = instances.equal_range(ptr);
+    for (auto it = range.first; it != range.second; ++it) {
+        for (const auto &vh : values_and_holders(it->second)) {
+            if (vh.type == type)
+                return handle((PyObject *) it->second);
+        }
+    }
+    return handle();
+inline PyThreadState *get_thread_state_unchecked() {
+#if defined(PYPY_VERSION)
+    return PyThreadState_GET();
+#elif PY_VERSION_HEX < 0x03000000
+    return _PyThreadState_Current;
+#elif PY_VERSION_HEX < 0x03050000
+    return (PyThreadState*) _Py_atomic_load_relaxed(&_PyThreadState_Current);
+#elif PY_VERSION_HEX < 0x03050200
+    return (PyThreadState*) _PyThreadState_Current.value;
+    return _PyThreadState_UncheckedGet();
+// Forward declarations
+void keep_alive_impl(handle nurse, handle patient);
+inline PyObject *make_new_instance(PyTypeObject *type);
+class type_caster_generic {
+    PYBIND11_NOINLINE explicit type_caster_generic(const std::type_info &type_info)
+        : typeinfo(get_type_info(type_info)), cpptype(&type_info) {}
+    explicit type_caster_generic(const type_info *typeinfo)
+        : typeinfo(typeinfo), cpptype(typeinfo ? typeinfo->cpptype : nullptr) {}
+    bool load(handle src, bool convert) {
+        return load_impl<type_caster_generic>(src, convert);
+    }
+    PYBIND11_NOINLINE static handle cast(const void *_src, return_value_policy policy, handle parent,
+                                         const detail::type_info *tinfo,
+                                         void *(*copy_constructor)(const void *),
+                                         void *(*move_constructor)(const void *),
+                                         const void *existing_holder = nullptr) {
+        if (!tinfo) // no type info: error will be set already
+            return handle();
+        void *src = const_cast<void *>(_src);
+        if (src == nullptr)
+            return none().release();
+        if (handle registered_inst = find_registered_python_instance(src, tinfo))
+            return registered_inst;
+        auto inst = reinterpret_steal<object>(make_new_instance(tinfo->type));
+        auto wrapper = reinterpret_cast<instance *>(inst.ptr());
+        wrapper->owned = false;
+        void *&valueptr = values_and_holders(wrapper).begin()->value_ptr();
+        switch (policy) {
+            case return_value_policy::automatic:
+            case return_value_policy::take_ownership:
+                valueptr = src;
+                wrapper->owned = true;
+                break;
+            case return_value_policy::automatic_reference:
+            case return_value_policy::reference:
+                valueptr = src;
+                wrapper->owned = false;
+                break;
+            case return_value_policy::copy:
+                if (copy_constructor)
+                    valueptr = copy_constructor(src);
+                else {
+#if defined(NDEBUG)
+                    throw cast_error("return_value_policy = copy, but type is "
+                                     "non-copyable! (compile in debug mode for details)");
+                    std::string type_name(tinfo->cpptype->name());
+                    detail::clean_type_id(type_name);
+                    throw cast_error("return_value_policy = copy, but type " +
+                                     type_name + " is non-copyable!");
+                }
+                wrapper->owned = true;
+                break;
+            case return_value_policy::move:
+                if (move_constructor)
+                    valueptr = move_constructor(src);
+                else if (copy_constructor)
+                    valueptr = copy_constructor(src);
+                else {
+#if defined(NDEBUG)
+                    throw cast_error("return_value_policy = move, but type is neither "
+                                     "movable nor copyable! "
+                                     "(compile in debug mode for details)");
+                    std::string type_name(tinfo->cpptype->name());
+                    detail::clean_type_id(type_name);
+                    throw cast_error("return_value_policy = move, but type " +
+                                     type_name + " is neither movable nor copyable!");
+                }
+                wrapper->owned = true;
+                break;
+            case return_value_policy::reference_internal:
+                valueptr = src;
+                wrapper->owned = false;
+                keep_alive_impl(inst, parent);
+                break;
+            default:
+                throw cast_error("unhandled return_value_policy: should not happen!");
+        }
+        tinfo->init_instance(wrapper, existing_holder);
+        return inst.release();
+    }
+    // Base methods for generic caster; there are overridden in copyable_holder_caster
+    void load_value(value_and_holder &&v_h) {
+        auto *&vptr = v_h.value_ptr();
+        // Lazy allocation for unallocated values:
+        if (vptr == nullptr) {
+            auto *type = v_h.type ? v_h.type : typeinfo;
+            if (type->operator_new) {
+                vptr = type->operator_new(type->type_size);
+            } else {
+                #if defined(__cpp_aligned_new) && (!defined(_MSC_VER) || _MSC_VER >= 1912)
+                    if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__)
+                        vptr = ::operator new(type->type_size,
+                                              std::align_val_t(type->type_align));
+                    else
+                #endif
+                vptr = ::operator new(type->type_size);
+            }
+        }
+        value = vptr;
+    }
+    bool try_implicit_casts(handle src, bool convert) {
+        for (auto &cast : typeinfo->implicit_casts) {
+            type_caster_generic sub_caster(*cast.first);
+            if (sub_caster.load(src, convert)) {
+                value = cast.second(sub_caster.value);
+                return true;
+            }
+        }
+        return false;
+    }
+    bool try_direct_conversions(handle src) {
+        for (auto &converter : *typeinfo->direct_conversions) {
+            if (converter(src.ptr(), value))
+                return true;
+        }
+        return false;
+    }
+    void check_holder_compat() {}
+    PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) {
+        auto caster = type_caster_generic(ti);
+        if (caster.load(src, false))
+            return caster.value;
+        return nullptr;
+    }
+    /// Try to load with foreign typeinfo, if available. Used when there is no
+    /// native typeinfo, or when the native one wasn't able to produce a value.
+    PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) {
+        constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID;
+        const auto pytype = type::handle_of(src);
+        if (!hasattr(pytype, local_key))
+            return false;
+        type_info *foreign_typeinfo = reinterpret_borrow<capsule>(getattr(pytype, local_key));
+        // Only consider this foreign loader if actually foreign and is a loader of the correct cpp type
+        if (foreign_typeinfo->module_local_load == &local_load
+            || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype)))
+            return false;
+        if (auto result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) {
+            value = result;
+            return true;
+        }
+        return false;
+    }
+    // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant
+    // bits of code between here and copyable_holder_caster where the two classes need different
+    // logic (without having to resort to virtual inheritance).
+    template <typename ThisT>
+    PYBIND11_NOINLINE bool load_impl(handle src, bool convert) {
+        if (!src) return false;
+        if (!typeinfo) return try_load_foreign_module_local(src);
+        auto &this_ = static_cast<ThisT &>(*this);
+        this_.check_holder_compat();
+        PyTypeObject *srctype = Py_TYPE(src.ptr());
+        // Case 1: If src is an exact type match for the target type then we can reinterpret_cast
+        // the instance's value pointer to the target type:
+        if (srctype == typeinfo->type) {
+            this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+            return true;
+        }
+        // Case 2: We have a derived class
+        if (PyType_IsSubtype(srctype, typeinfo->type)) {
+            auto &bases = all_type_info(srctype);
+            bool no_cpp_mi = typeinfo->simple_type;
+            // Case 2a: the python type is a Python-inherited derived class that inherits from just
+            // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of
+            // the right type and we can use reinterpret_cast.
+            // (This is essentially the same as case 2b, but because not using multiple inheritance
+            // is extremely common, we handle it specially to avoid the loop iterator and type
+            // pointer lookup overhead)
+            if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) {
+                this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+                return true;
+            }
+            // Case 2b: the python type inherits from multiple C++ bases.  Check the bases to see if
+            // we can find an exact match (or, for a simple C++ type, an inherited match); if so, we
+            // can safely reinterpret_cast to the relevant pointer.
+            if (bases.size() > 1) {
+                for (auto base : bases) {
+                    if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type) : base->type == typeinfo->type) {
+                        this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder(base));
+                        return true;
+                    }
+                }
+            }
+            // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type match
+            // in the registered bases, above, so try implicit casting (needed for proper C++ casting
+            // when MI is involved).
+            if (this_.try_implicit_casts(src, convert))
+                return true;
+        }
+        // Perform an implicit conversion
+        if (convert) {
+            for (auto &converter : typeinfo->implicit_conversions) {
+                auto temp = reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
+                if (load_impl<ThisT>(temp, false)) {
+                    loader_life_support::add_patient(temp);
+                    return true;
+                }
+            }
+            if (this_.try_direct_conversions(src))
+                return true;
+        }
+        // Failed to match local typeinfo. Try again with global.
+        if (typeinfo->module_local) {
+            if (auto gtype = get_global_type_info(*typeinfo->cpptype)) {
+                typeinfo = gtype;
+                return load(src, false);
+            }
+        }
+        // Global typeinfo has precedence over foreign module_local
+        if (try_load_foreign_module_local(src)) {
+           return true;
+        }
+        // Custom converters didn't take None, now we convert None to nullptr.
+        if (src.is_none()) {
+           // Defer accepting None to other overloads (if we aren't in convert mode):
+           if (!convert) return false;
+           value = nullptr;
+           return true;
+        }
+        return false;
+    }
+    // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast
+    // isn't needed or can't be used.  If the type is unknown, sets the error and returns a pair
+    // with .second = nullptr.  (p.first = nullptr is not an error: it becomes None).
+    PYBIND11_NOINLINE static std::pair<const void *, const type_info *> src_and_type(
+            const void *src, const std::type_info &cast_type, const std::type_info *rtti_type = nullptr) {
+        if (auto *tpi = get_type_info(cast_type))
+            return {src, const_cast<const type_info *>(tpi)};
+        // Not found, set error:
+        std::string tname = rtti_type ? rtti_type->name() : cast_type.name();
+        detail::clean_type_id(tname);
+        std::string msg = "Unregistered type : " + tname;
+        PyErr_SetString(PyExc_TypeError, msg.c_str());
+        return {nullptr, nullptr};
+    }
+    const type_info *typeinfo = nullptr;
+    const std::type_info *cpptype = nullptr;
+    void *value = nullptr;
+ * Determine suitable casting operator for pointer-or-lvalue-casting type casters.  The type caster
+ * needs to provide `operator T*()` and `operator T&()` operators.
+ *
+ * If the type supports moving the value away via an `operator T&&() &&` method, it should use
+ * `movable_cast_op_type` instead.
+ */
+template <typename T>
+using cast_op_type =
+    conditional_t<std::is_pointer<remove_reference_t<T>>::value,
+        typename std::add_pointer<intrinsic_t<T>>::type,
+        typename std::add_lvalue_reference<intrinsic_t<T>>::type>;
+ * Determine suitable casting operator for a type caster with a movable value.  Such a type caster
+ * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`.  The latter will be
+ * called in appropriate contexts where the value can be moved rather than copied.
+ *
+ * These operator are automatically provided when using the PYBIND11_TYPE_CASTER macro.
+ */
+template <typename T>
+using movable_cast_op_type =
+    conditional_t<std::is_pointer<typename std::remove_reference<T>::type>::value,
+        typename std::add_pointer<intrinsic_t<T>>::type,
+    conditional_t<std::is_rvalue_reference<T>::value,
+        typename std::add_rvalue_reference<intrinsic_t<T>>::type,
+        typename std::add_lvalue_reference<intrinsic_t<T>>::type>>;
+// std::is_copy_constructible isn't quite enough: it lets std::vector<T> (and similar) through when
+// T is non-copyable, but code containing such a copy constructor fails to actually compile.
+template <typename T, typename SFINAE = void> struct is_copy_constructible : std::is_copy_constructible<T> {};
+// Specialization for types that appear to be copy constructible but also look like stl containers
+// (we specifically check for: has `value_type` and `reference` with `reference = value_type&`): if
+// so, copy constructability depends on whether the value_type is copy constructible.
+template <typename Container> struct is_copy_constructible<Container, enable_if_t<all_of<
+        std::is_copy_constructible<Container>,
+        std::is_same<typename Container::value_type &, typename Container::reference>,
+        // Avoid infinite recursion
+        negation<std::is_same<Container, typename Container::value_type>>
+    >::value>> : is_copy_constructible<typename Container::value_type> {};
+// Likewise for std::pair
+// (after C++17 it is mandatory that the copy constructor not exist when the two types aren't themselves
+// copy constructible, but this can not be relied upon when T1 or T2 are themselves containers).
+template <typename T1, typename T2> struct is_copy_constructible<std::pair<T1, T2>>
+    : all_of<is_copy_constructible<T1>, is_copy_constructible<T2>> {};
+// The same problems arise with std::is_copy_assignable, so we use the same workaround.
+template <typename T, typename SFINAE = void> struct is_copy_assignable : std::is_copy_assignable<T> {};
+template <typename Container> struct is_copy_assignable<Container, enable_if_t<all_of<
+        std::is_copy_assignable<Container>,
+        std::is_same<typename Container::value_type &, typename Container::reference>
+    >::value>> : is_copy_assignable<typename Container::value_type> {};
+template <typename T1, typename T2> struct is_copy_assignable<std::pair<T1, T2>>
+    : all_of<is_copy_assignable<T1>, is_copy_assignable<T2>> {};
+// polymorphic_type_hook<itype>::get(src, tinfo) determines whether the object pointed
+// to by `src` actually is an instance of some class derived from `itype`.
+// If so, it sets `tinfo` to point to the std::type_info representing that derived
+// type, and returns a pointer to the start of the most-derived object of that type
+// (in which `src` is a subobject; this will be the same address as `src` in most
+// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src`
+// and leaves `tinfo` at its default value of nullptr.
+// The default polymorphic_type_hook just returns src. A specialization for polymorphic
+// types determines the runtime type of the passed object and adjusts the this-pointer
+// appropriately via dynamic_cast<void*>. This is what enables a C++ Animal* to appear
+// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is
+// registered with pybind11, and this Animal is in fact a Dog).
+// You may specialize polymorphic_type_hook yourself for types that want to appear
+// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern
+// in performance-sensitive applications, used most notably in LLVM.)
+// polymorphic_type_hook_base allows users to specialize polymorphic_type_hook with
+// std::enable_if. User provided specializations will always have higher priority than
+// the default implementation and specialization provided in polymorphic_type_hook_base.
+template <typename itype, typename SFINAE = void>
+struct polymorphic_type_hook_base
+    static const void *get(const itype *src, const std::type_info*&) { return src; }
+template <typename itype>
+struct polymorphic_type_hook_base<itype, detail::enable_if_t<std::is_polymorphic<itype>::value>>
+    static const void *get(const itype *src, const std::type_info*& type) {
+        type = src ? &typeid(*src) : nullptr;
+        return dynamic_cast<const void*>(src);
+    }
+template <typename itype, typename SFINAE = void>
+struct polymorphic_type_hook : public polymorphic_type_hook_base<itype> {};
+/// Generic type caster for objects stored on the heap
+template <typename type> class type_caster_base : public type_caster_generic {
+    using itype = intrinsic_t<type>;
+    static constexpr auto name = const_name<type>();
+    type_caster_base() : type_caster_base(typeid(type)) { }
+    explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) { }
+    static handle cast(const itype &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference)
+            policy = return_value_policy::copy;
+        return cast(&src, policy, parent);
+    }
+    static handle cast(itype &&src, return_value_policy, handle parent) {
+        return cast(&src, return_value_policy::move, parent);
+    }
+    // Returns a (pointer, type_info) pair taking care of necessary type lookup for a
+    // polymorphic type (using RTTI by default, but can be overridden by specializing
+    // polymorphic_type_hook). If the instance isn't derived, returns the base version.
+    static std::pair<const void *, const type_info *> src_and_type(const itype *src) {
+        auto &cast_type = typeid(itype);
+        const std::type_info *instance_type = nullptr;
+        const void *vsrc = polymorphic_type_hook<itype>::get(src, instance_type);
+        if (instance_type && !same_type(cast_type, *instance_type)) {
+            // This is a base pointer to a derived type. If the derived type is registered
+            // with pybind11, we want to make the full derived object available.
+            // In the typical case where itype is polymorphic, we get the correct
+            // derived pointer (which may be != base pointer) by a dynamic_cast to
+            // most derived type. If itype is not polymorphic, we won't get here
+            // except via a user-provided specialization of polymorphic_type_hook,
+            // and the user has promised that no this-pointer adjustment is
+            // required in that case, so it's OK to use static_cast.
+            if (const auto *tpi = get_type_info(*instance_type))
+                return {vsrc, tpi};
+        }
+        // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so
+        // don't do a cast
+        return type_caster_generic::src_and_type(src, cast_type, instance_type);
+    }
+    static handle cast(const itype *src, return_value_policy policy, handle parent) {
+        auto st = src_and_type(src);
+        return type_caster_generic::cast(
+            st.first, policy, parent, st.second,
+            make_copy_constructor(src), make_move_constructor(src));
+    }
+    static handle cast_holder(const itype *src, const void *holder) {
+        auto st = src_and_type(src);
+        return type_caster_generic::cast(
+            st.first, return_value_policy::take_ownership, {}, st.second,
+            nullptr, nullptr, holder);
+    }
+    template <typename T> using cast_op_type = detail::cast_op_type<T>;
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator itype*() { return (type *) value; }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); }
+    using Constructor = void *(*)(const void *);
+    /* Only enabled when the types are {copy,move}-constructible *and* when the type
+       does not have a private operator new implementation. A comma operator is used in the decltype
+       argument to apply SFINAE to the public copy/move constructors.*/
+    template <typename T, typename = enable_if_t<is_copy_constructible<T>::value>>
+    static auto make_copy_constructor(const T *) -> decltype(new T(std::declval<const T>()), Constructor{}) {
+        return [](const void *arg) -> void * {
+            return new T(*reinterpret_cast<const T *>(arg));
+        };
+    }
+    template <typename T, typename = enable_if_t<std::is_move_constructible<T>::value>>
+    static auto make_move_constructor(const T *) -> decltype(new T(std::declval<T&&>()), Constructor{}) {
+        return [](const void *arg) -> void * {
+            return new T(std::move(*const_cast<T *>(reinterpret_cast<const T *>(arg))));
+        };
+    }
+    static Constructor make_copy_constructor(...) { return nullptr; }
+    static Constructor make_move_constructor(...) { return nullptr; }
diff --git a/wrap/pybind11/include/pybind11/detail/typeid.h b/wrap/pybind11/include/pybind11/detail/typeid.h
index 148889ffef..39ba8ce0f7 100644
--- a/wrap/pybind11/include/pybind11/detail/typeid.h
+++ b/wrap/pybind11/include/pybind11/detail/typeid.h
@@ -29,7 +29,7 @@ inline void erase_all(std::string &string, const std::string &search) {
-PYBIND11_NOINLINE inline void clean_type_id(std::string &name) {
+PYBIND11_NOINLINE void clean_type_id(std::string &name) {
 #if defined(__GNUG__)
     int status = 0;
     std::unique_ptr<char, void (*)(void *)> res {
diff --git a/wrap/pybind11/include/pybind11/eigen.h b/wrap/pybind11/include/pybind11/eigen.h
index 12ce9bd3e6..696099fa65 100644
--- a/wrap/pybind11/include/pybind11/eigen.h
+++ b/wrap/pybind11/include/pybind11/eigen.h
@@ -9,33 +9,31 @@
 #pragma once
-#include "numpy.h"
+/* HINT: To suppress warnings originating from the Eigen headers, use -isystem.
+   See also:
+       https://stackoverflow.com/questions/2579576/i-dir-vs-isystem-dir
+       https://stackoverflow.com/questions/1741816/isystem-for-ms-visual-studio-c-compiler
-#if defined(__INTEL_COMPILER)
-#  pragma warning(disable: 1682) // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem)
-#elif defined(__GNUG__) || defined(__clang__)
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wconversion"
-#  pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#  ifdef __clang__
-//   Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings with -Wdeprecated
-//   under Clang, so disable that warning here:
-#    pragma GCC diagnostic ignored "-Wdeprecated"
-#  endif
-#  if __GNUC__ >= 7
-#    pragma GCC diagnostic ignored "-Wint-in-bool-context"
-#  endif
+#include "numpy.h"
+// The C4127 suppression was introduced for Eigen 3.4.0. In theory we could
+// make it version specific, or even remove it later, but considering that
+// 1. C4127 is generally far more distracting than useful for modern template code, and
+// 2. we definitely want to ignore any MSVC warnings originating from Eigen code,
+// it is probably best to keep this around indefinitely.
 #if defined(_MSC_VER)
 #  pragma warning(push)
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
-#  pragma warning(disable: 4996) // warning C4996: std::unary_negate is deprecated in C++17
+#  pragma warning(disable: 4127) // C4127: conditional expression is constant
 #include <Eigen/Core>
 #include <Eigen/SparseCore>
+#if defined(_MSC_VER)
+#  pragma warning(pop)
 // Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit
 // move constructors that break things.  We could detect this an explicitly copy, but an extra copy
 // of matrices seems highly undesirable.
@@ -52,8 +50,12 @@ PYBIND11_NAMESPACE_BEGIN(detail)
 using EigenIndex = Eigen::Index;
+template<typename Scalar, int Flags, typename StorageIndex>
+using EigenMapSparseMatrix = Eigen::Map<Eigen::SparseMatrix<Scalar, Flags, StorageIndex>>;
+template<typename Scalar, int Flags, typename StorageIndex>
+using EigenMapSparseMatrix = Eigen::MappedSparseMatrix<Scalar, Flags, StorageIndex>;
 // Matches Eigen::Map, Eigen::Ref, blocks, etc:
@@ -77,18 +79,17 @@ template <bool EigenRowMajor> struct EigenConformable {
     EigenDStride stride{0, 0};      // Only valid if negativestrides is false!
     bool negativestrides = false;   // If true, do not use stride!
+    // NOLINTNEXTLINE(google-explicit-constructor)
     EigenConformable(bool fits = false) : conformable{fits} {}
     // Matrix type:
     EigenConformable(EigenIndex r, EigenIndex c,
             EigenIndex rstride, EigenIndex cstride) :
-        conformable{true}, rows{r}, cols{c} {
-        // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
-        if (rstride < 0 || cstride < 0) {
-            negativestrides = true;
-        } else {
-            stride = {EigenRowMajor ? rstride : cstride /* outer stride */,
-                      EigenRowMajor ? cstride : rstride /* inner stride */ };
-        }
+        conformable{true}, rows{r}, cols{c},
+        //TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
+        stride{EigenRowMajor ? (rstride > 0 ? rstride : 0) : (cstride > 0 ? cstride : 0) /* outer stride */,
+               EigenRowMajor ? (cstride > 0 ? cstride : 0) : (rstride > 0 ? rstride : 0) /* inner stride */ },
+        negativestrides{rstride < 0 || cstride < 0} {
     // Vector type:
     EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride)
@@ -104,6 +105,7 @@ template <bool EigenRowMajor> struct EigenConformable {
             (props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer() ||
                 (EigenRowMajor ? rows : cols) == 1);
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator bool() const { return conformable; }
@@ -153,7 +155,8 @@ template <typename Type_> struct EigenProps {
                 np_cols = a.shape(1),
                 np_rstride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar)),
                 np_cstride = a.strides(1) / static_cast<ssize_t>(sizeof(Scalar));
-            if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols))
+            if ((PYBIND11_SILENCE_MSVC_C4127(fixed_rows) && np_rows != rows) ||
+                (PYBIND11_SILENCE_MSVC_C4127(fixed_cols) && np_cols != cols))
                 return false;
             return {np_rows, np_cols, np_rstride, np_cstride};
@@ -165,25 +168,22 @@ template <typename Type_> struct EigenProps {
               stride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar));
         if (vector) { // Eigen type is a compile-time vector
-            if (fixed && size != n)
+            if (PYBIND11_SILENCE_MSVC_C4127(fixed) && size != n)
                 return false; // Vector size mismatch
             return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride};
-        else if (fixed) {
+        if (fixed) {
             // The type has a fixed size, but is not a vector: abort
             return false;
-        else if (fixed_cols) {
+        if (fixed_cols) {
             // Since this isn't a vector, cols must be != 1.  We allow this only if it exactly
             // equals the number of elements (rows is Dynamic, and so 1 row is allowed).
             if (cols != n) return false;
             return {1, n, stride};
-        }
-        else {
-            // Otherwise it's either fully dynamic, or column dynamic; both become a column vector
-            if (fixed_rows && rows != n) return false;
+        } // Otherwise it's either fully dynamic, or column dynamic; both become a column vector
+            if (PYBIND11_SILENCE_MSVC_C4127(fixed_rows) && rows != n) return false;
             return {n, 1, stride};
-        }
     static constexpr bool show_writeable = is_eigen_dense_map<Type>::value && is_eigen_mutable_map<Type>::value;
@@ -192,20 +192,20 @@ template <typename Type_> struct EigenProps {
     static constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major;
     static constexpr auto descriptor =
-        _("numpy.ndarray[") + npy_format_descriptor<Scalar>::name +
-        _("[")  + _<fixed_rows>(_<(size_t) rows>(), _("m")) +
-        _(", ") + _<fixed_cols>(_<(size_t) cols>(), _("n")) +
-        _("]") +
+        const_name("numpy.ndarray[") + npy_format_descriptor<Scalar>::name +
+        const_name("[")  + const_name<fixed_rows>(const_name<(size_t) rows>(), const_name("m")) +
+        const_name(", ") + const_name<fixed_cols>(const_name<(size_t) cols>(), const_name("n")) +
+        const_name("]") +
         // For a reference type (e.g. Ref<MatrixXd>) we have other constraints that might need to be
         // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride
         // options, possibly f_contiguous or c_contiguous.  We include them in the descriptor output
         // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to
         // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you
         // *gave* a numpy.ndarray of the right type and dimensions.
-        _<show_writeable>(", flags.writeable", "") +
-        _<show_c_contiguous>(", flags.c_contiguous", "") +
-        _<show_f_contiguous>(", flags.f_contiguous", "") +
-        _("]");
+        const_name<show_writeable>(", flags.writeable", "") +
+        const_name<show_c_contiguous>(", flags.c_contiguous", "") +
+        const_name<show_f_contiguous>(", flags.f_contiguous", "") +
+        const_name("]");
 // Casts an Eigen type to numpy array.  If given a base, the numpy array references the src data,
@@ -344,8 +344,11 @@ struct type_caster<Type, enable_if_t<is_eigen_dense_plain<Type>::value>> {
     static constexpr auto name = props::descriptor;
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator Type*() { return &value; }
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator Type&() { return value; }
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator Type&&() && { return std::move(value); }
     template <typename T> using cast_op_type = movable_cast_op_type<T>;
@@ -432,7 +435,7 @@ struct type_caster<
         if (!need_copy) {
             // We don't need a converting copy, but we also need to check whether the strides are
             // compatible with the Ref's stride requirements
-            Array aref = reinterpret_borrow<Array>(src);
+            auto aref = reinterpret_borrow<Array>(src);
             if (aref && (!need_writeable || aref.writeable())) {
                 fits = props::conformable(aref);
@@ -469,7 +472,9 @@ struct type_caster<
         return true;
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator Type*() { return ref.get(); }
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator Type&() { return *ref; }
     template <typename _T> using cast_op_type = pybind11::detail::cast_op_type<_T>;
@@ -539,9 +544,9 @@ struct type_caster<Type, enable_if_t<is_eigen_other<Type>::value>> {
 template<typename Type>
 struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
-    typedef typename Type::Scalar Scalar;
-    typedef remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())> StorageIndex;
-    typedef typename Type::Index Index;
+    using Scalar = typename Type::Scalar;
+    using StorageIndex = remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())>;
+    using Index = typename Type::Index;
     static constexpr bool rowMajor = Type::IsRowMajor;
     bool load(handle src, bool) {
@@ -549,7 +554,7 @@ struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
             return false;
         auto obj = reinterpret_borrow<object>(src);
-        object sparse_module = module::import("scipy.sparse");
+        object sparse_module = module_::import("scipy.sparse");
         object matrix_type = sparse_module.attr(
             rowMajor ? "csr_matrix" : "csc_matrix");
@@ -570,7 +575,9 @@ struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
         if (!values || !innerIndices || !outerIndices)
             return false;
-        value = Eigen::MappedSparseMatrix<Scalar, Type::Flags, StorageIndex>(
+        value = EigenMapSparseMatrix<Scalar,
+                                     Type::Flags & (Eigen::RowMajor | Eigen::ColMajor),
+                                     StorageIndex>(
             shape[0].cast<Index>(), shape[1].cast<Index>(), nnz,
             outerIndices.mutable_data(), innerIndices.mutable_data(), values.mutable_data());
@@ -580,7 +587,7 @@ struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
     static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
-        object matrix_type = module::import("scipy.sparse").attr(
+        object matrix_type = module_::import("scipy.sparse").attr(
             rowMajor ? "csr_matrix" : "csc_matrix");
         array data(src.nonZeros(), src.valuePtr());
@@ -593,15 +600,9 @@ struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
-    PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[")
-            + npy_format_descriptor<Scalar>::name + _("]"));
+    PYBIND11_TYPE_CASTER(Type, const_name<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[")
+            + npy_format_descriptor<Scalar>::name + const_name("]"));
-#if defined(__GNUG__) || defined(__clang__)
-#  pragma GCC diagnostic pop
-#elif defined(_MSC_VER)
-#  pragma warning(pop)
diff --git a/wrap/pybind11/include/pybind11/embed.h b/wrap/pybind11/include/pybind11/embed.h
index eae86c714c..9ab1ce9c0a 100644
--- a/wrap/pybind11/include/pybind11/embed.h
+++ b/wrap/pybind11/include/pybind11/embed.h
@@ -12,6 +12,9 @@
 #include "pybind11.h"
 #include "eval.h"
+#include <memory>
+#include <vector>
 #if defined(PYPY_VERSION)
 #  error Embedding the interpreter is not supported with PyPy
@@ -45,27 +48,23 @@
  \endrst */
-#define PYBIND11_EMBEDDED_MODULE(name, variable)                              \
-    static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &);    \
-    static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() {        \
-        auto m = pybind11::module(PYBIND11_TOSTRING(name));                   \
-        try {                                                                 \
-            PYBIND11_CONCAT(pybind11_init_, name)(m);                         \
-            return m.ptr();                                                   \
-        } catch (pybind11::error_already_set &e) {                            \
-            PyErr_SetString(PyExc_ImportError, e.what());                     \
-            return nullptr;                                                   \
-        } catch (const std::exception &e) {                                   \
-            PyErr_SetString(PyExc_ImportError, e.what());                     \
-            return nullptr;                                                   \
-        }                                                                     \
-    }                                                                         \
-    PYBIND11_EMBEDDED_MODULE_IMPL(name)                                       \
-    pybind11::detail::embedded_module PYBIND11_CONCAT(pybind11_module_, name) \
-                              (PYBIND11_TOSTRING(name),             \
-                               PYBIND11_CONCAT(pybind11_init_impl_, name));   \
-    void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable)
+#define PYBIND11_EMBEDDED_MODULE(name, variable)                                                  \
+    static ::pybind11::module_::module_def PYBIND11_CONCAT(pybind11_module_def_, name);           \
+    static void PYBIND11_CONCAT(pybind11_init_, name)(::pybind11::module_ &);                     \
+    static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() {                            \
+        auto m = ::pybind11::module_::create_extension_module(                                    \
+            PYBIND11_TOSTRING(name), nullptr, &PYBIND11_CONCAT(pybind11_module_def_, name));      \
+        try {                                                                                     \
+            PYBIND11_CONCAT(pybind11_init_, name)(m);                                             \
+            return m.ptr();                                                                       \
+        }                                                                                         \
+        PYBIND11_CATCH_INIT_EXCEPTIONS                                                            \
+    }                                                                                             \
+    PYBIND11_EMBEDDED_MODULE_IMPL(name)                                                           \
+    ::pybind11::detail::embedded_module PYBIND11_CONCAT(pybind11_module_, name)(                  \
+        PYBIND11_TOSTRING(name), PYBIND11_CONCAT(pybind11_init_impl_, name));                     \
+    void PYBIND11_CONCAT(pybind11_init_, name)(::pybind11::module_                                \
+                                               & variable) // NOLINT(bugprone-macro-parentheses)
@@ -78,7 +77,7 @@ struct embedded_module {
     using init_t = void (*)();
     embedded_module(const char *name, init_t init) {
-        if (Py_IsInitialized())
+        if (Py_IsInitialized() != 0)
             pybind11_fail("Can't add new modules after the interpreter has been initialized");
         auto result = PyImport_AppendInittab(name, init);
@@ -87,29 +86,118 @@ struct embedded_module {
+struct wide_char_arg_deleter {
+    void operator()(wchar_t *ptr) const {
+#if PY_VERSION_HEX >= 0x030500f0
+        // API docs: https://docs.python.org/3/c-api/sys.html#c.Py_DecodeLocale
+        PyMem_RawFree(ptr);
+        delete[] ptr;
+    }
+inline wchar_t *widen_chars(const char *safe_arg) {
+#if PY_VERSION_HEX >= 0x030500f0
+    wchar_t *widened_arg = Py_DecodeLocale(safe_arg, nullptr);
+    wchar_t *widened_arg = nullptr;
+// warning C4996: 'mbstowcs': This function or variable may be unsafe.
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable:4996)
+    size_t count = std::strlen(safe_arg);
+#    else
+    size_t count = std::mbstowcs(nullptr, safe_arg, 0);
+#    endif
+    if (count != static_cast<size_t>(-1)) {
+        widened_arg = new wchar_t[count + 1];
+        std::mbstowcs(widened_arg, safe_arg, count + 1);
+    }
+#if defined(_MSC_VER)
+#pragma warning(pop)
+    return widened_arg;
+/// Python 2.x/3.x-compatible version of `PySys_SetArgv`
+inline void set_interpreter_argv(int argc, const char *const *argv, bool add_program_dir_to_path) {
+    // Before it was special-cased in python 3.8, passing an empty or null argv
+    // caused a segfault, so we have to reimplement the special case ourselves.
+    bool special_case = (argv == nullptr || argc <= 0);
+    const char *const empty_argv[]{"\0"};
+    const char *const *safe_argv = special_case ? empty_argv : argv;
+    if (special_case)
+        argc = 1;
+    auto argv_size = static_cast<size_t>(argc);
+    // SetArgv* on python 3 takes wchar_t, so we have to convert.
+    std::unique_ptr<wchar_t *[]> widened_argv(new wchar_t *[argv_size]);
+    std::vector<std::unique_ptr<wchar_t[], wide_char_arg_deleter>> widened_argv_entries;
+    widened_argv_entries.reserve(argv_size);
+    for (size_t ii = 0; ii < argv_size; ++ii) {
+        widened_argv_entries.emplace_back(widen_chars(safe_argv[ii]));
+        if (!widened_argv_entries.back()) {
+            // A null here indicates a character-encoding failure or the python
+            // interpreter out of memory. Give up.
+            return;
+        }
+        widened_argv[ii] = widened_argv_entries.back().get();
+    }
+    auto pysys_argv = widened_argv.get();
+    // python 2.x
+    std::vector<std::string> strings{safe_argv, safe_argv + argv_size};
+    std::vector<char *> char_strings{argv_size};
+    for (std::size_t i = 0; i < argv_size; ++i)
+        char_strings[i] = &strings[i][0];
+    char **pysys_argv = char_strings.data();
+    PySys_SetArgvEx(argc, pysys_argv, static_cast<int>(add_program_dir_to_path));
 /** \rst
     Initialize the Python interpreter. No other pybind11 or CPython API functions can be
     called before this is done; with the exception of `PYBIND11_EMBEDDED_MODULE`. The
-    optional parameter can be used to skip the registration of signal handlers (see the
-    `Python documentation`_ for details). Calling this function again after the interpreter
-    has already been initialized is a fatal error.
+    optional `init_signal_handlers` parameter can be used to skip the registration of
+    signal handlers (see the `Python documentation`_ for details). Calling this function
+    again after the interpreter has already been initialized is a fatal error.
     If initializing the Python interpreter fails, then the program is terminated.  (This
     is controlled by the CPython runtime and is an exception to pybind11's normal behavior
     of throwing exceptions on errors.)
+    The remaining optional parameters, `argc`, `argv`, and `add_program_dir_to_path` are
+    used to populate ``sys.argv`` and ``sys.path``.
+    See the |PySys_SetArgvEx documentation|_ for details.
     .. _Python documentation: https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx
+    .. |PySys_SetArgvEx documentation| replace:: ``PySys_SetArgvEx`` documentation
+    .. _PySys_SetArgvEx documentation: https://docs.python.org/3/c-api/init.html#c.PySys_SetArgvEx
  \endrst */
-inline void initialize_interpreter(bool init_signal_handlers = true) {
-    if (Py_IsInitialized())
+inline void initialize_interpreter(bool init_signal_handlers = true,
+                                   int argc = 0,
+                                   const char *const *argv = nullptr,
+                                   bool add_program_dir_to_path = true) {
+    if (Py_IsInitialized() != 0)
         pybind11_fail("The interpreter is already running");
     Py_InitializeEx(init_signal_handlers ? 1 : 0);
-    // Make .py files in the working directory available by default
-    module::import("sys").attr("path").cast<list>().append(".");
+    detail::set_interpreter_argv(argc, argv, add_program_dir_to_path);
 /** \rst
@@ -171,6 +259,8 @@ inline void finalize_interpreter() {
     Scope guard version of `initialize_interpreter` and `finalize_interpreter`.
     This a move-only guard and only a single instance can exist.
+    See `initialize_interpreter` for a discussion of its constructor arguments.
     .. code-block:: cpp
         #include <pybind11/embed.h>
@@ -182,8 +272,11 @@ inline void finalize_interpreter() {
  \endrst */
 class scoped_interpreter {
-    scoped_interpreter(bool init_signal_handlers = true) {
-        initialize_interpreter(init_signal_handlers);
+    explicit scoped_interpreter(bool init_signal_handlers = true,
+                                int argc = 0,
+                                const char *const *argv = nullptr,
+                                bool add_program_dir_to_path = true) {
+        initialize_interpreter(init_signal_handlers, argc, argv, add_program_dir_to_path);
     scoped_interpreter(const scoped_interpreter &) = delete;
diff --git a/wrap/pybind11/include/pybind11/eval.h b/wrap/pybind11/include/pybind11/eval.h
index ba82cf42ae..4248551e9b 100644
--- a/wrap/pybind11/include/pybind11/eval.h
+++ b/wrap/pybind11/include/pybind11/eval.h
@@ -1,5 +1,5 @@
-    pybind11/exec.h: Support for evaluating Python expressions and statements
+    pybind11/eval.h: Support for evaluating Python expressions and statements
     from strings and files
     Copyright (c) 2016 Klemens Morgenstern <klemens.morgenstern@ed-chemnitz.de> and
@@ -11,9 +11,27 @@
 #pragma once
+#include <utility>
 #include "pybind11.h"
+inline void ensure_builtins_in_globals(object &global) {
+    #if defined(PYPY_VERSION) || PY_VERSION_HEX < 0x03080000
+        // Running exec and eval on Python 2 and 3 adds `builtins` module under
+        // `__builtins__` key to globals if not yet present.
+        // Python 3.8 made PyRun_String behave similarly. Let's also do that for
+        // older versions, for consistency. This was missing from PyPy3.8 7.3.7.
+        if (!global.contains("__builtins__"))
+            global["__builtins__"] = module_::import(PYBIND11_BUILTINS_MODULE);
+    #else
+        (void) global;
+    #endif
 enum eval_mode {
     /// Evaluate a string containing an isolated expression
@@ -27,15 +45,17 @@ enum eval_mode {
 template <eval_mode mode = eval_expr>
-object eval(str expr, object global = globals(), object local = object()) {
+object eval(const str &expr, object global = globals(), object local = object()) {
     if (!local)
         local = global;
+    detail::ensure_builtins_in_globals(global);
     /* PyRun_String does not accept a PyObject / encoding specifier,
        this seems to be the only alternative */
     std::string buffer = "# -*- coding: utf-8 -*-\n" + (std::string) expr;
-    int start;
+    int start = 0;
     switch (mode) {
         case eval_expr:             start = Py_eval_input;   break;
         case eval_single_statement: start = Py_single_input; break;
@@ -52,13 +72,13 @@ object eval(str expr, object global = globals(), object local = object()) {
 template <eval_mode mode = eval_expr, size_t N>
 object eval(const char (&s)[N], object global = globals(), object local = object()) {
     /* Support raw string literals by removing common leading whitespace */
-    auto expr = (s[0] == '\n') ? str(module::import("textwrap").attr("dedent")(s))
+    auto expr = (s[0] == '\n') ? str(module_::import("textwrap").attr("dedent")(s))
                                : str(s);
     return eval<mode>(expr, global, local);
-inline void exec(str expr, object global = globals(), object local = object()) {
-    eval<eval_statements>(expr, global, local);
+inline void exec(const str &expr, object global = globals(), object local = object()) {
+    eval<eval_statements>(expr, std::move(global), std::move(local));
 template <size_t N>
@@ -66,7 +86,7 @@ void exec(const char (&s)[N], object global = globals(), object local = object()
     eval<eval_statements>(s, global, local);
-#if defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x3000000
+#if defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x03000000
 template <eval_mode mode = eval_statements>
 object eval_file(str, object, object) {
     pybind11_fail("eval_file not supported in PyPy3. Use eval");
@@ -85,7 +105,9 @@ object eval_file(str fname, object global = globals(), object local = object())
     if (!local)
         local = global;
-    int start;
+    detail::ensure_builtins_in_globals(global);
+    int start = 0;
     switch (mode) {
         case eval_expr:             start = Py_eval_input;   break;
         case eval_single_statement: start = Py_single_input; break;
@@ -114,6 +136,15 @@ object eval_file(str fname, object global = globals(), object local = object())
         pybind11_fail("File \"" + fname_str + "\" could not be opened!");
+    // In Python2, this should be encoded by getfilesystemencoding.
+    // We don't boher setting it since Python2 is past EOL anyway.
+    // See PR#3233
+#if PY_VERSION_HEX >= 0x03000000
+    if (!global.contains("__file__")) {
+        global["__file__"] = std::move(fname);
+    }
 #if PY_VERSION_HEX < 0x03000000 && defined(PYPY_VERSION)
     PyObject *result = PyRun_File(f, fname_str.c_str(), start, global.ptr(),
diff --git a/wrap/pybind11/include/pybind11/functional.h b/wrap/pybind11/include/pybind11/functional.h
index 57b6cd210f..7912aef175 100644
--- a/wrap/pybind11/include/pybind11/functional.h
+++ b/wrap/pybind11/include/pybind11/functional.h
@@ -43,22 +43,43 @@ struct type_caster<std::function<Return(Args...)>> {
            captured variables), in which case the roundtrip can be avoided.
         if (auto cfunc = func.cpp_function()) {
-            auto c = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(cfunc.ptr()));
-            auto rec = (function_record *) c;
-            if (rec && rec->is_stateless &&
-                    same_type(typeid(function_type), *reinterpret_cast<const std::type_info *>(rec->data[1]))) {
-                struct capture { function_type f; };
-                value = ((capture *) &rec->data)->f;
-                return true;
+            auto cfunc_self = PyCFunction_GET_SELF(cfunc.ptr());
+            if (isinstance<capsule>(cfunc_self)) {
+                auto c = reinterpret_borrow<capsule>(cfunc_self);
+                auto rec = (function_record *) c;
+                while (rec != nullptr) {
+                    if (rec->is_stateless
+                        && same_type(typeid(function_type),
+                                     *reinterpret_cast<const std::type_info *>(rec->data[1]))) {
+                        struct capture {
+                            function_type f;
+                        };
+                        value = ((capture *) &rec->data)->f;
+                        return true;
+                    }
+                    rec = rec->next;
+                }
+            // PYPY segfaults here when passing builtin function like sum.
+            // Raising an fail exception here works to prevent the segfault, but only on gcc.
+            // See PR #1413 for full details
         // ensure GIL is held during functor destruction
         struct func_handle {
             function f;
-            func_handle(function&& f_) : f(std::move(f_)) {}
-            func_handle(const func_handle&) = default;
+#if !(defined(_MSC_VER) && _MSC_VER == 1916 && defined(PYBIND11_CPP17))
+            // This triggers a syntax error under very special conditions (very weird indeed).
+            explicit
+            func_handle(function &&f_) noexcept : f(std::move(f_)) {}
+            func_handle(const func_handle &f_) { operator=(f_); }
+            func_handle &operator=(const func_handle &f_) {
+                gil_scoped_acquire acq;
+                f = f_.f;
+                return *this;
+            }
             ~func_handle() {
                 gil_scoped_acquire acq;
                 function kill_f(std::move(f));
@@ -68,7 +89,7 @@ struct type_caster<std::function<Return(Args...)>> {
         // to emulate 'move initialization capture' in C++11
         struct func_wrapper {
             func_handle hfunc;
-            func_wrapper(func_handle&& hf): hfunc(std::move(hf)) {}
+            explicit func_wrapper(func_handle &&hf) noexcept : hfunc(std::move(hf)) {}
             Return operator()(Args... args) const {
                 gil_scoped_acquire acq;
                 object retval(hfunc.f(std::forward<Args>(args)...));
@@ -89,12 +110,11 @@ struct type_caster<std::function<Return(Args...)>> {
         auto result = f_.template target<function_type>();
         if (result)
             return cpp_function(*result, policy).release();
-        else
-            return cpp_function(std::forward<Func>(f_), policy).release();
+        return cpp_function(std::forward<Func>(f_), policy).release();
-    PYBIND11_TYPE_CASTER(type, _("Callable[[") + concat(make_caster<Args>::name...) + _("], ")
-                               + make_caster<retval_type>::name + _("]"));
+    PYBIND11_TYPE_CASTER(type, const_name("Callable[[") + concat(make_caster<Args>::name...) + const_name("], ")
+                               + make_caster<retval_type>::name + const_name("]"));
diff --git a/wrap/pybind11/include/pybind11/gil.h b/wrap/pybind11/include/pybind11/gil.h
new file mode 100644
index 0000000000..b73aaa3f54
--- /dev/null
+++ b/wrap/pybind11/include/pybind11/gil.h
@@ -0,0 +1,193 @@
+    pybind11/gil.h: RAII helpers for managing the GIL
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+#pragma once
+#include "detail/common.h"
+#include "detail/internals.h"
+// forward declarations
+PyThreadState *get_thread_state_unchecked();
+#if defined(WITH_THREAD) && !defined(PYPY_VERSION)
+/* The functions below essentially reproduce the PyGILState_* API using a RAII
+ * pattern, but there are a few important differences:
+ *
+ * 1. When acquiring the GIL from an non-main thread during the finalization
+ *    phase, the GILState API blindly terminates the calling thread, which
+ *    is often not what is wanted. This API does not do this.
+ *
+ * 2. The gil_scoped_release function can optionally cut the relationship
+ *    of a PyThreadState and its associated thread, which allows moving it to
+ *    another thread (this is a fairly rare/advanced use case).
+ *
+ * 3. The reference count of an acquired thread state can be controlled. This
+ *    can be handy to prevent cases where callbacks issued from an external
+ *    thread would otherwise constantly construct and destroy thread state data
+ *    structures.
+ *
+ * See the Python bindings of NanoGUI (http://github.com/wjakob/nanogui) for an
+ * example which uses features 2 and 3 to migrate the Python thread of
+ * execution to another thread (to run the event loop on the original thread,
+ * in this case).
+ */
+class gil_scoped_acquire {
+    PYBIND11_NOINLINE gil_scoped_acquire() {
+        auto &internals = detail::get_internals();
+        tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate);
+        if (!tstate) {
+            /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if
+               calling from a Python thread). Since we use a different key, this ensures
+               we don't create a new thread state and deadlock in PyEval_AcquireThread
+               below. Note we don't save this state with internals.tstate, since we don't
+               create it we would fail to clear it (its reference count should be > 0). */
+            tstate = PyGILState_GetThisThreadState();
+        }
+        if (!tstate) {
+            tstate = PyThreadState_New(internals.istate);
+            #if !defined(NDEBUG)
+                if (!tstate)
+                    pybind11_fail("scoped_acquire: could not create thread state!");
+            #endif
+            tstate->gilstate_counter = 0;
+            PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate);
+        } else {
+            release = detail::get_thread_state_unchecked() != tstate;
+        }
+        if (release) {
+            PyEval_AcquireThread(tstate);
+        }
+        inc_ref();
+    }
+    void inc_ref() {
+        ++tstate->gilstate_counter;
+    }
+    PYBIND11_NOINLINE void dec_ref() {
+        --tstate->gilstate_counter;
+        #if !defined(NDEBUG)
+            if (detail::get_thread_state_unchecked() != tstate)
+                pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!");
+            if (tstate->gilstate_counter < 0)
+                pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!");
+        #endif
+        if (tstate->gilstate_counter == 0) {
+            #if !defined(NDEBUG)
+                if (!release)
+                    pybind11_fail("scoped_acquire::dec_ref(): internal error!");
+            #endif
+            PyThreadState_Clear(tstate);
+            if (active)
+                PyThreadState_DeleteCurrent();
+            PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate);
+            release = false;
+        }
+    }
+    /// This method will disable the PyThreadState_DeleteCurrent call and the
+    /// GIL won't be acquired. This method should be used if the interpreter
+    /// could be shutting down when this is called, as thread deletion is not
+    /// allowed during shutdown. Check _Py_IsFinalizing() on Python 3.7+, and
+    /// protect subsequent code.
+    PYBIND11_NOINLINE void disarm() {
+        active = false;
+    }
+    PYBIND11_NOINLINE ~gil_scoped_acquire() {
+        dec_ref();
+        if (release)
+           PyEval_SaveThread();
+    }
+    PyThreadState *tstate = nullptr;
+    bool release = true;
+    bool active = true;
+class gil_scoped_release {
+    explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) {
+        // `get_internals()` must be called here unconditionally in order to initialize
+        // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an
+        // initialization race could occur as multiple threads try `gil_scoped_acquire`.
+        auto &internals = detail::get_internals();
+        tstate = PyEval_SaveThread();
+        if (disassoc) {
+            auto key = internals.tstate;
+            PYBIND11_TLS_DELETE_VALUE(key);
+        }
+    }
+    /// This method will disable the PyThreadState_DeleteCurrent call and the
+    /// GIL won't be acquired. This method should be used if the interpreter
+    /// could be shutting down when this is called, as thread deletion is not
+    /// allowed during shutdown. Check _Py_IsFinalizing() on Python 3.7+, and
+    /// protect subsequent code.
+    PYBIND11_NOINLINE void disarm() {
+        active = false;
+    }
+    ~gil_scoped_release() {
+        if (!tstate)
+            return;
+        // `PyEval_RestoreThread()` should not be called if runtime is finalizing
+        if (active)
+            PyEval_RestoreThread(tstate);
+        if (disassoc) {
+            auto key = detail::get_internals().tstate;
+            PYBIND11_TLS_REPLACE_VALUE(key, tstate);
+        }
+    }
+    PyThreadState *tstate;
+    bool disassoc;
+    bool active = true;
+#elif defined(PYPY_VERSION)
+class gil_scoped_acquire {
+    PyGILState_STATE state;
+    gil_scoped_acquire() { state = PyGILState_Ensure(); }
+    ~gil_scoped_acquire() { PyGILState_Release(state); }
+    void disarm() {}
+class gil_scoped_release {
+    PyThreadState *state;
+    gil_scoped_release() { state = PyEval_SaveThread(); }
+    ~gil_scoped_release() { PyEval_RestoreThread(state); }
+    void disarm() {}
+class gil_scoped_acquire {
+    void disarm() {}
+class gil_scoped_release {
+    void disarm() {}
diff --git a/wrap/pybind11/include/pybind11/iostream.h b/wrap/pybind11/include/pybind11/iostream.h
index 48479f2d17..95449a07ba 100644
--- a/wrap/pybind11/include/pybind11/iostream.h
+++ b/wrap/pybind11/include/pybind11/iostream.h
@@ -5,17 +5,31 @@
     All rights reserved. Use of this source code is governed by a
     BSD-style license that can be found in the LICENSE file.
+    WARNING: The implementation in this file is NOT thread safe. Multiple
+    threads writing to a redirected ostream concurrently cause data races
+    and potentially buffer overflows. Therefore it is currently a requirement
+    that all (possibly) concurrent redirected ostream writes are protected by
+    a mutex.
+    #HelpAppreciated: Work on iostream.h thread safety.
+    For more background see the discussions under
+    https://github.com/pybind/pybind11/pull/2982 and
+    https://github.com/pybind/pybind11/pull/2995.
 #pragma once
 #include "pybind11.h"
-#include <streambuf>
+#include <algorithm>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <memory>
 #include <ostream>
+#include <streambuf>
 #include <string>
-#include <memory>
-#include <iostream>
+#include <utility>
@@ -38,21 +52,68 @@ class pythonbuf : public std::streambuf {
         return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
-    // This function must be non-virtual to be called in a destructor. If the
-    // rare MSVC test failure shows up with this version, then this should be
-    // simplified to a fully qualified call.
-    int _sync() {
-        if (pbase() != pptr()) {
-            // This subtraction cannot be negative, so dropping the sign
-            str line(pbase(), static_cast<size_t>(pptr() - pbase()));
+    // Computes how many bytes at the end of the buffer are part of an
+    // incomplete sequence of UTF-8 bytes.
+    // Precondition: pbase() < pptr()
+    size_t utf8_remainder() const {
+        const auto rbase = std::reverse_iterator<char *>(pbase());
+        const auto rpptr = std::reverse_iterator<char *>(pptr());
+        auto is_ascii = [](char c) {
+            return (static_cast<unsigned char>(c) & 0x80) == 0x00;
+        };
+        auto is_leading = [](char c) {
+            return (static_cast<unsigned char>(c) & 0xC0) == 0xC0;
+        };
+        auto is_leading_2b = [](char c) {
+            return static_cast<unsigned char>(c) <= 0xDF;
+        };
+        auto is_leading_3b = [](char c) {
+            return static_cast<unsigned char>(c) <= 0xEF;
+        };
+        // If the last character is ASCII, there are no incomplete code points
+        if (is_ascii(*rpptr))
+            return 0;
+        // Otherwise, work back from the end of the buffer and find the first
+        // UTF-8 leading byte
+        const auto rpend   = rbase - rpptr >= 3 ? rpptr + 3 : rbase;
+        const auto leading = std::find_if(rpptr, rpend, is_leading);
+        if (leading == rbase)
+            return 0;
+        const auto dist    = static_cast<size_t>(leading - rpptr);
+        size_t remainder   = 0;
+        if (dist == 0)
+            remainder = 1; // 1-byte code point is impossible
+        else if (dist == 1)
+            remainder = is_leading_2b(*leading) ? 0 : dist + 1;
+        else if (dist == 2)
+            remainder = is_leading_3b(*leading) ? 0 : dist + 1;
+        // else if (dist >= 3), at least 4 bytes before encountering an UTF-8
+        // leading byte, either no remainder or invalid UTF-8.
+        // Invalid UTF-8 will cause an exception later when converting
+        // to a Python string, so that's not handled here.
+        return remainder;
+    }
-            {
-                gil_scoped_acquire tmp;
+    // This function must be non-virtual to be called in a destructor.
+    int _sync() {
+        if (pbase() != pptr()) { // If buffer is not empty
+            gil_scoped_acquire tmp;
+            // This subtraction cannot be negative, so dropping the sign.
+            auto size        = static_cast<size_t>(pptr() - pbase());
+            size_t remainder = utf8_remainder();
+            if (size > remainder) {
+                str line(pbase(), size - remainder);
+            // Copy the remainder at the end of the buffer to the beginning:
+            if (remainder > 0)
+                std::memmove(pbase(), pptr() - remainder, remainder);
             setp(pbase(), epptr());
+            pbump(static_cast<int>(remainder));
         return 0;
@@ -62,11 +123,8 @@ class pythonbuf : public std::streambuf {
-    pythonbuf(object pyostream, size_t buffer_size = 1024)
-        : buf_size(buffer_size),
-          d_buffer(new char[buf_size]),
-          pywrite(pyostream.attr("write")),
+    explicit pythonbuf(const object &pyostream, size_t buffer_size = 1024)
+        : buf_size(buffer_size), d_buffer(new char[buf_size]), pywrite(pyostream.attr("write")),
           pyflush(pyostream.attr("flush")) {
         setp(d_buffer.get(), d_buffer.get() + buf_size - 1);
@@ -103,7 +161,7 @@ PYBIND11_NAMESPACE_END(detail)
             py::scoped_ostream_redirect output{std::cerr, py::module::import("sys").attr("stderr")};
-            std::cerr << "Hello, World!";
+            std::cout << "Hello, World!";
  \endrst */
 class scoped_ostream_redirect {
@@ -113,9 +171,9 @@ class scoped_ostream_redirect {
     detail::pythonbuf buffer;
-    scoped_ostream_redirect(
-            std::ostream &costream = std::cout,
-            object pyostream = module::import("sys").attr("stdout"))
+    explicit scoped_ostream_redirect(std::ostream &costream = std::cout,
+                                     const object &pyostream
+                                     = module_::import("sys").attr("stdout"))
         : costream(costream), buffer(pyostream) {
         old = costream.rdbuf(&buffer);
@@ -144,10 +202,10 @@ class scoped_ostream_redirect {
 \endrst */
 class scoped_estream_redirect : public scoped_ostream_redirect {
-    scoped_estream_redirect(
-            std::ostream &costream = std::cerr,
-            object pyostream = module::import("sys").attr("stderr"))
-        : scoped_ostream_redirect(costream,pyostream) {}
+    explicit scoped_estream_redirect(std::ostream &costream = std::cerr,
+                                     const object &pyostream
+                                     = module_::import("sys").attr("stderr"))
+        : scoped_ostream_redirect(costream, pyostream) {}
@@ -161,7 +219,7 @@ class OstreamRedirect {
     std::unique_ptr<scoped_estream_redirect> redirect_stderr;
-    OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
+    explicit OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
         : do_stdout_(do_stdout), do_stderr_(do_stderr) {}
     void enter() {
@@ -206,11 +264,12 @@ PYBIND11_NAMESPACE_END(detail)
  \endrst */
-inline class_<detail::OstreamRedirect> add_ostream_redirect(module m, std::string name = "ostream_redirect") {
-    return class_<detail::OstreamRedirect>(m, name.c_str(), module_local())
-        .def(init<bool,bool>(), arg("stdout")=true, arg("stderr")=true)
+inline class_<detail::OstreamRedirect>
+add_ostream_redirect(module_ m, const std::string &name = "ostream_redirect") {
+    return class_<detail::OstreamRedirect>(std::move(m), name.c_str(), module_local())
+        .def(init<bool, bool>(), arg("stdout") = true, arg("stderr") = true)
         .def("__enter__", &detail::OstreamRedirect::enter)
-        .def("__exit__", [](detail::OstreamRedirect &self_, args) { self_.exit(); });
+        .def("__exit__", [](detail::OstreamRedirect &self_, const args &) { self_.exit(); });
diff --git a/wrap/pybind11/include/pybind11/numpy.h b/wrap/pybind11/include/pybind11/numpy.h
index 03e1ed61ed..95a743acec 100644
--- a/wrap/pybind11/include/pybind11/numpy.h
+++ b/wrap/pybind11/include/pybind11/numpy.h
@@ -20,20 +20,18 @@
 #include <sstream>
 #include <string>
 #include <functional>
+#include <type_traits>
 #include <utility>
 #include <vector>
 #include <typeindex>
-#if defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
 /* This will be true on all flat address space platforms and allows us to reduce the
    whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for all size
    and dimension types (e.g. shape, strides, indexing), instead of inflicting this
    upon the library user. */
-static_assert(sizeof(ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t");
+static_assert(sizeof(::pybind11::ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t");
+static_assert(std::is_signed<Py_intptr_t>::value, "Py_intptr_t must be signed");
+// We now can reinterpret_cast between py::ssize_t and Py_intptr_t (MSVC + PyPy cares)
@@ -41,7 +39,7 @@ class array; // Forward declaration
-template <> struct handle_type_name<array> { static constexpr auto name = _("numpy.ndarray"); };
+template <> struct handle_type_name<array> { static constexpr auto name = const_name("numpy.ndarray"); };
 template <typename type, typename SFINAE = void> struct npy_format_descriptor;
@@ -101,7 +99,7 @@ struct numpy_internals {
-inline PYBIND11_NOINLINE void load_numpy_internals(numpy_internals* &ptr) {
+PYBIND11_NOINLINE void load_numpy_internals(numpy_internals* &ptr) {
     ptr = &get_or_create_shared_data<numpy_internals>("_numpy_internals");
@@ -161,10 +159,10 @@ struct npy_api {
-    typedef struct {
+    struct PyArray_Dims {
         Py_intptr_t *ptr;
         int len;
-    } PyArray_Dims;
+    };
     static npy_api& get() {
         static npy_api api = lookup();
@@ -172,10 +170,10 @@ struct npy_api {
     bool PyArray_Check_(PyObject *obj) const {
-        return (bool) PyObject_TypeCheck(obj, PyArray_Type_);
+        return PyObject_TypeCheck(obj, PyArray_Type_) != 0;
     bool PyArrayDescr_Check_(PyObject *obj) const {
-        return (bool) PyObject_TypeCheck(obj, PyArrayDescr_Type_);
+        return PyObject_TypeCheck(obj, PyArrayDescr_Type_) != 0;
     unsigned int (*PyArray_GetNDArrayCFeatureVersion_)();
@@ -200,6 +198,9 @@ struct npy_api {
     // Unused. Not removed because that affects ABI of the class.
     int (*PyArray_SetBaseObject_)(PyObject *, PyObject *);
     PyObject* (*PyArray_Resize_)(PyObject*, PyArray_Dims*, int, int);
+    PyObject* (*PyArray_Newshape_)(PyObject*, PyArray_Dims*, int);
+    PyObject* (*PyArray_View_)(PyObject*, PyObject*, PyObject*);
     enum functions {
         API_PyArray_GetNDArrayCFeatureVersion = 211,
@@ -214,15 +215,17 @@ struct npy_api {
         API_PyArray_NewCopy = 85,
         API_PyArray_NewFromDescr = 94,
         API_PyArray_DescrNewFromType = 96,
+        API_PyArray_Newshape = 135,
+        API_PyArray_Squeeze = 136,
+        API_PyArray_View = 137,
         API_PyArray_DescrConverter = 174,
         API_PyArray_EquivTypes = 182,
         API_PyArray_GetArrayParamsFromObject = 278,
-        API_PyArray_Squeeze = 136,
         API_PyArray_SetBaseObject = 282
     static npy_api lookup() {
-        module_ m = module::import("numpy.core.multiarray");
+        module_ m = module_::import("numpy.core.multiarray");
         auto c = m.attr("_ARRAY_API");
         void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), NULL);
@@ -245,11 +248,14 @@ struct npy_api {
+        DECL_NPY_API(PyArray_Newshape);
+        DECL_NPY_API(PyArray_Squeeze);
+        DECL_NPY_API(PyArray_View);
-        DECL_NPY_API(PyArray_Squeeze);
 #undef DECL_NPY_API
         return api;
@@ -284,7 +290,7 @@ template <typename T> struct array_info_scalar {
     using type = T;
     static constexpr bool is_array = false;
     static constexpr bool is_empty = false;
-    static constexpr auto extents = _("");
+    static constexpr auto extents = const_name("");
     static void append_extents(list& /* shape */) { }
 // Computes underlying type and a comma-separated list of extents for array
@@ -303,8 +309,8 @@ template <typename T, size_t N> struct array_info<std::array<T, N>> {
-    static constexpr auto extents = _<array_info<T>::is_array>(
-        concat(_<N>(), array_info<T>::extents), _<N>()
+    static constexpr auto extents = const_name<array_info<T>::is_array>(
+        concat(const_name<N>(), array_info<T>::extents), const_name<N>()
 // For numpy we have special handling for arrays of characters, so we don't include
@@ -316,18 +322,23 @@ template <typename T> using remove_all_extents_t = typename array_info<T>::type;
 template <typename T> using is_pod_struct = all_of<
     std::is_standard_layout<T>,     // since we're accessing directly in memory we need a standard layout type
-#if !defined(__GNUG__) || defined(_LIBCPP_VERSION) || defined(_GLIBCXX_USE_CXX11_ABI)
-    // _GLIBCXX_USE_CXX11_ABI indicates that we're using libstdc++ from GCC 5 or newer, independent
-    // of the actual compiler (Clang can also use libstdc++, but it always defines __GNUC__ == 4).
-    std::is_trivially_copyable<T>,
-    // GCC 4 doesn't implement is_trivially_copyable, so approximate it
+#if defined(__GLIBCXX__) && (__GLIBCXX__ < 20150422 || __GLIBCXX__ == 20150426 || __GLIBCXX__ == 20150623 || __GLIBCXX__ == 20150626 || __GLIBCXX__ == 20160803)
+    // libstdc++ < 5 (including versions 4.8.5, 4.9.3 and 4.9.4 which were released after 5)
+    // don't implement is_trivially_copyable, so approximate it
     satisfies_any_of<T, std::has_trivial_copy_constructor, std::has_trivial_copy_assign>,
+    std::is_trivially_copyable<T>,
     satisfies_none_of<T, std::is_reference, std::is_array, is_std_array, std::is_arithmetic, is_complex, std::is_enum>
+// Replacement for std::is_pod (deprecated in C++20)
+template <typename T> using is_pod = all_of<
+    std::is_standard_layout<T>,
+    std::is_trivial<T>
 template <ssize_t Dim = 0, typename Strides> ssize_t byte_offset_unsafe(const Strides &) { return 0; }
 template <ssize_t Dim = 0, typename Strides, typename... Ix>
 ssize_t byte_offset_unsafe(const Strides &strides, ssize_t i, Ix... index) {
@@ -419,6 +430,10 @@ class unchecked_mutable_reference : public unchecked_reference<T, Dims> {
     using ConstBase::ConstBase;
     using ConstBase::Dynamic;
+    // Bring in const-qualified versions from base class
+    using ConstBase::operator();
+    using ConstBase::operator[];
     /// Mutable, unchecked access to data at the given indices.
     template <typename... Ix> T& operator()(Ix... index) {
         static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
@@ -453,28 +468,30 @@ class dtype : public object {
     explicit dtype(const buffer_info &info) {
         dtype descr(_dtype_from_pep3118()(PYBIND11_STR_TYPE(info.format)));
         // If info.itemsize == 0, use the value calculated from the format string
-        m_ptr = descr.strip_padding(info.itemsize ? info.itemsize : descr.itemsize()).release().ptr();
+        m_ptr = descr.strip_padding(info.itemsize != 0 ? info.itemsize : descr.itemsize())
+                    .release()
+                    .ptr();
     explicit dtype(const std::string &format) {
         m_ptr = from_args(pybind11::str(format)).release().ptr();
-    dtype(const char *format) : dtype(std::string(format)) { }
+    explicit dtype(const char *format) : dtype(std::string(format)) {}
     dtype(list names, list formats, list offsets, ssize_t itemsize) {
         dict args;
-        args["names"] = names;
-        args["formats"] = formats;
-        args["offsets"] = offsets;
+        args["names"] = std::move(names);
+        args["formats"] = std::move(formats);
+        args["offsets"] = std::move(offsets);
         args["itemsize"] = pybind11::int_(itemsize);
-        m_ptr = from_args(args).release().ptr();
+        m_ptr = from_args(std::move(args)).release().ptr();
     /// This is essentially the same as calling numpy.dtype(args) in Python.
     static dtype from_args(object args) {
         PyObject *ptr = nullptr;
-        if (!detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) || !ptr)
+        if ((detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) == 0) || !ptr)
             throw error_already_set();
         return reinterpret_steal<dtype>(ptr);
@@ -494,14 +511,24 @@ class dtype : public object {
         return detail::array_descriptor_proxy(m_ptr)->names != nullptr;
-    /// Single-character type code.
+    /// Single-character code for dtype's kind.
+    /// For example, floating point types are 'f' and integral types are 'i'.
     char kind() const {
         return detail::array_descriptor_proxy(m_ptr)->kind;
+    /// Single-character for dtype's type.
+    /// For example, ``float`` is 'f', ``double`` 'd', ``int`` 'i', and ``long`` 'l'.
+    char char_() const {
+        // Note: The signature, `dtype::char_` follows the naming of NumPy's
+        // public Python API (i.e., ``dtype.char``), rather than its internal
+        // C API (``PyArray_Descr::type``).
+        return detail::array_descriptor_proxy(m_ptr)->type;
+    }
     static object _dtype_from_pep3118() {
-        static PyObject *obj = module::import("numpy.core._internal")
+        static PyObject *obj = module_::import("numpy.core._internal")
         return reinterpret_borrow<object>(obj);
@@ -520,7 +547,7 @@ class dtype : public object {
             auto name = spec[0].cast<pybind11::str>();
             auto format = spec[1].cast<tuple>()[0].cast<dtype>();
             auto offset = spec[1].cast<tuple>()[1].cast<pybind11::int_>();
-            if (!len(name) && format.kind() == 'V')
+            if ((len(name) == 0u) && format.kind() == 'V')
             field_descriptors.push_back({(PYBIND11_STR_TYPE) name, format.strip_padding(format.itemsize()), offset});
@@ -536,7 +563,7 @@ class dtype : public object {
-        return dtype(names, formats, offsets, itemsize);
+        return dtype(std::move(names), std::move(formats), std::move(offsets), itemsize);
@@ -560,7 +587,7 @@ class array : public buffer {
           const void *ptr = nullptr, handle base = handle()) {
         if (strides->empty())
-            *strides = c_strides(*shape, dt.itemsize());
+            *strides = detail::c_strides(*shape, dt.itemsize());
         auto ndim = shape->size();
         if (ndim != strides->size())
@@ -579,7 +606,10 @@ class array : public buffer {
         auto &api = detail::npy_api::get();
         auto tmp = reinterpret_steal<object>(api.PyArray_NewFromDescr_(
-            api.PyArray_Type_, descr.release().ptr(), (int) ndim, shape->data(), strides->data(),
+            api.PyArray_Type_, descr.release().ptr(), (int) ndim,
+            // Use reinterpret_cast for PyPy on Windows (remove if fixed, checked on 7.3.1)
+            reinterpret_cast<Py_intptr_t*>(shape->data()),
+            reinterpret_cast<Py_intptr_t*>(strides->data()),
             const_cast<void *>(ptr), flags, nullptr));
         if (!tmp)
             throw error_already_set();
@@ -720,7 +750,7 @@ class array : public buffer {
      * and the caller must take care not to access invalid dimensions or dimension indices.
     template <typename T, ssize_t Dims = -1> detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
-        if (Dims >= 0 && ndim() != Dims)
+        if (PYBIND11_SILENCE_MSVC_C4127(Dims >= 0) && ndim() != Dims)
             throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
                     "; expected " + std::to_string(Dims));
         return detail::unchecked_mutable_reference<T, Dims>(mutable_data(), shape(), strides(), ndim());
@@ -734,7 +764,7 @@ class array : public buffer {
      * invalid dimensions or dimension indices.
     template <typename T, ssize_t Dims = -1> detail::unchecked_reference<T, Dims> unchecked() const & {
-        if (Dims >= 0 && ndim() != Dims)
+        if (PYBIND11_SILENCE_MSVC_C4127(Dims >= 0) && ndim() != Dims)
             throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
                     "; expected " + std::to_string(Dims));
         return detail::unchecked_reference<T, Dims>(data(), shape(), strides(), ndim());
@@ -751,16 +781,45 @@ class array : public buffer {
     /// then resize will succeed only if it makes a reshape, i.e. original size doesn't change
     void resize(ShapeContainer new_shape, bool refcheck = true) {
         detail::npy_api::PyArray_Dims d = {
-            new_shape->data(), int(new_shape->size())
+            // Use reinterpret_cast for PyPy on Windows (remove if fixed, checked on 7.3.1)
+            reinterpret_cast<Py_intptr_t*>(new_shape->data()),
+            int(new_shape->size())
         // try to resize, set ordering param to -1 cause it's not used anyway
-        object new_array = reinterpret_steal<object>(
+        auto new_array = reinterpret_steal<object>(
             detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1)
         if (!new_array) throw error_already_set();
         if (isinstance<array>(new_array)) { *this = std::move(new_array); }
+    /// Optional `order` parameter omitted, to be added as needed.
+    array reshape(ShapeContainer new_shape) {
+        detail::npy_api::PyArray_Dims d
+            = {reinterpret_cast<Py_intptr_t *>(new_shape->data()), int(new_shape->size())};
+        auto new_array
+            = reinterpret_steal<array>(detail::npy_api::get().PyArray_Newshape_(m_ptr, &d, 0));
+        if (!new_array) {
+            throw error_already_set();
+        }
+        return new_array;
+    }
+    /// Create a view of an array in a different data type.
+    /// This function may fundamentally reinterpret the data in the array.
+    /// It is the responsibility of the caller to ensure that this is safe.
+    /// Only supports the `dtype` argument, the `type` argument is omitted,
+    /// to be added as needed.
+    array view(const std::string &dtype) {
+        auto &api = detail::npy_api::get();
+        auto new_view = reinterpret_steal<array>(api.PyArray_View_(
+            m_ptr, dtype::from_args(pybind11::str(dtype)).release().ptr(), nullptr));
+        if (!new_view) {
+            throw error_already_set();
+        }
+        return new_view;
+    }
     /// Ensure that the argument is a NumPy array
     /// In case of an error, nullptr is returned and the Python error is cleared.
     static array ensure(handle h, int ExtraFlags = 0) {
@@ -788,25 +847,6 @@ class array : public buffer {
             throw std::domain_error("array is not writeable");
-    // Default, C-style strides
-    static std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
-        auto ndim = shape.size();
-        std::vector<ssize_t> strides(ndim, itemsize);
-        if (ndim > 0)
-            for (size_t i = ndim - 1; i > 0; --i)
-                strides[i - 1] = strides[i] * shape[i];
-        return strides;
-    }
-    // F-style strides; default when constructing an array_t with `ExtraFlags & f_style`
-    static std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
-        auto ndim = shape.size();
-        std::vector<ssize_t> strides(ndim, itemsize);
-        for (size_t i = 1; i < ndim; ++i)
-            strides[i] = strides[i - 1] * shape[i - 1];
-        return strides;
-    }
     template<typename... Ix> void check_dimensions(Ix... index) const {
         check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...);
@@ -854,6 +894,7 @@ template <typename T, int ExtraFlags = array::forcecast> class array_t : public
         if (!is_borrowed) Py_XDECREF(h.ptr());
+    // NOLINTNEXTLINE(google-explicit-constructor)
     array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) {
         if (!m_ptr) throw error_already_set();
@@ -864,9 +905,12 @@ template <typename T, int ExtraFlags = array::forcecast> class array_t : public
         : array(std::move(shape), std::move(strides), ptr, base) { }
     explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle())
-        : array_t(private_ctor{}, std::move(shape),
-                ExtraFlags & f_style ? f_strides(*shape, itemsize()) : c_strides(*shape, itemsize()),
-                ptr, base) { }
+        : array_t(private_ctor{},
+                  std::move(shape),
+                  (ExtraFlags & f_style) != 0 ? detail::f_strides(*shape, itemsize())
+                                              : detail::c_strides(*shape, itemsize()),
+                  ptr,
+                  base) {}
     explicit array_t(ssize_t count, const T *ptr = nullptr, handle base = handle())
         : array({count}, {}, ptr, base) { }
@@ -977,7 +1021,7 @@ template <typename T>
 struct format_descriptor<T, detail::enable_if_t<detail::array_info<T>::is_array>> {
     static std::string format() {
         using namespace detail;
-        static constexpr auto extents = _("(") + array_info<T>::extents + _(")");
+        static constexpr auto extents = const_name("(") + array_info<T>::extents + const_name(")");
         return extents.text + format_descriptor<remove_all_extents_t<T>>::format();
@@ -1012,23 +1056,28 @@ struct npy_format_descriptor_name;
 template <typename T>
 struct npy_format_descriptor_name<T, enable_if_t<std::is_integral<T>::value>> {
-    static constexpr auto name = _<std::is_same<T, bool>::value>(
-        _("bool"), _<std::is_signed<T>::value>("numpy.int", "numpy.uint") + _<sizeof(T)*8>()
+    static constexpr auto name = const_name<std::is_same<T, bool>::value>(
+        const_name("bool"), const_name<std::is_signed<T>::value>("numpy.int", "numpy.uint") + const_name<sizeof(T)*8>()
 template <typename T>
 struct npy_format_descriptor_name<T, enable_if_t<std::is_floating_point<T>::value>> {
-    static constexpr auto name = _<std::is_same<T, float>::value || std::is_same<T, double>::value>(
-        _("numpy.float") + _<sizeof(T)*8>(), _("numpy.longdouble")
+    static constexpr auto name = const_name<std::is_same<T, float>::value
+                                   || std::is_same<T, const float>::value
+                                   || std::is_same<T, double>::value
+                                   || std::is_same<T, const double>::value>(
+        const_name("numpy.float") + const_name<sizeof(T)*8>(), const_name("numpy.longdouble")
 template <typename T>
 struct npy_format_descriptor_name<T, enable_if_t<is_complex<T>::value>> {
-    static constexpr auto name = _<std::is_same<typename T::value_type, float>::value
-                                   || std::is_same<typename T::value_type, double>::value>(
-        _("numpy.complex") + _<sizeof(typename T::value_type)*16>(), _("numpy.longcomplex")
+    static constexpr auto name = const_name<std::is_same<typename T::value_type, float>::value
+                                   || std::is_same<typename T::value_type, const float>::value
+                                   || std::is_same<typename T::value_type, double>::value
+                                   || std::is_same<typename T::value_type, const double>::value>(
+        const_name("numpy.complex") + const_name<sizeof(typename T::value_type)*16>(), const_name("numpy.longcomplex")
@@ -1056,7 +1105,7 @@ struct npy_format_descriptor<T, enable_if_t<satisfies_any_of<T, std::is_arithmet
-    static constexpr auto name = _("S") + _<N>(); \
+    static constexpr auto name = const_name("S") + const_name<N>(); \
     static pybind11::dtype dtype() { return pybind11::dtype(std::string("S") + std::to_string(N)); }
 template <size_t N> struct npy_format_descriptor<char[N]> { PYBIND11_DECL_CHAR_FMT };
 template <size_t N> struct npy_format_descriptor<std::array<char, N>> { PYBIND11_DECL_CHAR_FMT };
@@ -1068,7 +1117,7 @@ template<typename T> struct npy_format_descriptor<T, enable_if_t<array_info<T>::
     static_assert(!array_info<T>::is_empty, "Zero-sized arrays are not supported");
-    static constexpr auto name = _("(") + array_info<T>::extents + _(")") + base_descr::name;
+    static constexpr auto name = const_name("(") + array_info<T>::extents + const_name(")") + base_descr::name;
     static pybind11::dtype dtype() {
         list shape;
@@ -1092,7 +1141,7 @@ struct field_descriptor {
     dtype descr;
-inline PYBIND11_NOINLINE void register_structured_dtype(
+PYBIND11_NOINLINE void register_structured_dtype(
     any_container<field_descriptor> fields,
     const std::type_info& tinfo, ssize_t itemsize,
     bool (*direct_converter)(PyObject *, void *&)) {
@@ -1116,7 +1165,10 @@ inline PYBIND11_NOINLINE void register_structured_dtype(
-    auto dtype_ptr = pybind11::dtype(names, formats, offsets, itemsize).release().ptr();
+    auto dtype_ptr
+        = pybind11::dtype(std::move(names), std::move(formats), std::move(offsets), itemsize)
+              .release()
+              .ptr();
     // There is an existing bug in NumPy (as of v1.11): trailing bytes are
     // not encoded explicitly into the format string. This will supposedly
@@ -1270,26 +1322,13 @@ template <typename T, typename SFINAE> struct npy_format_descriptor {
 #endif // __CLION_IDE__
-template  <class T>
-using array_iterator = typename std::add_pointer<T>::type;
-template <class T>
-array_iterator<T> array_begin(const buffer_info& buffer) {
-    return array_iterator<T>(reinterpret_cast<T*>(buffer.ptr));
-template <class T>
-array_iterator<T> array_end(const buffer_info& buffer) {
-    return array_iterator<T>(reinterpret_cast<T*>(buffer.ptr) + buffer.size);
 class common_iterator {
     using container_type = std::vector<ssize_t>;
     using value_type = container_type::value_type;
     using size_type = container_type::size_type;
-    common_iterator() : p_ptr(0), m_strides() {}
+    common_iterator() : m_strides() {}
     common_iterator(void* ptr, const container_type& strides, const container_type& shape)
         : p_ptr(reinterpret_cast<char*>(ptr)), m_strides(strides.size()) {
@@ -1310,7 +1349,7 @@ class common_iterator {
-    char* p_ptr;
+    char *p_ptr{0};
     container_type m_strides;
@@ -1338,9 +1377,8 @@ template <size_t N> class multi_array_iterator {
             if (++m_index[i] != m_shape[i]) {
-            } else {
-                m_index[i] = 0;
+            m_index[i] = 0;
         return *this;
@@ -1474,7 +1512,7 @@ struct vectorize_arg {
     using call_type = remove_reference_t<T>;
     // Is this a vectorized argument?
     static constexpr bool vectorize =
-        satisfies_any_of<call_type, std::is_arithmetic, is_complex, std::is_pod>::value &&
+        satisfies_any_of<call_type, std::is_arithmetic, is_complex, is_pod>::value &&
         satisfies_none_of<call_type, std::is_pointer, std::is_array, is_std_array, std::is_enum>::value &&
         (!std::is_reference<T>::value ||
          (std::is_lvalue_reference<T>::value && std::is_const<call_type>::value));
@@ -1482,6 +1520,55 @@ struct vectorize_arg {
     using type = conditional_t<vectorize, array_t<remove_cv_t<call_type>, array::forcecast>, T>;
+// py::vectorize when a return type is present
+template <typename Func, typename Return, typename... Args>
+struct vectorize_returned_array {
+    using Type = array_t<Return>;
+    static Type create(broadcast_trivial trivial, const std::vector<ssize_t> &shape) {
+        if (trivial == broadcast_trivial::f_trivial)
+            return array_t<Return, array::f_style>(shape);
+        return array_t<Return>(shape);
+    }
+    static Return *mutable_data(Type &array) {
+        return array.mutable_data();
+    }
+    static Return call(Func &f, Args &... args) {
+        return f(args...);
+    }
+    static void call(Return *out, size_t i, Func &f, Args &... args) {
+        out[i] = f(args...);
+    }
+// py::vectorize when a return type is not present
+template <typename Func, typename... Args>
+struct vectorize_returned_array<Func, void, Args...> {
+    using Type = none;
+    static Type create(broadcast_trivial, const std::vector<ssize_t> &) {
+        return none();
+    }
+    static void *mutable_data(Type &) {
+        return nullptr;
+    }
+    static detail::void_type call(Func &f, Args &... args) {
+        f(args...);
+        return {};
+    }
+    static void call(void *, size_t, Func &f, Args &... args) {
+        f(args...);
+    }
 template <typename Func, typename Return, typename... Args>
 struct vectorize_helper {
@@ -1498,8 +1585,11 @@ struct vectorize_helper {
             "pybind11::vectorize(...) requires a function with at least one vectorizable argument");
-    template <typename T>
-    explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) { }
+    template <typename T,
+              // SFINAE to prevent shadowing the copy constructor.
+              typename = detail::enable_if_t<
+                  !std::is_same<vectorize_helper, typename std::decay<T>::type>::value>>
+    explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) {}
     object operator()(typename vectorize_arg<Args>::type... args) {
         return run(args...,
@@ -1516,6 +1606,8 @@ struct vectorize_helper {
     using arg_call_types = std::tuple<typename vectorize_arg<Args>::call_type...>;
     template <size_t Index> using param_n_t = typename std::tuple_element<Index, arg_call_types>::type;
+    using returned_array = vectorize_returned_array<Func, Return, Args...>;
     // Runs a vectorized function given arguments tuple and three index sequences:
     //     - Index is the full set of 0 ... (N-1) argument indices;
     //     - VIndex is the subset of argument indices with vectorized parameters, letting us access
@@ -1547,20 +1639,19 @@ struct vectorize_helper {
         // not wrapped in an array).
         if (size == 1 && ndim == 0) {
             PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr);
-            return cast(f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...));
+            return cast(returned_array::call(f, *reinterpret_cast<param_n_t<Index> *>(params[Index])...));
-        array_t<Return> result;
-        if (trivial == broadcast_trivial::f_trivial) result = array_t<Return, array::f_style>(shape);
-        else result = array_t<Return>(shape);
+        auto result = returned_array::create(trivial, shape);
         if (size == 0) return std::move(result);
         /* Call the function */
+        auto mutable_data = returned_array::mutable_data(result);
         if (trivial == broadcast_trivial::non_trivial)
-            apply_broadcast(buffers, params, result, i_seq, vi_seq, bi_seq);
+            apply_broadcast(buffers, params, mutable_data, size, shape, i_seq, vi_seq, bi_seq);
-            apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq);
+            apply_trivial(buffers, params, mutable_data, size, i_seq, vi_seq, bi_seq);
         return std::move(result);
@@ -1583,7 +1674,7 @@ struct vectorize_helper {
         for (size_t i = 0; i < size; ++i) {
-            out[i] = f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...);
+            returned_array::call(out, i, f, *reinterpret_cast<param_n_t<Index> *>(params[Index])...);
             for (auto &x : vecparams) x.first += x.second;
@@ -1591,19 +1682,18 @@ struct vectorize_helper {
     template <size_t... Index, size_t... VIndex, size_t... BIndex>
     void apply_broadcast(std::array<buffer_info, NVectorized> &buffers,
                          std::array<void *, N> &params,
-                         array_t<Return> &output_array,
+                         Return *out,
+                         size_t size,
+                         const std::vector<ssize_t> &output_shape,
                          index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
-        buffer_info output = output_array.request();
-        multi_array_iterator<NVectorized> input_iter(buffers, output.shape);
+        multi_array_iterator<NVectorized> input_iter(buffers, output_shape);
-        for (array_iterator<Return> iter = array_begin<Return>(output), end = array_end<Return>(output);
-             iter != end;
-             ++iter, ++input_iter) {
+        for (size_t i = 0; i < size; ++i, ++input_iter) {
                 params[VIndex] = input_iter.template data<BIndex>()
-            *iter = f(*reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
+            returned_array::call(out, i, f, *reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
@@ -1615,7 +1705,7 @@ vectorize_extractor(const Func &f, Return (*) (Args ...)) {
 template <typename T, int Flags> struct handle_type_name<array_t<T, Flags>> {
-    static constexpr auto name = _("numpy.ndarray[") + npy_format_descriptor<T>::name + _("]");
+    static constexpr auto name = const_name("numpy.ndarray[") + npy_format_descriptor<T>::name + const_name("]");
@@ -1649,7 +1739,3 @@ Helper vectorize(Return (Class::*f)(Args...) const) {
-#if defined(_MSC_VER)
-#pragma warning(pop)
diff --git a/wrap/pybind11/include/pybind11/operators.h b/wrap/pybind11/include/pybind11/operators.h
index 086cb4cfd8..2a61531589 100644
--- a/wrap/pybind11/include/pybind11/operators.h
+++ b/wrap/pybind11/include/pybind11/operators.h
@@ -11,13 +11,6 @@
 #include "pybind11.h"
-#if defined(__clang__) && !defined(__INTEL_COMPILER)
-#  pragma clang diagnostic ignored "-Wunsequenced" // multiple unsequenced modifications to 'self' (when using def(py::self OP Type()))
-#elif defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
@@ -58,7 +51,8 @@ template <op_id id, op_type ot, typename L, typename R> struct op_ {
         using op = op_impl<id, ot, Base, L_type, R_type>;
         cl.def(op::name(), &op::execute, is_operator(), extra...);
         #if PY_MAJOR_VERSION < 3
-        if (id == op_truediv || id == op_itruediv)
+        if (PYBIND11_SILENCE_MSVC_C4127(id == op_truediv) ||
+            PYBIND11_SILENCE_MSVC_C4127(id == op_itruediv))
             cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
                     &op::execute, is_operator(), extra...);
@@ -167,7 +161,3 @@ using detail::self;
 using detail::hash;
-#if defined(_MSC_VER)
-#  pragma warning(pop)
diff --git a/wrap/pybind11/include/pybind11/pybind11.h b/wrap/pybind11/include/pybind11/pybind11.h
index f6dba4ed20..7aa93bb5aa 100644
--- a/wrap/pybind11/include/pybind11/pybind11.h
+++ b/wrap/pybind11/include/pybind11/pybind11.h
@@ -10,56 +10,84 @@
 #pragma once
-#if defined(__INTEL_COMPILER)
-#  pragma warning push
-#  pragma warning disable 68    // integer conversion resulted in a change of sign
-#  pragma warning disable 186   // pointless comparison of unsigned integer with zero
-#  pragma warning disable 878   // incompatible exception specifications
-#  pragma warning disable 1334  // the "template" keyword used for syntactic disambiguation may only be used within a template
-#  pragma warning disable 1682  // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem)
-#  pragma warning disable 1786  // function "strdup" was declared deprecated
-#  pragma warning disable 1875  // offsetof applied to non-POD (Plain Old Data) types is nonstandard
-#  pragma warning disable 2196  // warning #2196: routine is both "inline" and "noinline"
-#elif defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable: 4100) // warning C4100: Unreferenced formal parameter
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
-#  pragma warning(disable: 4512) // warning C4512: Assignment operator was implicitly defined as deleted
-#  pragma warning(disable: 4800) // warning C4800: 'int': forcing value to bool 'true' or 'false' (performance warning)
-#  pragma warning(disable: 4996) // warning C4996: The POSIX name for this item is deprecated. Instead, use the ISO C and C++ conformant name
-#  pragma warning(disable: 4702) // warning C4702: unreachable code
-#  pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified
-#elif defined(__GNUG__) && !defined(__clang__)
-#  pragma GCC diagnostic push
-#  pragma GCC diagnostic ignored "-Wunused-but-set-parameter"
-#  pragma GCC diagnostic ignored "-Wunused-but-set-variable"
-#  pragma GCC diagnostic ignored "-Wmissing-field-initializers"
-#  pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#  pragma GCC diagnostic ignored "-Wattributes"
-#  if __GNUC__ >= 7
-#    pragma GCC diagnostic ignored "-Wnoexcept-type"
-#  endif
 #include "attr.h"
+#include "gil.h"
 #include "options.h"
 #include "detail/class.h"
 #include "detail/init.h"
+#include <cstdlib>
+#include <memory>
+#include <new>
+#include <vector>
+#include <string>
+#include <utility>
+#include <cstring>
+#if defined(__cpp_lib_launder) && !(defined(_MSC_VER) && (_MSC_VER < 1914))
+#  define PYBIND11_STD_LAUNDER std::launder
 #if defined(__GNUG__) && !defined(__clang__)
 #  include <cxxabi.h>
+/* https://stackoverflow.com/questions/46798456/handling-gccs-noexcept-type-warning
+   This warning is about ABI compatibility, not code health.
+   It is only actually needed in a couple places, but apparently GCC 7 "generates this warning if
+   and only if the first template instantiation ... involves noexcept" [stackoverflow], therefore
+   it could get triggered from seemingly random places, depending on user code.
+   No other GCC version generates this warning.
+ */
+#if defined(__GNUC__) && __GNUC__ == 7
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wnoexcept-type"
+// Apply all the extensions translators from a list
+// Return true if one of the translators completed without raising an exception
+// itself. Return of false indicates that if there are other translators
+// available, they should be tried.
+inline bool apply_exception_translators(std::forward_list<ExceptionTranslator>& translators) {
+    auto last_exception = std::current_exception();
+    for (auto &translator : translators) {
+        try {
+            translator(last_exception);
+            return true;
+        } catch (...) {
+            last_exception = std::current_exception();
+        }
+    }
+    return false;
+#if defined(_MSC_VER)
+#    define PYBIND11_COMPAT_STRDUP _strdup
+#    define PYBIND11_COMPAT_STRDUP strdup
 /// Wraps an arbitrary C++ function/method/lambda function/.. into a callable Python object
 class cpp_function : public function {
     cpp_function() = default;
+    // NOLINTNEXTLINE(google-explicit-constructor)
     cpp_function(std::nullptr_t) { }
     /// Construct a cpp_function from a vanilla function pointer
     template <typename Return, typename... Args, typename... Extra>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     cpp_function(Return (*f)(Args...), const Extra&... extra) {
         initialize(f, f, extra...);
@@ -67,6 +95,7 @@ class cpp_function : public function {
     /// Construct a cpp_function from a lambda function (possibly with internal state)
     template <typename Func, typename... Extra,
               typename = detail::enable_if_t<detail::is_lambda<Func>::value>>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     cpp_function(Func &&f, const Extra&... extra) {
                    (detail::function_signature_t<Func> *) nullptr, extra...);
@@ -74,6 +103,7 @@ class cpp_function : public function {
     /// Construct a cpp_function from a class method (non-const, no ref-qualifier)
     template <typename Return, typename Class, typename... Arg, typename... Extra>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     cpp_function(Return (Class::*f)(Arg...), const Extra&... extra) {
         initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(std::forward<Arg>(args)...); },
                    (Return (*) (Class *, Arg...)) nullptr, extra...);
@@ -83,13 +113,15 @@ class cpp_function : public function {
     /// A copy of the overload for non-const functions without explicit ref-qualifier
     /// but with an added `&`.
     template <typename Return, typename Class, typename... Arg, typename... Extra>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     cpp_function(Return (Class::*f)(Arg...)&, const Extra&... extra) {
-        initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(args...); },
+        initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(std::forward<Arg>(args)...); },
                    (Return (*) (Class *, Arg...)) nullptr, extra...);
     /// Construct a cpp_function from a class method (const, no ref-qualifier)
     template <typename Return, typename Class, typename... Arg, typename... Extra>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     cpp_function(Return (Class::*f)(Arg...) const, const Extra&... extra) {
         initialize([f](const Class *c, Arg... args) -> Return { return (c->*f)(std::forward<Arg>(args)...); },
                    (Return (*)(const Class *, Arg ...)) nullptr, extra...);
@@ -99,8 +131,9 @@ class cpp_function : public function {
     /// A copy of the overload for const functions without explicit ref-qualifier
     /// but with an added `&`.
     template <typename Return, typename Class, typename... Arg, typename... Extra>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     cpp_function(Return (Class::*f)(Arg...) const&, const Extra&... extra) {
-        initialize([f](const Class *c, Arg... args) -> Return { return (c->*f)(args...); },
+        initialize([f](const Class *c, Arg... args) -> Return { return (c->*f)(std::forward<Arg>(args)...); },
                    (Return (*)(const Class *, Arg ...)) nullptr, extra...);
@@ -108,9 +141,16 @@ class cpp_function : public function {
     object name() const { return attr("__name__"); }
+    struct InitializingFunctionRecordDeleter {
+        // `destruct(function_record, false)`: `initialize_generic` copies strings and
+        // takes care of cleaning up in case of exceptions. So pass `false` to `free_strings`.
+        void operator()(detail::function_record * rec) { destruct(rec, false); }
+    };
+    using unique_function_record = std::unique_ptr<detail::function_record, InitializingFunctionRecordDeleter>;
     /// Space optimization: don't inline this frequently instantiated fragment
-    PYBIND11_NOINLINE detail::function_record *make_function_record() {
-        return new detail::function_record();
+    PYBIND11_NOINLINE unique_function_record make_function_record() {
+        return unique_function_record(new detail::function_record());
     /// Special internal constructor for functors, lambda functions, etc.
@@ -120,23 +160,38 @@ class cpp_function : public function {
         struct capture { remove_reference_t<Func> f; };
         /* Store the function including any extra state it might have (e.g. a lambda capture object) */
-        auto rec = make_function_record();
+        // The unique_ptr makes sure nothing is leaked in case of an exception.
+        auto unique_rec = make_function_record();
+        auto rec = unique_rec.get();
         /* Store the capture object directly in the function record if there is enough space */
-        if (sizeof(capture) <= sizeof(rec->data)) {
+        if (PYBIND11_SILENCE_MSVC_C4127(sizeof(capture) <= sizeof(rec->data))) {
             /* Without these pragmas, GCC warns that there might not be
                enough space to use the placement new operator. However, the
                'if' statement above ensures that this is the case. */
-#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6
+#if defined(__GNUG__) && __GNUC__ >= 6 && !defined(__clang__) && !defined(__INTEL_COMPILER)
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wplacement-new"
             new ((capture *) &rec->data) capture { std::forward<Func>(f) };
-#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6
+#if defined(__GNUG__) && __GNUC__ >= 6 && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#  pragma GCC diagnostic pop
+#if defined(__GNUG__) && !PYBIND11_HAS_STD_LAUNDER && !defined(__INTEL_COMPILER)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wstrict-aliasing"
+            // UB without std::launder, but without breaking ABI and/or
+            // a significant refactoring it's "impossible" to solve.
+            if (!std::is_trivially_destructible<capture>::value)
+                rec->free_data = [](function_record *r) {
+                    auto data = PYBIND11_STD_LAUNDER((capture *) &r->data);
+                    (void) data;
+                    data->~capture();
+                };
+#if defined(__GNUG__) && !PYBIND11_HAS_STD_LAUNDER && !defined(__INTEL_COMPILER)
 #  pragma GCC diagnostic pop
-            if (!std::is_trivially_destructible<Func>::value)
-                rec->free_data = [](function_record *r) { ((capture *) &r->data)->~capture(); };
         } else {
             rec->data[0] = new capture { std::forward<Func>(f) };
             rec->free_data = [](function_record *r) { delete ((capture *) r->data[0]); };
@@ -148,7 +203,7 @@ class cpp_function : public function {
             conditional_t<std::is_void<Return>::value, void_type, Return>
-        static_assert(expected_num_args<Extra...>(sizeof...(Args), cast_in::has_args, cast_in::has_kwargs),
+        static_assert(expected_num_args<Extra...>(sizeof...(Args), cast_in::args_pos >= 0, cast_in::has_kwargs),
                       "The number of argument annotations does not match the number of function arguments");
         /* Dispatch code which converts function arguments and performs the actual function call */
@@ -183,28 +238,36 @@ class cpp_function : public function {
             return result;
+        rec->nargs_pos = cast_in::args_pos >= 0
+            ? static_cast<std::uint16_t>(cast_in::args_pos)
+            : sizeof...(Args) - cast_in::has_kwargs; // Will get reduced more if we have a kw_only
+        rec->has_args = cast_in::args_pos >= 0;
+        rec->has_kwargs = cast_in::has_kwargs;
         /* Process any user-provided function attributes */
         process_attributes<Extra...>::init(extra..., rec);
             constexpr bool has_kw_only_args = any_of<std::is_same<kw_only, Extra>...>::value,
                            has_pos_only_args = any_of<std::is_same<pos_only, Extra>...>::value,
-                           has_args = any_of<std::is_same<args, Args>...>::value,
                            has_arg_annotations = any_of<is_keyword<Extra>...>::value;
             static_assert(has_arg_annotations || !has_kw_only_args, "py::kw_only requires the use of argument annotations");
             static_assert(has_arg_annotations || !has_pos_only_args, "py::pos_only requires the use of argument annotations (for docstrings and aligning the annotations to the argument)");
-            static_assert(!(has_args && has_kw_only_args), "py::kw_only cannot be combined with a py::args argument");
+            static_assert(constexpr_sum(is_kw_only<Extra>::value...) <= 1, "py::kw_only may be specified only once");
+            static_assert(constexpr_sum(is_pos_only<Extra>::value...) <= 1, "py::pos_only may be specified only once");
+            constexpr auto kw_only_pos = constexpr_first<is_kw_only, Extra...>();
+            constexpr auto pos_only_pos = constexpr_first<is_pos_only, Extra...>();
+            static_assert(!(has_kw_only_args && has_pos_only_args) || pos_only_pos < kw_only_pos, "py::pos_only must come before py::kw_only");
         /* Generate a readable signature describing the function's arguments and return value types */
-        static constexpr auto signature = _("(") + cast_in::arg_names + _(") -> ") + cast_out::name;
+        static constexpr auto signature = const_name("(") + cast_in::arg_names + const_name(") -> ") + cast_out::name;
         PYBIND11_DESCR_CONSTEXPR auto types = decltype(signature)::types();
         /* Register the function with Python from generic (non-templated) code */
-        initialize_generic(rec, signature.text, types.data(), sizeof...(Args));
-        if (cast_in::has_args) rec->has_args = true;
-        if (cast_in::has_kwargs) rec->has_kwargs = true;
+        // Pass on the ownership over the `unique_rec` to `initialize_generic`. `rec` stays valid.
+        initialize_generic(std::move(unique_rec), signature.text, types.data(), sizeof...(Args));
         /* Stash some additional information used by an important optimization in 'functional.h' */
         using FunctionType = Return (*)(Args...);
@@ -217,27 +280,59 @@ class cpp_function : public function {
+    // Utility class that keeps track of all duplicated strings, and cleans them up in its destructor,
+    // unless they are released. Basically a RAII-solution to deal with exceptions along the way.
+    class strdup_guard {
+    public:
+        ~strdup_guard() {
+            for (auto s : strings)
+                std::free(s);
+        }
+        char *operator()(const char *s) {
+            auto t = PYBIND11_COMPAT_STRDUP(s);
+            strings.push_back(t);
+            return t;
+        }
+        void release() {
+            strings.clear();
+        }
+    private:
+        std::vector<char *> strings;
+    };
     /// Register a function call with Python (generic non-templated code goes here)
-    void initialize_generic(detail::function_record *rec, const char *text,
+    void initialize_generic(unique_function_record &&unique_rec, const char *text,
                             const std::type_info *const *types, size_t args) {
+        // Do NOT receive `unique_rec` by value. If this function fails to move out the unique_ptr,
+        // we do not want this to destuct the pointer. `initialize` (the caller) still relies on the
+        // pointee being alive after this call. Only move out if a `capsule` is going to keep it alive.
+        auto rec = unique_rec.get();
+        // Keep track of strdup'ed strings, and clean them up as long as the function's capsule
+        // has not taken ownership yet (when `unique_rec.release()` is called).
+        // Note: This cannot easily be fixed by a `unique_ptr` with custom deleter, because the strings
+        // are only referenced before strdup'ing. So only *after* the following block could `destruct`
+        // safely be called, but even then, `repr` could still throw in the middle of copying all strings.
+        strdup_guard guarded_strdup;
         /* Create copies of all referenced C-style strings */
-        rec->name = strdup(rec->name ? rec->name : "");
-        if (rec->doc) rec->doc = strdup(rec->doc);
+        rec->name = guarded_strdup(rec->name ? rec->name : "");
+        if (rec->doc) rec->doc = guarded_strdup(rec->doc);
         for (auto &a: rec->args) {
             if (a.name)
-                a.name = strdup(a.name);
+                a.name = guarded_strdup(a.name);
             if (a.descr)
-                a.descr = strdup(a.descr);
+                a.descr = guarded_strdup(a.descr);
             else if (a.value)
-                a.descr = strdup(repr(a.value).cast<std::string>().c_str());
+                a.descr = guarded_strdup(repr(a.value).cast<std::string>().c_str());
-        rec->is_constructor = !strcmp(rec->name, "__init__") || !strcmp(rec->name, "__setstate__");
+        rec->is_constructor = (std::strcmp(rec->name, "__init__") == 0)
+                              || (std::strcmp(rec->name, "__setstate__") == 0);
         if (rec->is_constructor && !rec->is_new_style_constructor) {
-            const auto class_name = std::string(((PyTypeObject *) rec->scope.ptr())->tp_name);
+            const auto class_name = detail::get_fully_qualified_tp_name((PyTypeObject *) rec->scope.ptr());
             const auto func_name = std::string(rec->name);
@@ -252,16 +347,18 @@ class cpp_function : public function {
         /* Generate a proper function signature */
         std::string signature;
         size_t type_index = 0, arg_index = 0;
+        bool is_starred = false;
         for (auto *pc = text; *pc != '\0'; ++pc) {
             const auto c = *pc;
             if (c == '{') {
                 // Write arg name for everything except *args and **kwargs.
-                if (*(pc + 1) == '*')
+                is_starred = *(pc + 1) == '*';
+                if (is_starred)
                 // Separator for keyword-only arguments, placed before the kw
-                // arguments start
-                if (rec->nargs_kw_only > 0 && arg_index + rec->nargs_kw_only == args)
+                // arguments start (unless we are already putting an *args)
+                if (!rec->has_args && arg_index == rec->nargs_pos)
                     signature += "*, ";
                 if (arg_index < rec->args.size() && rec->args[arg_index].name) {
                     signature += rec->args[arg_index].name;
@@ -273,7 +370,7 @@ class cpp_function : public function {
                 signature += ": ";
             } else if (c == '}') {
                 // Write default value if available.
-                if (arg_index < rec->args.size() && rec->args[arg_index].descr) {
+                if (!is_starred && arg_index < rec->args.size() && rec->args[arg_index].descr) {
                     signature += " = ";
                     signature += rec->args[arg_index].descr;
@@ -281,7 +378,8 @@ class cpp_function : public function {
                 // argument, rather than before like *
                 if (rec->nargs_pos_only > 0 && (arg_index + 1) == rec->nargs_pos_only)
                     signature += ", /";
-                arg_index++;
+                if (!is_starred)
+                    arg_index++;
             } else if (c == '%') {
                 const std::type_info *t = types[type_index++];
                 if (!t)
@@ -307,19 +405,19 @@ class cpp_function : public function {
-        if (arg_index != args || types[type_index] != nullptr)
+        if (arg_index != args - rec->has_args - rec->has_kwargs || types[type_index] != nullptr)
             pybind11_fail("Internal error while parsing type signature (2)");
-        if (strcmp(rec->name, "__next__") == 0) {
+        if (std::strcmp(rec->name, "__next__") == 0) {
-            rec->name = strdup("next");
-        } else if (strcmp(rec->name, "__bool__") == 0) {
+            rec->name = guarded_strdup("next");
+        } else if (std::strcmp(rec->name, "__bool__") == 0) {
-            rec->name = strdup("__nonzero__");
+            rec->name = guarded_strdup("__nonzero__");
-        rec->signature = strdup(signature.c_str());
+        rec->signature = guarded_strdup(signature.c_str());
         rec->nargs = (std::uint16_t) args;
@@ -329,7 +427,8 @@ class cpp_function : public function {
         detail::function_record *chain = nullptr, *chain_start = rec;
         if (rec->sibling) {
             if (PyCFunction_Check(rec->sibling.ptr())) {
-                auto rec_capsule = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(rec->sibling.ptr()));
+                auto *self = PyCFunction_GET_SELF(rec->sibling.ptr());
+                capsule rec_capsule = isinstance<capsule>(self) ? reinterpret_borrow<capsule>(self) : capsule(self);
                 chain = (detail::function_record *) rec_capsule;
                 /* Never append a method to an overload chain of a parent class;
                    instead, hide the parent's overloads in this case */
@@ -347,12 +446,14 @@ class cpp_function : public function {
             rec->def = new PyMethodDef();
             std::memset(rec->def, 0, sizeof(PyMethodDef));
             rec->def->ml_name = rec->name;
-            rec->def->ml_meth = reinterpret_cast<PyCFunction>(reinterpret_cast<void (*) (void)>(*dispatcher));
+            rec->def->ml_meth
+                = reinterpret_cast<PyCFunction>(reinterpret_cast<void (*)()>(dispatcher));
             rec->def->ml_flags = METH_VARARGS | METH_KEYWORDS;
-            capsule rec_capsule(rec, [](void *ptr) {
+            capsule rec_capsule(unique_rec.release(), [](void *ptr) {
                 destruct((detail::function_record *) ptr);
+            guarded_strdup.release();
             object scope_module;
             if (rec->scope) {
@@ -367,10 +468,9 @@ class cpp_function : public function {
             if (!m_ptr)
                 pybind11_fail("cpp_function::cpp_function(): Could not allocate function object");
         } else {
-            /* Append at the end of the overload chain */
+            /* Append at the beginning or end of the overload chain */
             m_ptr = rec->sibling.ptr();
-            chain_start = chain;
             if (chain->is_method != rec->is_method)
                 pybind11_fail("overloading a method with both static and instance methods is not supported; "
                     #if defined(NDEBUG)
@@ -380,9 +480,24 @@ class cpp_function : public function {
                         std::string(pybind11::str(rec->scope.attr("__name__"))) + "." + std::string(rec->name) + signature
-            while (chain->next)
-                chain = chain->next;
-            chain->next = rec;
+            if (rec->prepend) {
+                // Beginning of chain; we need to replace the capsule's current head-of-the-chain
+                // pointer with this one, then make this one point to the previous head of the
+                // chain.
+                chain_start = rec;
+                rec->next = chain;
+                auto rec_capsule = reinterpret_borrow<capsule>(((PyCFunctionObject *) m_ptr)->m_self);
+                rec_capsule.set_pointer(unique_rec.release());
+                guarded_strdup.release();
+            } else {
+                // Or end of chain (normal behavior)
+                chain_start = chain;
+                while (chain->next)
+                    chain = chain->next;
+                chain->next = unique_rec.release();
+                guarded_strdup.release();
+            }
         std::string signatures;
@@ -406,7 +521,7 @@ class cpp_function : public function {
                 signatures += it->signature;
                 signatures += "\n";
-            if (it->doc && strlen(it->doc) > 0 && options::show_user_defined_docstrings()) {
+            if (it->doc && it->doc[0] != '\0' && options::show_user_defined_docstrings()) {
                 // If we're appending another docstring, and aren't printing function signatures, we
                 // need to append a newline first:
                 if (!options::show_function_signatures()) {
@@ -421,9 +536,10 @@ class cpp_function : public function {
         /* Install docstring */
         auto *func = (PyCFunctionObject *) m_ptr;
-        if (func->m_ml->ml_doc)
-            std::free(const_cast<char *>(func->m_ml->ml_doc));
-        func->m_ml->ml_doc = strdup(signatures.c_str());
+        std::free(const_cast<char *>(func->m_ml->ml_doc));
+        // Install docstring if it's non-empty (when at least one option is enabled)
+        func->m_ml->ml_doc
+            = signatures.empty() ? nullptr : PYBIND11_COMPAT_STRDUP(signatures.c_str());
         if (rec->is_method) {
             m_ptr = PYBIND11_INSTANCE_METHOD_NEW(m_ptr, rec->scope.ptr());
@@ -434,28 +550,49 @@ class cpp_function : public function {
     /// When a cpp_function is GCed, release any memory allocated by pybind11
-    static void destruct(detail::function_record *rec) {
+    static void destruct(detail::function_record *rec, bool free_strings = true) {
+        // If on Python 3.9, check the interpreter "MICRO" (patch) version.
+        // If this is running on 3.9.0, we have to work around a bug.
+        #if !defined(PYPY_VERSION) && PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION == 9
+            static bool is_zero = Py_GetVersion()[4] == '0';
+        #endif
         while (rec) {
             detail::function_record *next = rec->next;
             if (rec->free_data)
-            std::free((char *) rec->name);
-            std::free((char *) rec->doc);
-            std::free((char *) rec->signature);
-            for (auto &arg: rec->args) {
-                std::free(const_cast<char *>(arg.name));
-                std::free(const_cast<char *>(arg.descr));
-                arg.value.dec_ref();
+            // During initialization, these strings might not have been copied yet,
+            // so they cannot be freed. Once the function has been created, they can.
+            // Check `make_function_record` for more details.
+            if (free_strings) {
+                std::free((char *) rec->name);
+                std::free((char *) rec->doc);
+                std::free((char *) rec->signature);
+                for (auto &arg: rec->args) {
+                    std::free(const_cast<char *>(arg.name));
+                    std::free(const_cast<char *>(arg.descr));
+                }
+            for (auto &arg: rec->args)
+                arg.value.dec_ref();
             if (rec->def) {
                 std::free(const_cast<char *>(rec->def->ml_doc));
-                delete rec->def;
+                // Python 3.9.0 decref's these in the wrong order; rec->def
+                // If loaded on 3.9.0, let these leak (use Python 3.9.1 at runtime to fix)
+                // See https://github.com/python/cpython/pull/22670
+                #if !defined(PYPY_VERSION) && PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION == 9
+                    if (!is_zero)
+                        delete rec->def;
+                #else
+                    delete rec->def;
+                #endif
             delete rec;
             rec = next;
     /// Main dispatch logic for calls to functions bound using pybind11
     static PyObject *dispatcher(PyObject *self, PyObject *args_in, PyObject *kwargs_in) {
         using namespace detail;
@@ -472,15 +609,15 @@ class cpp_function : public function {
         auto self_value_and_holder = value_and_holder();
         if (overloads->is_constructor) {
-            const auto tinfo = get_type_info((PyTypeObject *) overloads->scope.ptr());
-            const auto pi = reinterpret_cast<instance *>(parent.ptr());
-            self_value_and_holder = pi->get_value_and_holder(tinfo, false);
-            if (!self_value_and_holder.type || !self_value_and_holder.inst) {
-                PyErr_SetString(PyExc_TypeError, "__init__(self, ...) called with invalid `self` argument");
+            if (!parent || !PyObject_TypeCheck(parent.ptr(), (PyTypeObject *) overloads->scope.ptr())) {
+                PyErr_SetString(PyExc_TypeError, "__init__(self, ...) called with invalid or missing `self` argument");
                 return nullptr;
+            const auto tinfo = get_type_info((PyTypeObject *) overloads->scope.ptr());
+            const auto pi = reinterpret_cast<instance *>(parent.ptr());
+            self_value_and_holder = pi->get_value_and_holder(tinfo, true);
             // If this value is already registered it must mean __init__ is invoked multiple times;
             // we really can't support that in C++, so just ignore the second __init__.
             if (self_value_and_holder.instance_registered())
@@ -504,7 +641,7 @@ class cpp_function : public function {
                       named positional arguments weren't *also* specified via kwarg.
                    2. If we weren't given enough, try to make up the omitted ones by checking
                       whether they were provided by a kwarg matching the `py::arg("name")` name.  If
-                      so, use it (and remove it from kwargs; if not, see if the function binding
+                      so, use it (and remove it from kwargs); if not, see if the function binding
                       provided a default that we can use.
                    3. Ensure that either all keyword arguments were "consumed", or that the function
                       takes a kwargs argument to accept unconsumed kwargs.
@@ -522,7 +659,7 @@ class cpp_function : public function {
                 size_t num_args = func.nargs;    // Number of positional arguments that we need
                 if (func.has_args) --num_args;   // (but don't count py::args
                 if (func.has_kwargs) --num_args; //  or py::kwargs)
-                size_t pos_args = num_args - func.nargs_kw_only;
+                size_t pos_args = func.nargs_pos;
                 if (!func.has_args && n_args_in > pos_args)
                     continue; // Too many positional arguments for this overload
@@ -552,7 +689,7 @@ class cpp_function : public function {
                 bool bad_arg = false;
                 for (; args_copied < args_to_copy; ++args_copied) {
                     const argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr;
-                    if (kwargs_in && arg_rec && arg_rec->name && PyDict_GetItemString(kwargs_in, arg_rec->name)) {
+                    if (kwargs_in && arg_rec && arg_rec->name && dict_getitemstring(kwargs_in, arg_rec->name)) {
                         bad_arg = true;
@@ -568,21 +705,25 @@ class cpp_function : public function {
                 if (bad_arg)
                     continue; // Maybe it was meant for another overload (issue #688)
+                // Keep track of how many position args we copied out in case we need to come back
+                // to copy the rest into a py::args argument.
+                size_t positional_args_copied = args_copied;
                 // We'll need to copy this if we steal some kwargs for defaults
                 dict kwargs = reinterpret_borrow<dict>(kwargs_in);
                 // 1.5. Fill in any missing pos_only args from defaults if they exist
                 if (args_copied < func.nargs_pos_only) {
                     for (; args_copied < func.nargs_pos_only; ++args_copied) {
-                        const auto &arg = func.args[args_copied];
+                        const auto &arg_rec = func.args[args_copied];
                         handle value;
-                        if (arg.value) {
-                            value = arg.value;
+                        if (arg_rec.value) {
+                            value = arg_rec.value;
                         if (value) {
-                            call.args_convert.push_back(arg.convert);
+                            call.args_convert.push_back(arg_rec.convert);
                         } else
@@ -596,11 +737,11 @@ class cpp_function : public function {
                     bool copied_kwargs = false;
                     for (; args_copied < num_args; ++args_copied) {
-                        const auto &arg = func.args[args_copied];
+                        const auto &arg_rec = func.args[args_copied];
                         handle value;
-                        if (kwargs_in && arg.name)
-                            value = PyDict_GetItemString(kwargs.ptr(), arg.name);
+                        if (kwargs_in && arg_rec.name)
+                            value = dict_getitemstring(kwargs.ptr(), arg_rec.name);
                         if (value) {
                             // Consume a kwargs value
@@ -608,14 +749,24 @@ class cpp_function : public function {
                                 kwargs = reinterpret_steal<dict>(PyDict_Copy(kwargs.ptr()));
                                 copied_kwargs = true;
-                            PyDict_DelItemString(kwargs.ptr(), arg.name);
-                        } else if (arg.value) {
-                            value = arg.value;
+                            if (PyDict_DelItemString(kwargs.ptr(), arg_rec.name) == -1) {
+                                throw error_already_set();
+                            }
+                        } else if (arg_rec.value) {
+                            value = arg_rec.value;
+                        }
+                        if (!arg_rec.none && value.is_none()) {
+                            break;
                         if (value) {
+                            // If we're at the py::args index then first insert a stub for it to be replaced later
+                            if (func.has_args && call.args.size() == func.nargs_pos)
+                                call.args.push_back(none());
-                            call.args_convert.push_back(arg.convert);
+                            call.args_convert.push_back(arg_rec.convert);
@@ -636,16 +787,19 @@ class cpp_function : public function {
                         // We didn't copy out any position arguments from the args_in tuple, so we
                         // can reuse it directly without copying:
                         extra_args = reinterpret_borrow<tuple>(args_in);
-                    } else if (args_copied >= n_args_in) {
+                    } else if (positional_args_copied >= n_args_in) {
                         extra_args = tuple(0);
                     } else {
-                        size_t args_size = n_args_in - args_copied;
+                        size_t args_size = n_args_in - positional_args_copied;
                         extra_args = tuple(args_size);
                         for (size_t i = 0; i < args_size; ++i) {
-                            extra_args[i] = PyTuple_GET_ITEM(args_in, args_copied + i);
+                            extra_args[i] = PyTuple_GET_ITEM(args_in, positional_args_copied + i);
-                    call.args.push_back(extra_args);
+                    if (call.args.size() <= func.nargs_pos)
+                        call.args.push_back(extra_args);
+                    else
+                        call.args[func.nargs_pos] = extra_args;
                     call.args_ref = std::move(extra_args);
@@ -724,14 +878,18 @@ class cpp_function : public function {
         } catch (error_already_set &e) {
             return nullptr;
-#if defined(__GNUG__) && !defined(__clang__)
+#ifdef __GLIBCXX__
         } catch ( abi::__forced_unwind& ) {
         } catch (...) {
             /* When an exception is caught, give each registered exception
-               translator a chance to translate it to a Python exception
-               in reverse order of registration.
+               translator a chance to translate it to a Python exception. First
+               all module-local translators will be tried in reverse order of
+               registration. If none of the module-locale translators handle
+               the exception (or there are no module-locale translators) then
+               the global translators will be tried, also in reverse order of
+               registration.
                A translator may choose to do one of the following:
@@ -740,17 +898,15 @@ class cpp_function : public function {
                 - do nothing and let the exception fall through to the next translator, or
                 - delegate translation to the next translator by throwing a new type of exception. */
-            auto last_exception = std::current_exception();
-            auto &registered_exception_translators = get_internals().registered_exception_translators;
-            for (auto& translator : registered_exception_translators) {
-                try {
-                    translator(last_exception);
-                } catch (...) {
-                    last_exception = std::current_exception();
-                    continue;
-                }
+            auto &local_exception_translators = get_local_internals().registered_exception_translators;
+            if (detail::apply_exception_translators(local_exception_translators)) {
                 return nullptr;
+            auto &exception_translators = get_internals().registered_exception_translators;
+            if (detail::apply_exception_translators(exception_translators)) {
+                return nullptr;
+            }
             PyErr_SetString(PyExc_SystemError, "Exception escaped from default exception translator!");
             return nullptr;
@@ -832,47 +988,54 @@ class cpp_function : public function {
+#if PY_VERSION_HEX >= 0x03030000
+            // Attach additional error info to the exception if supported
+            if (PyErr_Occurred()) {
+                // #HelpAppreciated: unit test coverage for this branch.
+                raise_from(PyExc_TypeError, msg.c_str());
+                return nullptr;
+            }
             PyErr_SetString(PyExc_TypeError, msg.c_str());
             return nullptr;
-        } else if (!result) {
+        }
+        if (!result) {
             std::string msg = "Unable to convert function return value to a "
                               "Python type! The signature was\n\t";
             msg += it->signature;
+#if PY_VERSION_HEX >= 0x03030000
+            // Attach additional error info to the exception if supported
+            if (PyErr_Occurred()) {
+                raise_from(PyExc_TypeError, msg.c_str());
+                return nullptr;
+            }
             PyErr_SetString(PyExc_TypeError, msg.c_str());
             return nullptr;
-        } else {
-            if (overloads->is_constructor && !self_value_and_holder.holder_constructed()) {
-                auto *pi = reinterpret_cast<instance *>(parent.ptr());
-                self_value_and_holder.type->init_instance(pi, nullptr);
-            }
-            return result.ptr();
+        if (overloads->is_constructor && !self_value_and_holder.holder_constructed()) {
+            auto *pi = reinterpret_cast<instance *>(parent.ptr());
+            self_value_and_holder.type->init_instance(pi, nullptr);
+        }
+        return result.ptr();
 /// Wrapper for Python extension modules
 class module_ : public object {
     PYBIND11_OBJECT_DEFAULT(module_, object, PyModule_Check)
     /// Create a new top-level Python module with the given name and docstring
+    PYBIND11_DEPRECATED("Use PYBIND11_MODULE or module_::create_extension_module instead")
     explicit module_(const char *name, const char *doc = nullptr) {
-        if (!options::show_user_defined_docstrings()) doc = nullptr;
-        auto *def = new PyModuleDef();
-        std::memset(def, 0, sizeof(PyModuleDef));
-        def->m_name = name;
-        def->m_doc = doc;
-        def->m_size = -1;
-        Py_INCREF(def);
-        m_ptr = PyModule_Create(def);
+        *this = create_extension_module(name, doc, new PyModuleDef());
-        m_ptr = Py_InitModule3(name, nullptr, doc);
+        *this = create_extension_module(name, doc, nullptr);
-        if (m_ptr == nullptr)
-            pybind11_fail("Internal error in module_::module_()");
-        inc_ref();
     /** \rst
@@ -896,9 +1059,9 @@ class module_ : public object {
         .. code-block:: cpp
-            py::module m("example", "pybind11 example plugin");
-            py::module m2 = m.def_submodule("sub", "A submodule of 'example'");
-            py::module m3 = m2.def_submodule("subsub", "A submodule of 'example.sub'");
+            py::module_ m("example", "pybind11 example plugin");
+            py::module_ m2 = m.def_submodule("sub", "A submodule of 'example'");
+            py::module_ m3 = m2.def_submodule("subsub", "A submodule of 'example.sub'");
     \endrst */
     module_ def_submodule(const char *name, const char *doc = nullptr) {
         std::string full_name = std::string(PyModule_GetName(m_ptr))
@@ -926,11 +1089,13 @@ class module_ : public object {
         *this = reinterpret_steal<module_>(obj);
-    // Adds an object to the module using the given name.  Throws if an object with the given name
-    // already exists.
-    //
-    // overwrite should almost always be false: attempting to overwrite objects that pybind11 has
-    // established will, in most cases, break things.
+    /** \rst
+        Adds an object to the module using the given name.  Throws if an object with the given name
+        already exists.
+        ``overwrite`` should almost always be false: attempting to overwrite objects that pybind11 has
+        established will, in most cases, break things.
+    \endrst */
     PYBIND11_NOINLINE void add_object(const char *name, handle obj, bool overwrite = false) {
         if (!overwrite && hasattr(*this, name))
             pybind11_fail("Error during initialization: multiple incompatible definitions with name \"" +
@@ -938,8 +1103,53 @@ class module_ : public object {
         PyModule_AddObject(ptr(), name, obj.inc_ref().ptr() /* steals a reference */);
+    using module_def = PyModuleDef;
+    struct module_def {};
+    /** \rst
+        Create a new top-level module that can be used as the main module of a C extension.
+        For Python 3, ``def`` should point to a statically allocated module_def.
+        For Python 2, ``def`` can be a nullptr and is completely ignored.
+    \endrst */
+    static module_ create_extension_module(const char *name, const char *doc, module_def *def) {
+        // module_def is PyModuleDef
+        def = new (def) PyModuleDef {  // Placement new (not an allocation).
+            /* m_base */     PyModuleDef_HEAD_INIT,
+            /* m_name */     name,
+            /* m_doc */      options::show_user_defined_docstrings() ? doc : nullptr,
+            /* m_size */     -1,
+            /* m_methods */  nullptr,
+            /* m_slots */    nullptr,
+            /* m_traverse */ nullptr,
+            /* m_clear */    nullptr,
+            /* m_free */     nullptr
+        };
+        auto m = PyModule_Create(def);
+        // Ignore module_def *def; only necessary for Python 3
+        (void) def;
+        auto m = Py_InitModule3(name, nullptr, options::show_user_defined_docstrings() ? doc : nullptr);
+        if (m == nullptr) {
+            if (PyErr_Occurred())
+                throw error_already_set();
+            pybind11_fail("Internal error in module_::create_extension_module()");
+        }
+        // TODO: Should be reinterpret_steal for Python 3, but Python also steals it again when returned from PyInit_...
+        //       For Python 2, reinterpret_borrow is correct.
+        return reinterpret_borrow<module_>(m);
+    }
+// When inside a namespace (or anywhere as long as it's not the first item on a line),
+// C++20 allows "module" to be used. This is provided for backward compatibility, and for
+// simplicity, if someone wants to use py::module for example, that is perfectly safe.
 using module = module_;
 /// \ingroup python_builtins
@@ -947,22 +1157,31 @@ using module = module_;
 /// or ``__main__.__dict__`` if there is no frame (usually when the interpreter is embedded).
 inline dict globals() {
     PyObject *p = PyEval_GetGlobals();
-    return reinterpret_borrow<dict>(p ? p : module::import("__main__").attr("__dict__").ptr());
+    return reinterpret_borrow<dict>(p ? p : module_::import("__main__").attr("__dict__").ptr());
+#if PY_VERSION_HEX >= 0x03030000
+template <typename... Args,
+          typename = detail::enable_if_t<args_are_all_keyword_or_ds<Args...>()>>
+PYBIND11_DEPRECATED("make_simple_namespace should be replaced with py::module_::import(\"types\").attr(\"SimpleNamespace\") ")
+object make_simple_namespace(Args&&... args_) {
+    return module_::import("types").attr("SimpleNamespace")(std::forward<Args>(args_)...);
 /// Generic support for creating new Python heap types
 class generic_type : public object {
-    template <typename...> friend class class_;
     PYBIND11_OBJECT_DEFAULT(generic_type, object, PyType_Check)
     void initialize(const type_record &rec) {
-        if (rec.scope && hasattr(rec.scope, rec.name))
+        if (rec.scope && hasattr(rec.scope, "__dict__") && rec.scope.attr("__dict__").contains(rec.name))
             pybind11_fail("generic_type: cannot initialize type \"" + std::string(rec.name) +
                           "\": an object with that name is already defined");
-        if (rec.module_local ? get_local_type_info(*rec.type) : get_global_type_info(*rec.type))
+        if ((rec.module_local ? get_local_type_info(*rec.type) : get_global_type_info(*rec.type))
+            != nullptr)
             pybind11_fail("generic_type: type \"" + std::string(rec.name) +
                           "\" is already registered!");
@@ -987,7 +1206,7 @@ class generic_type : public object {
         auto tindex = std::type_index(*rec.type);
         tinfo->direct_conversions = &internals.direct_conversions[tindex];
         if (rec.module_local)
-            registered_local_types_cpp()[tindex] = tinfo;
+            get_local_internals().registered_types_cpp[tindex] = tinfo;
             internals.registered_types_cpp[tindex] = tinfo;
         internals.registered_types_py[(PyTypeObject *) m_ptr] = { tinfo };
@@ -997,8 +1216,12 @@ class generic_type : public object {
             tinfo->simple_ancestors = false;
         else if (rec.bases.size() == 1) {
-            auto parent_tinfo = get_type_info((PyTypeObject *) rec.bases[0].ptr());
-            tinfo->simple_ancestors = parent_tinfo->simple_ancestors;
+            auto *parent_tinfo = get_type_info((PyTypeObject *) rec.bases[0].ptr());
+            assert(parent_tinfo != nullptr);
+            bool parent_simple_ancestors = parent_tinfo->simple_ancestors;
+            tinfo->simple_ancestors = parent_simple_ancestors;
+            // The parent can no longer be a simple type if it has MI and has a child
+            parent_tinfo->simple_type = parent_tinfo->simple_type && parent_simple_ancestors;
         if (rec.module_local) {
@@ -1028,7 +1251,7 @@ class generic_type : public object {
         if (!type->ht_type.tp_as_buffer)
                 "To be able to register buffer protocol support for the type '" +
-                std::string(tinfo->type->tp_name) +
+                get_fully_qualified_tp_name(tinfo->type) +
                 "' the associated class<>(..) invocation must "
                 "include the pybind11::buffer_protocol() annotation!");
@@ -1040,8 +1263,9 @@ class generic_type : public object {
     void def_property_static_impl(const char *name,
                                   handle fget, handle fset,
                                   detail::function_record *rec_func) {
-        const auto is_static = rec_func && !(rec_func->is_method && rec_func->scope);
-        const auto has_doc = rec_func && rec_func->doc && pybind11::options::show_user_defined_docstrings();
+        const auto is_static = (rec_func != nullptr) && !(rec_func->is_method && rec_func->scope);
+        const auto has_doc = (rec_func != nullptr) && (rec_func->doc != nullptr)
+                             && pybind11::options::show_user_defined_docstrings();
         auto property = handle((PyObject *) (is_static ? get_internals().static_property_type
                                                        : &PyProperty_Type));
         attr(name) = property(fget.ptr() ? fget : none(),
@@ -1090,8 +1314,8 @@ inline void call_operator_delete(void *p, size_t s, size_t a) {
 inline void add_class_method(object& cls, const char *name_, const cpp_function &cf) {
     cls.attr(cf.name()) = cf;
-    if (strcmp(name_, "__eq__") == 0 && !cls.attr("__dict__").contains("__hash__")) {
-      cls.attr("__hash__") = none();
+    if (std::strcmp(name_, "__eq__") == 0 && !cls.attr("__dict__").contains("__hash__")) {
+        cls.attr("__hash__") = none();
@@ -1173,7 +1397,7 @@ class class_ : public detail::generic_type {
         if (has_alias) {
-            auto &instances = record.module_local ? registered_local_types_cpp() : get_internals().registered_types_cpp;
+            auto &instances = record.module_local ? get_local_internals().registered_types_cpp : get_internals().registered_types_cpp;
             instances[std::type_index(typeid(type_alias))] = instances[std::type_index(typeid(type))];
@@ -1220,12 +1444,14 @@ class class_ : public detail::generic_type {
     template <typename... Args, typename... Extra>
     class_ &def(const detail::initimpl::constructor<Args...> &init, const Extra&... extra) {
         init.execute(*this, extra...);
         return *this;
     template <typename... Args, typename... Extra>
     class_ &def(const detail::initimpl::alias_constructor<Args...> &init, const Extra&... extra) {
         init.execute(*this, extra...);
         return *this;
@@ -1242,7 +1468,8 @@ class class_ : public detail::generic_type {
         return *this;
-    template <typename Func> class_& def_buffer(Func &&func) {
+    template <typename Func>
+    class_& def_buffer(Func &&func) {
         struct capture { Func func; };
         auto *ptr = new capture { std::forward<Func>(func) };
         install_buffer_funcs([](PyObject *obj, void *ptr) -> buffer_info* {
@@ -1251,6 +1478,10 @@ class class_ : public detail::generic_type {
                 return nullptr;
             return new buffer_info(((capture *) ptr)->func(caster));
         }, ptr);
+        weakref(m_ptr, cpp_function([ptr](handle wr) {
+            delete ptr;
+            wr.dec_ref();
+        })).release();
         return *this;
@@ -1283,15 +1514,15 @@ class class_ : public detail::generic_type {
     template <typename D, typename... Extra>
     class_ &def_readwrite_static(const char *name, D *pm, const Extra& ...extra) {
-        cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this)),
-                     fset([pm](object, const D &value) { *pm = value; }, scope(*this));
+        cpp_function fget([pm](const object &) -> const D & { return *pm; }, scope(*this)),
+            fset([pm](const object &, const D &value) { *pm = value; }, scope(*this));
         def_property_static(name, fget, fset, return_value_policy::reference, extra...);
         return *this;
     template <typename D, typename... Extra>
     class_ &def_readonly_static(const char *name, const D *pm, const Extra& ...extra) {
-        cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this));
+        cpp_function fget([pm](const object &) -> const D & { return *pm; }, scope(*this));
         def_property_readonly_static(name, fget, return_value_policy::reference, extra...);
         return *this;
@@ -1355,16 +1586,16 @@ class class_ : public detail::generic_type {
            char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */
            detail::process_attributes<Extra...>::init(extra..., rec_fget);
            if (rec_fget->doc && rec_fget->doc != doc_prev) {
-              free(doc_prev);
-              rec_fget->doc = strdup(rec_fget->doc);
+              std::free(doc_prev);
+              rec_fget->doc = PYBIND11_COMPAT_STRDUP(rec_fget->doc);
         if (rec_fset) {
             char *doc_prev = rec_fset->doc;
             detail::process_attributes<Extra...>::init(extra..., rec_fset);
             if (rec_fset->doc && rec_fset->doc != doc_prev) {
-                free(doc_prev);
-                rec_fset->doc = strdup(rec_fset->doc);
+                std::free(doc_prev);
+                rec_fset->doc = PYBIND11_COMPAT_STRDUP(rec_fset->doc);
             if (! rec_active) rec_active = rec_fset;
@@ -1377,14 +1608,13 @@ class class_ : public detail::generic_type {
     template <typename T>
     static void init_holder(detail::instance *inst, detail::value_and_holder &v_h,
             const holder_type * /* unused */, const std::enable_shared_from_this<T> * /* dummy */) {
-        try {
-            auto sh = std::dynamic_pointer_cast<typename holder_type::element_type>(
-                    v_h.value_ptr<type>()->shared_from_this());
-            if (sh) {
-                new (std::addressof(v_h.holder<holder_type>())) holder_type(std::move(sh));
-                v_h.set_holder_constructed();
-            }
-        } catch (const std::bad_weak_ptr &) {}
+        auto sh = std::dynamic_pointer_cast<typename holder_type::element_type>(
+                detail::try_get_shared_from_this(v_h.value_ptr<type>()));
+        if (sh) {
+            new (std::addressof(v_h.holder<holder_type>())) holder_type(std::move(sh));
+            v_h.set_holder_constructed();
+        }
         if (!v_h.holder_constructed() && inst->owned) {
             new (std::addressof(v_h.holder<holder_type>())) holder_type(v_h.value_ptr<type>());
@@ -1481,8 +1711,18 @@ detail::initimpl::pickle_factory<GetState, SetState> pickle(GetState &&g, SetSta
+inline str enum_name(handle arg) {
+    dict entries = arg.get_type().attr("__entries");
+    for (auto kv : entries) {
+        if (handle(kv.second[int_(0)]).equal(arg))
+            return pybind11::str(kv.first);
+    }
+    return "???";
 struct enum_base {
-    enum_base(handle base, handle parent) : m_base(base), m_parent(parent) { }
+    enum_base(const handle &base, const handle &parent) : m_base(base), m_parent(parent) { }
     PYBIND11_NOINLINE void init(bool is_arithmetic, bool is_convertible) {
         m_base.attr("__entries") = dict();
@@ -1490,29 +1730,22 @@ struct enum_base {
         auto static_property = handle((PyObject *) get_internals().static_property_type);
         m_base.attr("__repr__") = cpp_function(
-            [](handle arg) -> str {
+            [](const object &arg) -> str {
                 handle type = type::handle_of(arg);
                 object type_name = type.attr("__name__");
-                dict entries = type.attr("__entries");
-                for (const auto &kv : entries) {
-                    object other = kv.second[int_(0)];
-                    if (other.equal(arg))
-                        return pybind11::str("{}.{}").format(type_name, kv.first);
-                }
-                return pybind11::str("{}.???").format(type_name);
-            }, name("__repr__"), is_method(m_base)
-        );
+                return pybind11::str("<{}.{}: {}>").format(type_name, enum_name(arg), int_(arg));
+            },
+            name("__repr__"),
+            is_method(m_base));
+        m_base.attr("name") = property(cpp_function(&enum_name, name("name"), is_method(m_base)));
-        m_base.attr("name") = property(cpp_function(
+        m_base.attr("__str__") = cpp_function(
             [](handle arg) -> str {
-                dict entries = type::handle_of(arg).attr("__entries");
-                for (const auto &kv : entries) {
-                    if (handle(kv.second[int_(0)]).equal(arg))
-                        return pybind11::str(kv.first);
-                }
-                return "???";
+                object type_name = type::handle_of(arg).attr("__name__");
+                return pybind11::str("{}.{}").format(type_name, enum_name(arg));
             }, name("name"), is_method(m_base)
-        ));
+        );
         m_base.attr("__doc__") = static_property(cpp_function(
             [](handle arg) -> std::string {
@@ -1521,7 +1754,7 @@ struct enum_base {
                 if (((PyTypeObject *) arg.ptr())->tp_doc)
                     docstring += std::string(((PyTypeObject *) arg.ptr())->tp_doc) + "\n\n";
                 docstring += "Members:";
-                for (const auto &kv : entries) {
+                for (auto kv : entries) {
                     auto key = std::string(pybind11::str(kv.first));
                     auto comment = kv.second[int_(1)];
                     docstring += "\n\n  " + key;
@@ -1535,36 +1768,42 @@ struct enum_base {
         m_base.attr("__members__") = static_property(cpp_function(
             [](handle arg) -> dict {
                 dict entries = arg.attr("__entries"), m;
-                for (const auto &kv : entries)
+                for (auto kv : entries)
                     m[kv.first] = kv.second[int_(0)];
                 return m;
             }, name("__members__")), none(), none(), ""
-        #define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior)                     \
-            m_base.attr(op) = cpp_function(                                            \
-                [](object a, object b) {                                               \
-                    if (!type::handle_of(a).is(type::handle_of(b)))                    \
-                        strict_behavior;                                               \
-                    return expr;                                                       \
-                },                                                                     \
-                name(op), is_method(m_base))
-        #define PYBIND11_ENUM_OP_CONV(op, expr)                                        \
-            m_base.attr(op) = cpp_function(                                            \
-                [](object a_, object b_) {                                             \
-                    int_ a(a_), b(b_);                                                 \
-                    return expr;                                                       \
-                },                                                                     \
-                name(op), is_method(m_base))
-        #define PYBIND11_ENUM_OP_CONV_LHS(op, expr)                                    \
-            m_base.attr(op) = cpp_function(                                            \
-                [](object a_, object b) {                                              \
-                    int_ a(a_);                                                        \
-                    return expr;                                                       \
-                },                                                                     \
-                name(op), is_method(m_base))
+#define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior)                                        \
+    m_base.attr(op) = cpp_function(                                                               \
+        [](const object &a, const object &b) {                                                    \
+            if (!type::handle_of(a).is(type::handle_of(b)))                                       \
+                strict_behavior; /* NOLINT(bugprone-macro-parentheses) */                         \
+            return expr;                                                                          \
+        },                                                                                        \
+        name(op),                                                                                 \
+        is_method(m_base),                                                                        \
+        arg("other"))
+#define PYBIND11_ENUM_OP_CONV(op, expr)                                                           \
+    m_base.attr(op) = cpp_function(                                                               \
+        [](const object &a_, const object &b_) {                                                  \
+            int_ a(a_), b(b_);                                                                    \
+            return expr;                                                                          \
+        },                                                                                        \
+        name(op),                                                                                 \
+        is_method(m_base),                                                                        \
+        arg("other"))
+#define PYBIND11_ENUM_OP_CONV_LHS(op, expr)                                                       \
+    m_base.attr(op) = cpp_function(                                                               \
+        [](const object &a_, const object &b) {                                                   \
+            int_ a(a_);                                                                           \
+            return expr;                                                                          \
+        },                                                                                        \
+        name(op),                                                                                 \
+        is_method(m_base),                                                                        \
+        arg("other"))
         if (is_convertible) {
             PYBIND11_ENUM_OP_CONV_LHS("__eq__", !b.is_none() &&  a.equal(b));
@@ -1581,8 +1820,10 @@ struct enum_base {
                 PYBIND11_ENUM_OP_CONV("__ror__",  a |  b);
                 PYBIND11_ENUM_OP_CONV("__xor__",  a ^  b);
                 PYBIND11_ENUM_OP_CONV("__rxor__", a ^  b);
-                m_base.attr("__invert__") = cpp_function(
-                    [](object arg) { return ~(int_(arg)); }, name("__invert__"), is_method(m_base));
+                m_base.attr("__invert__")
+                    = cpp_function([](const object &arg) { return ~(int_(arg)); },
+                                   name("__invert__"),
+                                   is_method(m_base));
         } else {
             PYBIND11_ENUM_OP_STRICT("__eq__",  int_(a).equal(int_(b)), return false);
@@ -1603,10 +1844,10 @@ struct enum_base {
         #undef PYBIND11_ENUM_OP_STRICT
         m_base.attr("__getstate__") = cpp_function(
-            [](object arg) { return int_(arg); }, name("__getstate__"), is_method(m_base));
+            [](const object &arg) { return int_(arg); }, name("__getstate__"), is_method(m_base));
         m_base.attr("__hash__") = cpp_function(
-            [](object arg) { return int_(arg); }, name("__hash__"), is_method(m_base));
+            [](const object &arg) { return int_(arg); }, name("__hash__"), is_method(m_base));
     PYBIND11_NOINLINE void value(char const* name_, object value, const char *doc = nullptr) {
@@ -1623,7 +1864,7 @@ struct enum_base {
     PYBIND11_NOINLINE void export_values() {
         dict entries = m_base.attr("__entries");
-        for (const auto &kv : entries)
+        for (auto kv : entries)
             m_parent.attr(kv.first) = kv.second[int_(0)];
@@ -1631,6 +1872,19 @@ struct enum_base {
     handle m_parent;
+template <bool is_signed, size_t length> struct equivalent_integer {};
+template <> struct equivalent_integer<true,  1> { using type = int8_t;   };
+template <> struct equivalent_integer<false, 1> { using type = uint8_t;  };
+template <> struct equivalent_integer<true,  2> { using type = int16_t;  };
+template <> struct equivalent_integer<false, 2> { using type = uint16_t; };
+template <> struct equivalent_integer<true,  4> { using type = int32_t;  };
+template <> struct equivalent_integer<false, 4> { using type = uint32_t; };
+template <> struct equivalent_integer<true,  8> { using type = int64_t;  };
+template <> struct equivalent_integer<false, 8> { using type = uint64_t; };
+template <typename IntLike>
+using equivalent_integer_t = typename equivalent_integer<std::is_signed<IntLike>::value, sizeof(IntLike)>::type;
 /// Binds C++ enumerations and enumeration classes to Python
@@ -1641,16 +1895,21 @@ template <typename Type> class enum_ : public class_<Type> {
     using Base::attr;
     using Base::def_property_readonly;
     using Base::def_property_readonly_static;
-    using Scalar = typename std::underlying_type<Type>::type;
+    using Underlying = typename std::underlying_type<Type>::type;
+    // Scalar is the integer representation of underlying type
+    using Scalar = detail::conditional_t<detail::any_of<
+        detail::is_std_char_type<Underlying>, std::is_same<Underlying, bool>
+    >::value, detail::equivalent_integer_t<Underlying>, Underlying>;
     template <typename... Extra>
     enum_(const handle &scope, const char *name, const Extra&... extra)
       : class_<Type>(scope, name, extra...), m_base(*this, scope) {
         constexpr bool is_arithmetic = detail::any_of<std::is_same<arithmetic, Extra>...>::value;
-        constexpr bool is_convertible = std::is_convertible<Type, Scalar>::value;
+        constexpr bool is_convertible = std::is_convertible<Type, Underlying>::value;
         m_base.init(is_arithmetic, is_convertible);
-        def(init([](Scalar i) { return static_cast<Type>(i); }));
+        def(init([](Scalar i) { return static_cast<Type>(i); }), arg("value"));
+        def_property_readonly("value", [](Type value) { return (Scalar) value; });
         def("__int__", [](Type value) { return (Scalar) value; });
         #if PY_MAJOR_VERSION < 3
             def("__long__", [](Type value) { return (Scalar) value; });
@@ -1664,7 +1923,7 @@ template <typename Type> class enum_ : public class_<Type> {
                 detail::initimpl::setstate<Base>(v_h, static_cast<Type>(arg),
                         Py_TYPE(v_h.inst) != v_h.type->type); },
-            pybind11::name("__setstate__"), is_method(*this));
+            pybind11::name("__setstate__"), is_method(*this), arg("state"));
     /// Export enumeration entries into the parent scope
@@ -1686,7 +1945,7 @@ template <typename Type> class enum_ : public class_<Type> {
-inline void keep_alive_impl(handle nurse, handle patient) {
+PYBIND11_NOINLINE void keep_alive_impl(handle nurse, handle patient) {
     if (!nurse || !patient)
         pybind11_fail("Could not activate keep_alive!");
@@ -1713,13 +1972,13 @@ inline void keep_alive_impl(handle nurse, handle patient) {
-PYBIND11_NOINLINE inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret) {
+PYBIND11_NOINLINE void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret) {
     auto get_arg = [&](size_t n) {
         if (n == 0)
             return ret;
-        else if (n == 1 && call.init_self)
+        if (n == 1 && call.init_self)
             return call.init_self;
-        else if (n <= call.args.size())
+        if (n <= call.args.size())
             return call.args[n - 1];
         return handle();
@@ -1739,6 +1998,16 @@ inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_t
         // gets destroyed:
         weakref((PyObject *) type, cpp_function([type](handle wr) {
+            // TODO consolidate the erasure code in pybind11_meta_dealloc() in class.h
+            auto &cache = get_internals().inactive_override_cache;
+            for (auto it = cache.begin(), last = cache.end(); it != last; ) {
+                if (it->first == reinterpret_cast<PyObject *>(type))
+                    it = cache.erase(it);
+                else
+                    ++it;
+            }
@@ -1746,23 +2015,79 @@ inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_t
     return res;
-template <typename Iterator, typename Sentinel, bool KeyIterator, return_value_policy Policy>
+/* There are a large number of apparently unused template arguments because
+ * each combination requires a separate py::class_ registration.
+ */
+template <typename Access, return_value_policy Policy, typename Iterator, typename Sentinel, typename ValueType, typename... Extra>
 struct iterator_state {
     Iterator it;
     Sentinel end;
     bool first_or_done;
+// Note: these helpers take the iterator by non-const reference because some
+// iterators in the wild can't be dereferenced when const. The & after Iterator
+// is required for MSVC < 16.9. SFINAE cannot be reused for result_type due to
+// bugs in ICC, NVCC, and PGI compilers. See PR #3293.
+template <typename Iterator, typename SFINAE = decltype(*std::declval<Iterator &>())>
+struct iterator_access {
+    using result_type = decltype(*std::declval<Iterator &>());
+    // NOLINTNEXTLINE(readability-const-return-type) // PR #3263
+    result_type operator()(Iterator &it) const {
+        return *it;
+    }
-/// Makes a python iterator from a first and past-the-end C++ InputIterator.
-template <return_value_policy Policy = return_value_policy::reference_internal,
+template <typename Iterator, typename SFINAE = decltype((*std::declval<Iterator &>()).first) >
+class iterator_key_access {
+    using pair_type = decltype(*std::declval<Iterator &>());
+    /* If either the pair itself or the element of the pair is a reference, we
+     * want to return a reference, otherwise a value. When the decltype
+     * expression is parenthesized it is based on the value category of the
+     * expression; otherwise it is the declared type of the pair member.
+     * The use of declval<pair_type> in the second branch rather than directly
+     * using *std::declval<Iterator &>() is a workaround for nvcc
+     * (it's not used in the first branch because going via decltype and back
+     * through declval does not perfectly preserve references).
+     */
+    using result_type = conditional_t<
+        std::is_reference<decltype(*std::declval<Iterator &>())>::value,
+        decltype(((*std::declval<Iterator &>()).first)),
+        decltype(std::declval<pair_type>().first)
+    >;
+    result_type operator()(Iterator &it) const {
+        return (*it).first;
+    }
+template <typename Iterator, typename SFINAE = decltype((*std::declval<Iterator &>()).second)>
+class iterator_value_access {
+    using pair_type = decltype(*std::declval<Iterator &>());
+    using result_type = conditional_t<
+        std::is_reference<decltype(*std::declval<Iterator &>())>::value,
+        decltype(((*std::declval<Iterator &>()).second)),
+        decltype(std::declval<pair_type>().second)
+    >;
+    result_type operator()(Iterator &it) const {
+        return (*it).second;
+    }
+template <typename Access,
+          return_value_policy Policy,
           typename Iterator,
           typename Sentinel,
-          typename ValueType = decltype(*std::declval<Iterator>()),
+          typename ValueType,
           typename... Extra>
-iterator make_iterator(Iterator first, Sentinel last, Extra &&... extra) {
-    typedef detail::iterator_state<Iterator, Sentinel, false, Policy> state;
+iterator make_iterator_impl(Iterator first, Sentinel last, Extra &&... extra) {
+    using state = detail::iterator_state<Access, Policy, Iterator, Sentinel, ValueType, Extra...>;
+    // TODO: state captures only the types of Extra, not the values
     if (!detail::get_type_info(typeid(state), false)) {
         class_<state>(handle(), "iterator", pybind11::module_local())
@@ -1776,40 +2101,63 @@ iterator make_iterator(Iterator first, Sentinel last, Extra &&... extra) {
                     s.first_or_done = true;
                     throw stop_iteration();
-                return *s.it;
+                return Access()(s.it);
+            // NOLINTNEXTLINE(readability-const-return-type) // PR #3263
             }, std::forward<Extra>(extra)..., Policy);
     return cast(state{first, last, true});
-/// Makes an python iterator over the keys (`.first`) of a iterator over pairs from a
-/// first and past-the-end InputIterator.
+/// Makes a python iterator from a first and past-the-end C++ InputIterator.
 template <return_value_policy Policy = return_value_policy::reference_internal,
           typename Iterator,
           typename Sentinel,
-          typename KeyType = decltype((*std::declval<Iterator>()).first),
+          typename ValueType = typename detail::iterator_access<Iterator>::result_type,
           typename... Extra>
-iterator make_key_iterator(Iterator first, Sentinel last, Extra &&... extra) {
-    using state = detail::iterator_state<Iterator, Sentinel, true, Policy>;
+iterator make_iterator(Iterator first, Sentinel last, Extra &&... extra) {
+    return detail::make_iterator_impl<
+        detail::iterator_access<Iterator>,
+        Policy,
+        Iterator,
+        Sentinel,
+        ValueType,
+        Extra...>(first, last, std::forward<Extra>(extra)...);
-    if (!detail::get_type_info(typeid(state), false)) {
-        class_<state>(handle(), "iterator", pybind11::module_local())
-            .def("__iter__", [](state &s) -> state& { return s; })
-            .def("__next__", [](state &s) -> KeyType {
-                if (!s.first_or_done)
-                    ++s.it;
-                else
-                    s.first_or_done = false;
-                if (s.it == s.end) {
-                    s.first_or_done = true;
-                    throw stop_iteration();
-                }
-                return (*s.it).first;
-            }, std::forward<Extra>(extra)..., Policy);
-    }
+/// Makes a python iterator over the keys (`.first`) of a iterator over pairs from a
+/// first and past-the-end InputIterator.
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Iterator,
+          typename Sentinel,
+          typename KeyType = typename detail::iterator_key_access<Iterator>::result_type,
+          typename... Extra>
+iterator make_key_iterator(Iterator first, Sentinel last, Extra &&...extra) {
+    return detail::make_iterator_impl<
+        detail::iterator_key_access<Iterator>,
+        Policy,
+        Iterator,
+        Sentinel,
+        KeyType,
+        Extra...>(first, last, std::forward<Extra>(extra)...);
-    return cast(state{first, last, true});
+/// Makes a python iterator over the values (`.second`) of a iterator over pairs from a
+/// first and past-the-end InputIterator.
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Iterator,
+          typename Sentinel,
+          typename ValueType = typename detail::iterator_value_access<Iterator>::result_type,
+          typename... Extra>
+iterator make_value_iterator(Iterator first, Sentinel last, Extra &&...extra) {
+    return detail::make_iterator_impl<
+        detail::iterator_value_access<Iterator>,
+        Policy, Iterator,
+        Sentinel,
+        ValueType,
+        Extra...>(first, last, std::forward<Extra>(extra)...);
 /// Makes an iterator over values of an stl container or other container supporting
@@ -1826,10 +2174,17 @@ template <return_value_policy Policy = return_value_policy::reference_internal,
     return make_key_iterator<Policy>(std::begin(value), std::end(value), extra...);
+/// Makes an iterator over the values (`.second`) of a stl map-like container supporting
+/// `std::begin()`/`std::end()`
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Type, typename... Extra> iterator make_value_iterator(Type &value, Extra&&... extra) {
+    return make_value_iterator<Policy>(std::begin(value), std::end(value), extra...);
 template <typename InputType, typename OutputType> void implicitly_convertible() {
     struct set_flag {
         bool &flag;
-        set_flag(bool &flag) : flag(flag) { flag = true; }
+        explicit set_flag(bool &flag_) : flag(flag_) { flag_ = true; }
         ~set_flag() { flag = false; }
     auto implicit_caster = [](PyObject *obj, PyTypeObject *type) -> PyObject * {
@@ -1853,12 +2208,24 @@ template <typename InputType, typename OutputType> void implicitly_convertible()
         pybind11_fail("implicitly_convertible: Unable to find type " + type_id<OutputType>());
-template <typename ExceptionTranslator>
-void register_exception_translator(ExceptionTranslator&& translator) {
+inline void register_exception_translator(ExceptionTranslator &&translator) {
+  * Add a new module-local exception translator. Locally registered functions
+  * will be tried before any globally registered exception translators, which
+  * will only be invoked if the module-local handlers do not deal with
+  * the exception.
+  */
+inline void register_local_exception_translator(ExceptionTranslator &&translator) {
+    detail::get_local_internals().registered_exception_translators.push_front(
+        std::forward<ExceptionTranslator>(translator));
  * Wrapper to generate a new Python exception type.
@@ -1874,7 +2241,7 @@ class exception : public object {
         std::string full_name = scope.attr("__name__").cast<std::string>() +
                                 std::string(".") + name;
         m_ptr = PyErr_NewException(const_cast<char *>(full_name.c_str()), base.ptr(), NULL);
-        if (hasattr(scope, name))
+        if (hasattr(scope, "__dict__") && scope.attr("__dict__").contains(name))
             pybind11_fail("Error during initialization: multiple incompatible "
                           "definitions with name \"" + std::string(name) + "\"");
         scope.attr(name) = *this;
@@ -1892,22 +2259,20 @@ PYBIND11_NAMESPACE_BEGIN(detail)
 // directly in register_exception, but that makes clang <3.5 segfault - issue #1349).
 template <typename CppException>
 exception<CppException> &get_exception_object() { static exception<CppException> ex; return ex; }
- * Registers a Python exception in `m` of the given `name` and installs an exception translator to
- * translate the C++ exception to the created Python exception using the exceptions what() method.
- * This is intended for simple exception translations; for more complex translation, register the
- * exception object and translator directly.
- */
+// Helper function for register_exception and register_local_exception
 template <typename CppException>
-exception<CppException> &register_exception(handle scope,
-                                            const char *name,
-                                            handle base = PyExc_Exception) {
+exception<CppException> &register_exception_impl(handle scope,
+                                                const char *name,
+                                                handle base,
+                                                bool isLocal) {
     auto &ex = detail::get_exception_object<CppException>();
     if (!ex) ex = exception<CppException>(scope, name, base);
-    register_exception_translator([](std::exception_ptr p) {
+    auto register_func = isLocal ? &register_local_exception_translator
+                                 : &register_exception_translator;
+    register_func([](std::exception_ptr p) {
         if (!p) return;
         try {
@@ -1918,8 +2283,38 @@ exception<CppException> &register_exception(handle scope,
     return ex;
+ * Registers a Python exception in `m` of the given `name` and installs a translator to
+ * translate the C++ exception to the created Python exception using the what() method.
+ * This is intended for simple exception translations; for more complex translation, register the
+ * exception object and translator directly.
+ */
+template <typename CppException>
+exception<CppException> &register_exception(handle scope,
+                                            const char *name,
+                                            handle base = PyExc_Exception) {
+    return detail::register_exception_impl<CppException>(scope, name, base, false /* isLocal */);
+ * Registers a Python exception in `m` of the given `name` and installs a translator to
+ * translate the C++ exception to the created Python exception using the what() method.
+ * This translator will only be used for exceptions that are thrown in this module and will be
+ * tried before global exception translators, including those registered with register_exception.
+ * This is intended for simple exception translations; for more complex translation, register the
+ * exception object and translator directly.
+ */
+template <typename CppException>
+exception<CppException> &register_local_exception(handle scope,
+                                                  const char *name,
+                                                  handle base = PyExc_Exception) {
+    return detail::register_exception_impl<CppException>(scope, name, base, true /* isLocal */);
-PYBIND11_NOINLINE inline void print(tuple args, dict kwargs) {
+PYBIND11_NOINLINE void print(const tuple &args, const dict &kwargs) {
     auto strings = tuple(args.size());
     for (size_t i = 0; i < args.size(); ++i) {
         strings[i] = str(args[i]);
@@ -1932,7 +2327,7 @@ PYBIND11_NOINLINE inline void print(tuple args, dict kwargs) {
         file = kwargs["file"].cast<object>();
     } else {
         try {
-            file = module::import("sys").attr("stdout");
+            file = module_::import("sys").attr("stdout");
         } catch (const error_already_set &) {
             /* If print() is called from code that is executed as
                part of garbage collection during interpreter shutdown,
@@ -1957,151 +2352,6 @@ void print(Args &&...args) {
     detail::print(c.args(), c.kwargs());
-#if defined(WITH_THREAD) && !defined(PYPY_VERSION)
-/* The functions below essentially reproduce the PyGILState_* API using a RAII
- * pattern, but there are a few important differences:
- *
- * 1. When acquiring the GIL from an non-main thread during the finalization
- *    phase, the GILState API blindly terminates the calling thread, which
- *    is often not what is wanted. This API does not do this.
- *
- * 2. The gil_scoped_release function can optionally cut the relationship
- *    of a PyThreadState and its associated thread, which allows moving it to
- *    another thread (this is a fairly rare/advanced use case).
- *
- * 3. The reference count of an acquired thread state can be controlled. This
- *    can be handy to prevent cases where callbacks issued from an external
- *    thread would otherwise constantly construct and destroy thread state data
- *    structures.
- *
- * See the Python bindings of NanoGUI (http://github.com/wjakob/nanogui) for an
- * example which uses features 2 and 3 to migrate the Python thread of
- * execution to another thread (to run the event loop on the original thread,
- * in this case).
- */
-class gil_scoped_acquire {
-    PYBIND11_NOINLINE gil_scoped_acquire() {
-        auto const &internals = detail::get_internals();
-        tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate);
-        if (!tstate) {
-            /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if
-               calling from a Python thread). Since we use a different key, this ensures
-               we don't create a new thread state and deadlock in PyEval_AcquireThread
-               below. Note we don't save this state with internals.tstate, since we don't
-               create it we would fail to clear it (its reference count should be > 0). */
-            tstate = PyGILState_GetThisThreadState();
-        }
-        if (!tstate) {
-            tstate = PyThreadState_New(internals.istate);
-            #if !defined(NDEBUG)
-                if (!tstate)
-                    pybind11_fail("scoped_acquire: could not create thread state!");
-            #endif
-            tstate->gilstate_counter = 0;
-            PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate);
-        } else {
-            release = detail::get_thread_state_unchecked() != tstate;
-        }
-        if (release) {
-            /* Work around an annoying assertion in PyThreadState_Swap */
-            #if defined(Py_DEBUG)
-                PyInterpreterState *interp = tstate->interp;
-                tstate->interp = nullptr;
-            #endif
-            PyEval_AcquireThread(tstate);
-            #if defined(Py_DEBUG)
-                tstate->interp = interp;
-            #endif
-        }
-        inc_ref();
-    }
-    void inc_ref() {
-        ++tstate->gilstate_counter;
-    }
-    PYBIND11_NOINLINE void dec_ref() {
-        --tstate->gilstate_counter;
-        #if !defined(NDEBUG)
-            if (detail::get_thread_state_unchecked() != tstate)
-                pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!");
-            if (tstate->gilstate_counter < 0)
-                pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!");
-        #endif
-        if (tstate->gilstate_counter == 0) {
-            #if !defined(NDEBUG)
-                if (!release)
-                    pybind11_fail("scoped_acquire::dec_ref(): internal error!");
-            #endif
-            PyThreadState_Clear(tstate);
-            PyThreadState_DeleteCurrent();
-            PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate);
-            release = false;
-        }
-    }
-    PYBIND11_NOINLINE ~gil_scoped_acquire() {
-        dec_ref();
-        if (release)
-           PyEval_SaveThread();
-    }
-    PyThreadState *tstate = nullptr;
-    bool release = true;
-class gil_scoped_release {
-    explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) {
-        // `get_internals()` must be called here unconditionally in order to initialize
-        // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an
-        // initialization race could occur as multiple threads try `gil_scoped_acquire`.
-        const auto &internals = detail::get_internals();
-        tstate = PyEval_SaveThread();
-        if (disassoc) {
-            auto key = internals.tstate;
-            PYBIND11_TLS_DELETE_VALUE(key);
-        }
-    }
-    ~gil_scoped_release() {
-        if (!tstate)
-            return;
-        PyEval_RestoreThread(tstate);
-        if (disassoc) {
-            auto key = detail::get_internals().tstate;
-            PYBIND11_TLS_REPLACE_VALUE(key, tstate);
-        }
-    }
-    PyThreadState *tstate;
-    bool disassoc;
-#elif defined(PYPY_VERSION)
-class gil_scoped_acquire {
-    PyGILState_STATE state;
-    gil_scoped_acquire() { state = PyGILState_Ensure(); }
-    ~gil_scoped_acquire() { PyGILState_Release(state); }
-class gil_scoped_release {
-    PyThreadState *state;
-    gil_scoped_release() { state = PyEval_SaveThread(); }
-    ~gil_scoped_release() { PyEval_RestoreThread(state); }
-class gil_scoped_acquire { };
-class gil_scoped_release { };
 error_already_set::~error_already_set() {
     if (m_type) {
         gil_scoped_acquire gil;
@@ -2134,16 +2384,42 @@ inline function get_type_override(const void *this_ptr, const type_info *this_ty
     /* Don't call dispatch code if invoked from overridden function.
        Unfortunately this doesn't work on PyPy. */
-#if !defined(PYPY_VERSION)
+#if !defined(PYPY_VERSION) && PY_VERSION_HEX < 0x030B0000
+    // TODO: Remove PyPy workaround for Python 3.11.
+    // Current API fails on 3.11 since co_varnames can be null.
+#if PY_VERSION_HEX >= 0x03090000
+    PyFrameObject *frame = PyThreadState_GetFrame(PyThreadState_Get());
+    if (frame != nullptr) {
+        PyCodeObject *f_code = PyFrame_GetCode(frame);
+        // f_code is guaranteed to not be NULL
+        if ((std::string) str(f_code->co_name) == name && f_code->co_argcount > 0) {
+            PyObject* locals = PyEval_GetLocals();
+            if (locals != nullptr && f_code->co_varnames != nullptr) {
+                PyObject *self_caller = dict_getitem(
+                    locals, PyTuple_GET_ITEM(f_code->co_varnames, 0)
+                );
+                if (self_caller == self.ptr()) {
+                    Py_DECREF(f_code);
+                    Py_DECREF(frame);
+                    return function();
+                }
+            }
+        }
+        Py_DECREF(f_code);
+        Py_DECREF(frame);
+    }
     PyFrameObject *frame = PyThreadState_Get()->frame;
-    if (frame && (std::string) str(frame->f_code->co_name) == name &&
-        frame->f_code->co_argcount > 0) {
+    if (frame != nullptr && (std::string) str(frame->f_code->co_name) == name
+        && frame->f_code->co_argcount > 0) {
-        PyObject *self_caller = PyDict_GetItem(
+        PyObject *self_caller = dict_getitem(
             frame->f_locals, PyTuple_GET_ITEM(frame->f_code->co_varnames, 0));
         if (self_caller == self.ptr())
             return function();
     /* PyPy currently doesn't provide a detailed cpyext emulation of
        frame objects, so we have to emulate this using Python. This
@@ -2174,7 +2450,7 @@ PYBIND11_NAMESPACE_END(detail)
 /** \rst
   Try to retrieve a python method by the provided name from the instance pointed to by the this_ptr.
-  :this_ptr: The pointer to the object the overriden method should be retrieved for. This should be
+  :this_ptr: The pointer to the object the overridden method should be retrieved for. This should be
              the first non-trampoline class encountered in the inheritance chain.
   :name: The name of the overridden Python method to retrieve.
   :return: The Python method by this name from the object or an empty function wrapper.
@@ -2184,18 +2460,19 @@ template <class T> function get_override(const T *this_ptr, const char *name) {
     return tinfo ? detail::get_type_override(this_ptr, tinfo, name) : function();
-#define PYBIND11_OVERRIDE_IMPL(ret_type, cname, name, ...) \
-    do { \
-        pybind11::gil_scoped_acquire gil; \
-        pybind11::function override = pybind11::get_override(static_cast<const cname *>(this), name); \
-        if (override) { \
-            auto o = override(__VA_ARGS__); \
-            if (pybind11::detail::cast_is_temporary_value_reference<ret_type>::value) { \
-                static pybind11::detail::override_caster_t<ret_type> caster; \
-                return pybind11::detail::cast_ref<ret_type>(std::move(o), caster); \
-            } \
-            else return pybind11::detail::cast_safe<ret_type>(std::move(o)); \
-        } \
+#define PYBIND11_OVERRIDE_IMPL(ret_type, cname, name, ...)                                        \
+    do {                                                                                          \
+        pybind11::gil_scoped_acquire gil;                                                         \
+        pybind11::function override                                                               \
+            = pybind11::get_override(static_cast<const cname *>(this), name);                     \
+        if (override) {                                                                           \
+            auto o = override(__VA_ARGS__);                                                       \
+            if (pybind11::detail::cast_is_temporary_value_reference<ret_type>::value) {           \
+                static pybind11::detail::override_caster_t<ret_type> caster;                      \
+                return pybind11::detail::cast_ref<ret_type>(std::move(o), caster);                \
+            }                                                                                     \
+            return pybind11::detail::cast_safe<ret_type>(std::move(o));                           \
+        }                                                                                         \
     } while (false)
 /** \rst
@@ -2291,8 +2568,6 @@ inline function get_overload(const T *this_ptr, const char *name) {
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#  pragma warning(pop)
-#elif defined(__GNUG__) && !defined(__clang__)
-#  pragma GCC diagnostic pop
+#if defined(__GNUC__) && __GNUC__ == 7
+#    pragma GCC diagnostic pop // -Wnoexcept-type
diff --git a/wrap/pybind11/include/pybind11/pytypes.h b/wrap/pybind11/include/pybind11/pytypes.h
index a2f7cec486..902fb1f07d 100644
--- a/wrap/pybind11/include/pybind11/pytypes.h
+++ b/wrap/pybind11/include/pybind11/pytypes.h
@@ -14,6 +14,14 @@
 #include <utility>
 #include <type_traits>
+#if defined(PYBIND11_HAS_OPTIONAL)
+#  include <optional>
+#  include <string_view>
 /* A few forward declarations */
@@ -24,7 +32,7 @@ struct arg; struct arg_v;
 class args_proxy;
-inline bool isinstance_generic(handle obj, const std::type_info &tp);
+bool isinstance_generic(handle obj, const std::type_info &tp);
 // Accessor forward declarations
 template <typename Policy> class accessor;
@@ -153,7 +161,7 @@ class object_api : public pyobject_tag {
     /// Return the object's current reference count
     int ref_count() const { return static_cast<int>(Py_REFCNT(derived().ptr())); }
-    PYBIND11_DEPRECATED("Call py::type::handle_of(h) or py::type::of(h) instead of h.get_type()")
+    // TODO PYBIND11_DEPRECATED("Call py::type::handle_of(h) or py::type::of(h) instead of h.get_type()")
     handle get_type() const;
@@ -178,6 +186,7 @@ class handle : public detail::object_api<handle> {
     /// The default constructor creates a handle with a ``nullptr``-valued pointer
     handle() = default;
     /// Creates a ``handle`` from the given raw Python object pointer
+    // NOLINTNEXTLINE(google-explicit-constructor)
     handle(PyObject *ptr) : m_ptr(ptr) { } // Allow implicit conversion from PyObject*
     /// Return the underlying ``PyObject *`` pointer
@@ -254,8 +263,11 @@ class object : public handle {
     object& operator=(const object &other) {
-        dec_ref();
+        // Use temporary variable to ensure `*this` remains valid while
+        // `Py_XDECREF` executes, in case `*this` is accessible from Python.
+        handle temp(m_ptr);
         m_ptr = other.m_ptr;
+        temp.dec_ref();
         return *this;
@@ -279,8 +291,10 @@ class object : public handle {
     struct borrowed_t { };
     struct stolen_t { };
+    /// @cond BROKEN
     template <typename T> friend T reinterpret_borrow(handle);
     template <typename T> friend T reinterpret_steal(handle);
+    /// @endcond
     // Only accessible from derived classes and the reinterpret_* functions
@@ -314,14 +328,18 @@ template <typename T> T reinterpret_borrow(handle h) { return {h, object::borrow
 template <typename T> T reinterpret_steal(handle h) { return {h, object::stolen_t{}}; }
-inline std::string error_string();
+std::string error_string();
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4275 4251) // warning C4275: An exported class was derived from a class that wasn't exported. Can be ignored when derived from a STL class.
 /// Fetch and hold an error which was already set in Python.  An instance of this is typically
 /// thrown to propagate python-side errors back through C++ which can either be caught manually or
 /// else falls back to the function dispatcher (which then raises the captured error back to
 /// python).
-class error_already_set : public std::runtime_error {
+class PYBIND11_EXPORT_EXCEPTION error_already_set : public std::runtime_error {
     /// Constructs a new exception from the current Python error indicator, if any.  The current
     /// Python error indicator will be cleared.
@@ -339,16 +357,17 @@ class error_already_set : public std::runtime_error {
     /// error variables (but the `.what()` string is still available).
     void restore() { PyErr_Restore(m_type.release().ptr(), m_value.release().ptr(), m_trace.release().ptr()); }
-    /// If it is impossible to raise the currently-held error, such as in destructor, we can write
-    /// it out using Python's unraisable hook (sys.unraisablehook). The error context should be
-    /// some object whose repr() helps identify the location of the error. Python already knows the
-    /// type and value of the error, so there is no need to repeat that. For example, __func__ could
-    /// be helpful. After this call, the current object no longer stores the error variables,
-    /// and neither does Python.
+    /// If it is impossible to raise the currently-held error, such as in a destructor, we can write
+    /// it out using Python's unraisable hook (`sys.unraisablehook`). The error context should be
+    /// some object whose `repr()` helps identify the location of the error. Python already knows the
+    /// type and value of the error, so there is no need to repeat that. After this call, the current
+    /// object no longer stores the error variables, and neither does Python.
     void discard_as_unraisable(object err_context) {
+    /// An alternate version of `discard_as_unraisable()`, where a string provides information on the
+    /// location of the error. For example, `__func__` could be helpful.
     void discard_as_unraisable(const char *err_context) {
@@ -360,7 +379,9 @@ class error_already_set : public std::runtime_error {
     /// Check if the currently trapped error type matches the given Python exception class (or a
     /// subclass thereof).  May also be passed a tuple to search for any exception class matches in
     /// the given tuple.
-    bool matches(handle exc) const { return PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr()); }
+    bool matches(handle exc) const {
+        return (PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr()) != 0);
+    }
     const object& type() const { return m_type; }
     const object& value() const { return m_value; }
@@ -369,8 +390,52 @@ class error_already_set : public std::runtime_error {
     object m_type, m_value, m_trace;
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#if PY_VERSION_HEX >= 0x03030000
+/// Replaces the current Python error indicator with the chosen error, performing a
+/// 'raise from' to indicate that the chosen error was caused by the original error.
+inline void raise_from(PyObject *type, const char *message) {
+    // Based on _PyErr_FormatVFromCause:
+    // https://github.com/python/cpython/blob/467ab194fc6189d9f7310c89937c51abeac56839/Python/errors.c#L405
+    // See https://github.com/pybind/pybind11/pull/2112 for details.
+    PyObject *exc = nullptr, *val = nullptr, *val2 = nullptr, *tb = nullptr;
+    assert(PyErr_Occurred());
+    PyErr_Fetch(&exc, &val, &tb);
+    PyErr_NormalizeException(&exc, &val, &tb);
+    if (tb != nullptr) {
+        PyException_SetTraceback(val, tb);
+        Py_DECREF(tb);
+    }
+    Py_DECREF(exc);
+    assert(!PyErr_Occurred());
+    PyErr_SetString(type, message);
+    PyErr_Fetch(&exc, &val2, &tb);
+    PyErr_NormalizeException(&exc, &val2, &tb);
+    Py_INCREF(val);
+    PyException_SetCause(val2, val);
+    PyException_SetContext(val2, val);
+    PyErr_Restore(exc, val2, tb);
+/// Sets the current Python error indicator with the chosen error, performing a 'raise from'
+/// from the error contained in error_already_set to indicate that the chosen error was
+/// caused by the original error. After this function is called error_already_set will
+/// no longer contain an error.
+inline void raise_from(error_already_set& err, PyObject *type, const char *message) {
+    err.restore();
+    raise_from(type, message);
-/** \defgroup python_builtins _
+/** \defgroup python_builtins const_name
     Unless stated otherwise, the following C++ functions behave the same
     as their Python counterparts.
@@ -431,19 +496,17 @@ inline object getattr(handle obj, const char *name) {
 inline object getattr(handle obj, handle name, handle default_) {
     if (PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr())) {
         return reinterpret_steal<object>(result);
-    } else {
-        PyErr_Clear();
-        return reinterpret_borrow<object>(default_);
+    PyErr_Clear();
+    return reinterpret_borrow<object>(default_);
 inline object getattr(handle obj, const char *name, handle default_) {
     if (PyObject *result = PyObject_GetAttrString(obj.ptr(), name)) {
         return reinterpret_steal<object>(result);
-    } else {
-        PyErr_Clear();
-        return reinterpret_borrow<object>(default_);
+    PyErr_Clear();
+    return reinterpret_borrow<object>(default_);
 inline void setattr(handle obj, handle name, handle value) {
@@ -476,6 +539,43 @@ inline handle get_function(handle value) {
     return value;
+// Reimplementation of python's dict helper functions to ensure that exceptions
+// aren't swallowed (see #2862)
+// copied from cpython _PyDict_GetItemStringWithError
+inline PyObject * dict_getitemstring(PyObject *v, const char *key)
+    PyObject *kv = nullptr, *rv = nullptr;
+    kv = PyUnicode_FromString(key);
+    if (kv == NULL) {
+        throw error_already_set();
+    }
+    rv = PyDict_GetItemWithError(v, kv);
+    Py_DECREF(kv);
+    if (rv == NULL && PyErr_Occurred()) {
+        throw error_already_set();
+    }
+    return rv;
+    return PyDict_GetItemString(v, key);
+inline PyObject * dict_getitem(PyObject *v, PyObject *key)
+    PyObject *rv = PyDict_GetItemWithError(v, key);
+    if (rv == NULL && PyErr_Occurred()) {
+        throw error_already_set();
+    }
+    return rv;
+    return PyDict_GetItem(v, key);
 // Helper aliases/functions to support implicit casting of values given to python accessors/methods.
 // When given a pyobject, this simply returns the pyobject as-is; for other C++ type, the value goes
 // through pybind11::cast(obj) to convert it to an `object`.
@@ -487,6 +587,10 @@ object object_or_cast(T &&o);
 // Match a PyObject*, which we want to convert directly to handle via its converting constructor
 inline handle object_or_cast(PyObject *ptr) { return ptr; }
+#if defined(_MSC_VER) && _MSC_VER < 1920
+#  pragma warning(push)
+#  pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified
 template <typename Policy>
 class accessor : public object_api<accessor<Policy>> {
     using key_type = typename Policy::key_type;
@@ -494,7 +598,7 @@ class accessor : public object_api<accessor<Policy>> {
     accessor(handle obj, key_type key) : obj(obj), key(std::move(key)) { }
     accessor(const accessor &) = default;
-    accessor(accessor &&) = default;
+    accessor(accessor &&) noexcept = default;
     // accessor overload required to override default assignment operator (templates are not allowed
     // to replace default compiler-generated assignments).
@@ -520,6 +624,7 @@ class accessor : public object_api<accessor<Policy>> {
         return obj.contains(key);
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator object() const { return get_cache(); }
     PyObject *ptr() const { return get_cache().ptr(); }
     template <typename T> T cast() const { return get_cache().template cast<T>(); }
@@ -535,6 +640,9 @@ class accessor : public object_api<accessor<Policy>> {
     key_type key;
     mutable object cache;
+#if defined(_MSC_VER) && _MSC_VER < 1920
+#  pragma warning(pop)
 struct obj_attr {
@@ -566,15 +674,17 @@ struct generic_item {
 struct sequence_item {
     using key_type = size_t;
-    static object get(handle obj, size_t index) {
-        PyObject *result = PySequence_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+    template <typename IdxType, detail::enable_if_t<std::is_integral<IdxType>::value, int> = 0>
+    static object get(handle obj, const IdxType &index) {
+        PyObject *result = PySequence_GetItem(obj.ptr(), ssize_t_cast(index));
         if (!result) { throw error_already_set(); }
         return reinterpret_steal<object>(result);
-    static void set(handle obj, size_t index, handle val) {
+    template <typename IdxType, detail::enable_if_t<std::is_integral<IdxType>::value, int> = 0>
+    static void set(handle obj, const IdxType &index, handle val) {
         // PySequence_SetItem does not steal a reference to 'val'
-        if (PySequence_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.ptr()) != 0) {
+        if (PySequence_SetItem(obj.ptr(), ssize_t_cast(index), val.ptr()) != 0) {
             throw error_already_set();
@@ -583,15 +693,17 @@ struct sequence_item {
 struct list_item {
     using key_type = size_t;
-    static object get(handle obj, size_t index) {
-        PyObject *result = PyList_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+    template <typename IdxType, detail::enable_if_t<std::is_integral<IdxType>::value, int> = 0>
+    static object get(handle obj, const IdxType &index) {
+        PyObject *result = PyList_GetItem(obj.ptr(), ssize_t_cast(index));
         if (!result) { throw error_already_set(); }
         return reinterpret_borrow<object>(result);
-    static void set(handle obj, size_t index, handle val) {
+    template <typename IdxType, detail::enable_if_t<std::is_integral<IdxType>::value, int> = 0>
+    static void set(handle obj, const IdxType &index, handle val) {
         // PyList_SetItem steals a reference to 'val'
-        if (PyList_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.inc_ref().ptr()) != 0) {
+        if (PyList_SetItem(obj.ptr(), ssize_t_cast(index), val.inc_ref().ptr()) != 0) {
             throw error_already_set();
@@ -600,15 +712,17 @@ struct list_item {
 struct tuple_item {
     using key_type = size_t;
-    static object get(handle obj, size_t index) {
-        PyObject *result = PyTuple_GetItem(obj.ptr(), static_cast<ssize_t>(index));
+    template <typename IdxType, detail::enable_if_t<std::is_integral<IdxType>::value, int> = 0>
+    static object get(handle obj, const IdxType &index) {
+        PyObject *result = PyTuple_GetItem(obj.ptr(), ssize_t_cast(index));
         if (!result) { throw error_already_set(); }
         return reinterpret_borrow<object>(result);
-    static void set(handle obj, size_t index, handle val) {
+    template <typename IdxType, detail::enable_if_t<std::is_integral<IdxType>::value, int> = 0>
+    static void set(handle obj, const IdxType &index, handle val) {
         // PyTuple_SetItem steals a reference to 'val'
-        if (PyTuple_SetItem(obj.ptr(), static_cast<ssize_t>(index), val.inc_ref().ptr()) != 0) {
+        if (PyTuple_SetItem(obj.ptr(), ssize_t_cast(index), val.inc_ref().ptr()) != 0) {
             throw error_already_set();
@@ -630,7 +744,9 @@ class generic_iterator : public Policy {
     generic_iterator() = default;
     generic_iterator(handle seq, ssize_t index) : Policy(seq, index) { }
+    // NOLINTNEXTLINE(readability-const-return-type) // PR #3263
     reference operator*() const { return Policy::dereference(); }
+    // NOLINTNEXTLINE(readability-const-return-type) // PR #3263
     reference operator[](difference_type n) const { return *(*this + n); }
     pointer operator->() const { return **this; }
@@ -660,7 +776,8 @@ template <typename T>
 struct arrow_proxy {
     T value;
-    arrow_proxy(T &&value) : value(std::move(value)) { }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    arrow_proxy(T &&value) noexcept : value(std::move(value)) { }
     T *operator->() const { return &value; }
@@ -669,11 +786,12 @@ class sequence_fast_readonly {
     using iterator_category = std::random_access_iterator_tag;
     using value_type = handle;
-    using reference = const handle;
+    using reference = const handle; // PR #3263
     using pointer = arrow_proxy<const handle>;
     sequence_fast_readonly(handle obj, ssize_t n) : ptr(PySequence_Fast_ITEMS(obj.ptr()) + n) { }
+    // NOLINTNEXTLINE(readability-const-return-type) // PR #3263
     reference dereference() const { return *ptr; }
     void increment() { ++ptr; }
     void decrement() { --ptr; }
@@ -712,14 +830,19 @@ class dict_readonly {
     using iterator_category = std::forward_iterator_tag;
     using value_type = std::pair<handle, handle>;
-    using reference = const value_type;
+    using reference = const value_type; // PR #3263
     using pointer = arrow_proxy<const value_type>;
     dict_readonly() = default;
     dict_readonly(handle obj, ssize_t pos) : obj(obj), pos(pos) { increment(); }
+    // NOLINTNEXTLINE(readability-const-return-type) // PR #3263
     reference dereference() const { return {key, value}; }
-    void increment() { if (!PyDict_Next(obj.ptr(), &pos, &key, &value)) { pos = -1; } }
+    void increment() {
+        if (PyDict_Next(obj.ptr(), &pos, &key, &value) == 0) {
+            pos = -1;
+        }
+    }
     bool equal(const dict_readonly &b) const { return pos == b.pos; }
@@ -745,16 +868,20 @@ inline bool PyIterable_Check(PyObject *obj) {
     if (iter) {
         return true;
-    } else {
-        PyErr_Clear();
-        return false;
+    PyErr_Clear();
+    return false;
 inline bool PyNone_Check(PyObject *o) { return o == Py_None; }
 inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; }
 inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); }
+#define PYBIND11_STR_CHECK_FUN detail::PyUnicode_Check_Permissive
+#define PYBIND11_STR_CHECK_FUN PyUnicode_Check
 inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; }
@@ -797,26 +924,42 @@ PYBIND11_NAMESPACE_END(detail)
         Name(handle h, borrowed_t) : Parent(h, borrowed_t{}) { } \
         Name(handle h, stolen_t) : Parent(h, stolen_t{}) { } \
         PYBIND11_DEPRECATED("Use py::isinstance<py::python_type>(obj) instead") \
-        bool check() const { return m_ptr != nullptr && (bool) CheckFun(m_ptr); } \
+        bool check() const { return m_ptr != nullptr && (CheckFun(m_ptr) != 0); } \
         static bool check_(handle h) { return h.ptr() != nullptr && CheckFun(h.ptr()); } \
         template <typename Policy_> \
+        /* NOLINTNEXTLINE(google-explicit-constructor) */ \
         Name(const ::pybind11::detail::accessor<Policy_> &a) : Name(object(a)) { }
 #define PYBIND11_OBJECT_CVT(Name, Parent, CheckFun, ConvertFun) \
     PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
     /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
+    /* NOLINTNEXTLINE(google-explicit-constructor) */ \
     Name(const object &o) \
     : Parent(check_(o) ? o.inc_ref().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
     { if (!m_ptr) throw error_already_set(); } \
+    /* NOLINTNEXTLINE(google-explicit-constructor) */ \
     Name(object &&o) \
     : Parent(check_(o) ? o.release().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
     { if (!m_ptr) throw error_already_set(); }
+#define PYBIND11_OBJECT_CVT_DEFAULT(Name, Parent, CheckFun, ConvertFun) \
+    PYBIND11_OBJECT_CVT(Name, Parent, CheckFun, ConvertFun) \
+    Name() : Parent() { }
+#define PYBIND11_OBJECT_CHECK_FAILED(Name, o_ptr) \
+    ::pybind11::type_error("Object of type '" + \
+                           ::pybind11::detail::get_fully_qualified_tp_name(Py_TYPE(o_ptr)) + \
+                           "' is not an instance of '" #Name "'")
 #define PYBIND11_OBJECT(Name, Parent, CheckFun) \
     PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
     /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
-    Name(const object &o) : Parent(o) { } \
-    Name(object &&o) : Parent(std::move(o)) { }
+    /* NOLINTNEXTLINE(google-explicit-constructor) */ \
+    Name(const object &o) : Parent(o) \
+    { if (m_ptr && !check_(m_ptr)) throw PYBIND11_OBJECT_CHECK_FAILED(Name, m_ptr); } \
+    /* NOLINTNEXTLINE(google-explicit-constructor) */ \
+    Name(object &&o) : Parent(std::move(o)) \
+    { if (m_ptr && !check_(m_ptr)) throw PYBIND11_OBJECT_CHECK_FAILED(Name, m_ptr); }
 #define PYBIND11_OBJECT_DEFAULT(Name, Parent, CheckFun) \
     PYBIND11_OBJECT(Name, Parent, CheckFun) \
@@ -838,7 +981,7 @@ class iterator : public object {
     using iterator_category = std::input_iterator_tag;
     using difference_type = ssize_t;
     using value_type = handle;
-    using reference = const handle;
+    using reference = const handle; // PR #3263
     using pointer = const handle *;
     PYBIND11_OBJECT_DEFAULT(iterator, object, PyIter_Check)
@@ -854,6 +997,7 @@ class iterator : public object {
         return rv;
+    // NOLINTNEXTLINE(readability-const-return-type) // PR #3263
     reference operator*() const {
         if (m_ptr && !value.ptr()) {
             auto& self = const_cast<iterator &>(*this);
@@ -927,21 +1071,38 @@ class bytes;
 class str : public object {
-    PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str)
+    PYBIND11_OBJECT_CVT(str, object, PYBIND11_STR_CHECK_FUN, raw_str)
-    str(const char *c, size_t n)
-        : object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) {
+    template <typename SzType, detail::enable_if_t<std::is_integral<SzType>::value, int> = 0>
+    str(const char *c, const SzType &n)
+        : object(PyUnicode_FromStringAndSize(c, ssize_t_cast(n)), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate string object!");
     // 'explicit' is explicitly omitted from the following constructors to allow implicit conversion to py::str from C++ string-like objects
+    // NOLINTNEXTLINE(google-explicit-constructor)
     str(const char *c = "")
         : object(PyUnicode_FromString(c), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate string object!");
+    // NOLINTNEXTLINE(google-explicit-constructor)
     str(const std::string &s) : str(s.data(), s.size()) { }
+    // enable_if is needed to avoid "ambiguous conversion" errors (see PR #3521).
+    template <typename T, detail::enable_if_t<std::is_same<T, std::string_view>::value, int> = 0>
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    str(T s) : str(s.data(), s.size()) { }
+    // reinterpret_cast here is safe (C++20 guarantees char8_t has the same size/alignment as char)
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    str(std::u8string_view s) : str(reinterpret_cast<const char*>(s.data()), s.size()) { }
+# endif
     explicit str(const bytes &b);
     /** \rst
@@ -950,15 +1111,16 @@ class str : public object {
     \endrst */
     explicit str(handle h) : object(raw_str(h.ptr()), stolen_t{}) { if (!m_ptr) throw error_already_set(); }
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator std::string() const {
         object temp = *this;
         if (PyUnicode_Check(m_ptr)) {
             temp = reinterpret_steal<object>(PyUnicode_AsUTF8String(m_ptr));
             if (!temp)
-                pybind11_fail("Unable to extract string contents! (encoding issue)");
+                throw error_already_set();
-        char *buffer;
-        ssize_t length;
+        char *buffer = nullptr;
+        ssize_t length = 0;
         if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length))
             pybind11_fail("Unable to extract string contents! (invalid type)");
         return std::string(buffer, (size_t) length);
@@ -997,28 +1159,52 @@ class bytes : public object {
     PYBIND11_OBJECT(bytes, object, PYBIND11_BYTES_CHECK)
     // Allow implicit conversion:
+    // NOLINTNEXTLINE(google-explicit-constructor)
     bytes(const char *c = "")
         : object(PYBIND11_BYTES_FROM_STRING(c), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate bytes object!");
-    bytes(const char *c, size_t n)
-        : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, (ssize_t) n), stolen_t{}) {
+    template <typename SzType, detail::enable_if_t<std::is_integral<SzType>::value, int> = 0>
+    bytes(const char *c, const SzType &n)
+        : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, ssize_t_cast(n)), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate bytes object!");
     // Allow implicit conversion:
+    // NOLINTNEXTLINE(google-explicit-constructor)
     bytes(const std::string &s) : bytes(s.data(), s.size()) { }
     explicit bytes(const pybind11::str &s);
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator std::string() const {
-        char *buffer;
-        ssize_t length;
+        char *buffer = nullptr;
+        ssize_t length = 0;
         if (PYBIND11_BYTES_AS_STRING_AND_SIZE(m_ptr, &buffer, &length))
             pybind11_fail("Unable to extract bytes contents!");
         return std::string(buffer, (size_t) length);
+    // enable_if is needed to avoid "ambiguous conversion" errors (see PR #3521).
+    template <typename T, detail::enable_if_t<std::is_same<T, std::string_view>::value, int> = 0>
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    bytes(T s) : bytes(s.data(), s.size()) { }
+    // Obtain a string view that views the current `bytes` buffer value.  Note that this is only
+    // valid so long as the `bytes` instance remains alive and so generally should not outlive the
+    // lifetime of the `bytes` instance.
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator std::string_view() const {
+        char *buffer = nullptr;
+        ssize_t length = 0;
+        if (PYBIND11_BYTES_AS_STRING_AND_SIZE(m_ptr, &buffer, &length))
+            pybind11_fail("Unable to extract bytes contents!");
+        return {buffer, static_cast<size_t>(length)};
+    }
 // Note: breathe >= 4.17.0 will fail to build docs if the below two constructors
 // are included in the doxygen group; close here and reopen after as a workaround
@@ -1031,8 +1217,8 @@ inline bytes::bytes(const pybind11::str &s) {
         if (!temp)
             pybind11_fail("Unable to extract string contents! (encoding issue)");
-    char *buffer;
-    ssize_t length;
+    char *buffer = nullptr;
+    ssize_t length = 0;
     if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length))
         pybind11_fail("Unable to extract string contents! (invalid type)");
     auto obj = reinterpret_steal<object>(PYBIND11_BYTES_FROM_STRING_AND_SIZE(buffer, length));
@@ -1042,16 +1228,45 @@ inline bytes::bytes(const pybind11::str &s) {
 inline str::str(const bytes& b) {
-    char *buffer;
-    ssize_t length;
+    char *buffer = nullptr;
+    ssize_t length = 0;
     if (PYBIND11_BYTES_AS_STRING_AND_SIZE(b.ptr(), &buffer, &length))
         pybind11_fail("Unable to extract bytes contents!");
-    auto obj = reinterpret_steal<object>(PyUnicode_FromStringAndSize(buffer, (ssize_t) length));
+    auto obj = reinterpret_steal<object>(PyUnicode_FromStringAndSize(buffer, length));
     if (!obj)
         pybind11_fail("Could not allocate string object!");
     m_ptr = obj.release().ptr();
+/// \addtogroup pytypes
+/// @{
+class bytearray : public object {
+    PYBIND11_OBJECT_CVT(bytearray, object, PyByteArray_Check, PyByteArray_FromObject)
+    template <typename SzType, detail::enable_if_t<std::is_integral<SzType>::value, int> = 0>
+    bytearray(const char *c, const SzType &n)
+        : object(PyByteArray_FromStringAndSize(c, ssize_t_cast(n)), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate bytearray object!");
+    }
+    bytearray()
+        : bytearray("", 0) {}
+    explicit bytearray(const std::string &s) : bytearray(s.data(), s.size()) { }
+    size_t size() const { return static_cast<size_t>(PyByteArray_Size(m_ptr)); }
+    explicit operator std::string() const {
+        char *buffer = PyByteArray_AS_STRING(m_ptr);
+        ssize_t size = PyByteArray_GET_SIZE(m_ptr);
+        return std::string(buffer, static_cast<size_t>(size));
+    }
+// Note: breathe >= 4.17.0 will fail to build docs if the below two constructors
+// are included in the doxygen group; close here and reopen after as a workaround
+/// @} pytypes
 /// \addtogroup pytypes
 /// @{
 class none : public object {
@@ -1071,15 +1286,17 @@ class bool_ : public object {
     PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool)
     bool_() : object(Py_False, borrowed_t{}) { }
     // Allow implicit conversion from and to `bool`:
+    // NOLINTNEXTLINE(google-explicit-constructor)
     bool_(bool value) : object(value ? Py_True : Py_False, borrowed_t{}) { }
-    operator bool() const { return m_ptr && PyLong_AsLong(m_ptr) != 0; }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator bool() const { return (m_ptr != nullptr) && PyLong_AsLong(m_ptr) != 0; }
     /// Return the truth value of an object -- always returns a new reference
     static PyObject *raw_bool(PyObject *op) {
         const auto value = PyObject_IsTrue(op);
         if (value == -1) return nullptr;
-        return handle(value ? Py_True : Py_False).inc_ref().ptr();
+        return handle(value != 0 ? Py_True : Py_False).inc_ref().ptr();
@@ -1090,18 +1307,16 @@ PYBIND11_NAMESPACE_BEGIN(detail)
 // unsigned type: (A)-1 != (B)-1 when A and B are unsigned types of different sizes).
 template <typename Unsigned>
 Unsigned as_unsigned(PyObject *o) {
-    if (sizeof(Unsigned) <= sizeof(unsigned long)
+    if (PYBIND11_SILENCE_MSVC_C4127(sizeof(Unsigned) <= sizeof(unsigned long))
 #if PY_VERSION_HEX < 0x03000000
-            || PyInt_Check(o)
+        || PyInt_Check(o)
     ) {
         unsigned long v = PyLong_AsUnsignedLong(o);
         return v == (unsigned long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
-    else {
-        unsigned long long v = PyLong_AsUnsignedLongLong(o);
-        return v == (unsigned long long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
-    }
+    unsigned long long v = PyLong_AsUnsignedLongLong(o);
+    return v == (unsigned long long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
@@ -1112,8 +1327,9 @@ class int_ : public object {
     // Allow implicit conversion from C++ integral types:
     template <typename T,
               detail::enable_if_t<std::is_integral<T>::value, int> = 0>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     int_(T value) {
-        if (sizeof(T) <= sizeof(long)) {
+        if (PYBIND11_SILENCE_MSVC_C4127(sizeof(T) <= sizeof(long))) {
             if (std::is_signed<T>::value)
                 m_ptr = PyLong_FromLong((long) value);
@@ -1129,6 +1345,7 @@ class int_ : public object {
     template <typename T,
               detail::enable_if_t<std::is_integral<T>::value, int> = 0>
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator T() const {
         return std::is_unsigned<T>::value
             ? detail::as_unsigned<T>(m_ptr)
@@ -1142,33 +1359,51 @@ class float_ : public object {
     PYBIND11_OBJECT_CVT(float_, object, PyFloat_Check, PyNumber_Float)
     // Allow implicit conversion from float/double:
+    // NOLINTNEXTLINE(google-explicit-constructor)
     float_(float value) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate float object!");
+    // NOLINTNEXTLINE(google-explicit-constructor)
     float_(double value = .0) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate float object!");
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator float() const { return (float) PyFloat_AsDouble(m_ptr); }
+    // NOLINTNEXTLINE(google-explicit-constructor)
     operator double() const { return (double) PyFloat_AsDouble(m_ptr); }
 class weakref : public object {
-    PYBIND11_OBJECT_DEFAULT(weakref, object, PyWeakref_Check)
+    PYBIND11_OBJECT_CVT_DEFAULT(weakref, object, PyWeakref_Check, raw_weakref)
     explicit weakref(handle obj, handle callback = {})
         : object(PyWeakref_NewRef(obj.ptr(), callback.ptr()), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate weak reference!");
+    static PyObject *raw_weakref(PyObject *o) {
+        return PyWeakref_NewRef(o, nullptr);
+    }
 class slice : public object {
     PYBIND11_OBJECT_DEFAULT(slice, object, PySlice_Check)
-    slice(ssize_t start_, ssize_t stop_, ssize_t step_) {
-        int_ start(start_), stop(stop_), step(step_);
+    slice(handle start, handle stop, handle step) {
         m_ptr = PySlice_New(start.ptr(), stop.ptr(), step.ptr());
-        if (!m_ptr) pybind11_fail("Could not allocate slice object!");
+        if (!m_ptr)
+            pybind11_fail("Could not allocate slice object!");
+    slice(std::optional<ssize_t> start, std::optional<ssize_t> stop, std::optional<ssize_t> step)
+        : slice(index_to_object(start), index_to_object(stop), index_to_object(step)) {}
+    slice(ssize_t start_, ssize_t stop_, ssize_t step_)
+        : slice(int_(start_), int_(stop_), int_(step_)) {}
     bool compute(size_t length, size_t *start, size_t *stop, size_t *step,
                  size_t *slicelength) const {
         return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr,
@@ -1183,6 +1418,12 @@ class slice : public object {
           stop, step,
           slicelength) == 0;
+    template <typename T>
+    static object index_to_object(T index) {
+        return index ? object(int_(*index)) : object(none());
+    }
 class capsule : public object {
@@ -1218,7 +1459,7 @@ class capsule : public object {
             pybind11_fail("Could not set capsule context!");
-    capsule(void (*destructor)()) {
+    explicit capsule(void (*destructor)()) {
         m_ptr = PyCapsule_New(reinterpret_cast<void *>(destructor), nullptr, [](PyObject *o) {
             auto destructor = reinterpret_cast<void (*)()>(PyCapsule_GetPointer(o, nullptr));
@@ -1228,20 +1469,41 @@ class capsule : public object {
             pybind11_fail("Could not allocate capsule object!");
+    // NOLINTNEXTLINE(google-explicit-constructor)
     template <typename T> operator T *() const {
+        return get_pointer<T>();
+    }
+    /// Get the pointer the capsule holds.
+    template<typename T = void>
+    T* get_pointer() const {
         auto name = this->name();
-        T * result = static_cast<T *>(PyCapsule_GetPointer(m_ptr, name));
-        if (!result) pybind11_fail("Unable to extract capsule contents!");
+        T *result = static_cast<T *>(PyCapsule_GetPointer(m_ptr, name));
+        if (!result) {
+            PyErr_Clear();
+            pybind11_fail("Unable to extract capsule contents!");
+        }
         return result;
+    /// Replaces a capsule's pointer *without* calling the destructor on the existing one.
+    void set_pointer(const void *value) {
+        if (PyCapsule_SetPointer(m_ptr, const_cast<void *>(value)) != 0) {
+            PyErr_Clear();
+            pybind11_fail("Could not set capsule pointer");
+        }
+    }
     const char *name() const { return PyCapsule_GetName(m_ptr); }
 class tuple : public object {
     PYBIND11_OBJECT_CVT(tuple, object, PyTuple_Check, PySequence_Tuple)
-    explicit tuple(size_t size = 0) : object(PyTuple_New((ssize_t) size), stolen_t{}) {
+    template <typename SzType = ssize_t,
+              detail::enable_if_t<std::is_integral<SzType>::value, int> = 0>
+    // Some compilers generate link errors when using `const SzType &` here:
+    explicit tuple(SzType size = 0) : object(PyTuple_New(ssize_t_cast(size)), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate tuple object!");
     size_t size() const { return (size_t) PyTuple_Size(m_ptr); }
@@ -1252,6 +1514,15 @@ class tuple : public object {
     detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; }
+// We need to put this into a separate function because the Intel compiler
+// fails to compile enable_if_t<all_of<is_keyword_or_ds<Args>...>::value> part below
+// (tested with ICC 2021.1 Beta 20200827).
+template <typename... Args>
+constexpr bool args_are_all_keyword_or_ds()
+  return detail::all_of<detail::is_keyword_or_ds<Args>...>::value;
 class dict : public object {
     PYBIND11_OBJECT_CVT(dict, object, PyDict_Check, raw_dict)
@@ -1259,7 +1530,7 @@ class dict : public object {
         if (!m_ptr) pybind11_fail("Could not allocate dict object!");
     template <typename... Args,
-              typename = detail::enable_if_t<detail::all_of<detail::is_keyword_or_ds<Args>...>::value>,
+              typename = detail::enable_if_t<args_are_all_keyword_or_ds<Args...>()>,
               // MSVC workaround: it can't compile an out-of-line definition, so defer the collector
               typename collector = detail::deferred_t<detail::unpacking_collector<>, Args...>>
     explicit dict(Args &&...args) : dict(collector(std::forward<Args>(args)...).kwargs()) { }
@@ -1268,7 +1539,7 @@ class dict : public object {
     bool empty() const { return size() == 0; }
     detail::dict_iterator begin() const { return {*this, 0}; }
     detail::dict_iterator end() const { return {}; }
-    void clear() const { PyDict_Clear(ptr()); }
+    void clear() /* py-non-const */ { PyDict_Clear(ptr()); }
     template <typename T> bool contains(T &&key) const {
         return PyDict_Contains(m_ptr, detail::object_or_cast(std::forward<T>(key)).ptr()) == 1;
@@ -1301,7 +1572,10 @@ class sequence : public object {
 class list : public object {
     PYBIND11_OBJECT_CVT(list, object, PyList_Check, PySequence_List)
-    explicit list(size_t size = 0) : object(PyList_New((ssize_t) size), stolen_t{}) {
+    template <typename SzType = ssize_t,
+              detail::enable_if_t<std::is_integral<SzType>::value, int> = 0>
+    // Some compilers generate link errors when using `const SzType &` here:
+    explicit list(SzType size = 0) : object(PyList_New(ssize_t_cast(size)), stolen_t{}) {
         if (!m_ptr) pybind11_fail("Could not allocate list object!");
     size_t size() const { return (size_t) PyList_Size(m_ptr); }
@@ -1310,12 +1584,15 @@ class list : public object {
     detail::item_accessor operator[](handle h) const { return object::operator[](h); }
     detail::list_iterator begin() const { return {*this, 0}; }
     detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; }
-    template <typename T> void append(T &&val) const {
+    template <typename T> void append(T &&val) /* py-non-const */ {
         PyList_Append(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr());
-    template <typename T> void insert(size_t index, T &&val) const {
-        PyList_Insert(m_ptr, static_cast<ssize_t>(index),
-            detail::object_or_cast(std::forward<T>(val)).ptr());
+    template <typename IdxType,
+              typename ValType,
+              detail::enable_if_t<std::is_integral<IdxType>::value, int> = 0>
+    void insert(const IdxType &index, ValType &&val) /* py-non-const */ {
+        PyList_Insert(
+            m_ptr, ssize_t_cast(index), detail::object_or_cast(std::forward<ValType>(val)).ptr());
@@ -1330,10 +1607,10 @@ class set : public object {
     size_t size() const { return (size_t) PySet_Size(m_ptr); }
     bool empty() const { return size() == 0; }
-    template <typename T> bool add(T &&val) const {
+    template <typename T> bool add(T &&val) /* py-non-const */ {
         return PySet_Add(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr()) == 0;
-    void clear() const { PySet_Clear(m_ptr); }
+    void clear() /* py-non-const */ { PySet_Clear(m_ptr); }
     template <typename T> bool contains(T &&val) const {
         return PySet_Contains(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr()) == 1;
@@ -1429,7 +1706,7 @@ class memoryview : public object {
         detail::any_container<ssize_t> shape,
         detail::any_container<ssize_t> strides) {
         return memoryview::from_buffer(
-            const_cast<void*>(ptr), itemsize, format, shape, strides, true);
+            const_cast<void *>(ptr), itemsize, format, std::move(shape), std::move(strides), true);
     template<typename T>
@@ -1475,10 +1752,17 @@ class memoryview : public object {
     static memoryview from_memory(const void *mem, ssize_t size) {
         return memoryview::from_memory(const_cast<void*>(mem), size, true);
+    static memoryview from_memory(std::string_view mem) {
+        return from_memory(const_cast<char*>(mem.data()), static_cast<ssize_t>(mem.size()), true);
+    }
+/// @cond DUPLICATE
 inline memoryview memoryview::from_buffer(
     void *ptr, ssize_t itemsize, const char* format,
     detail::any_container<ssize_t> shape,
@@ -1486,7 +1770,7 @@ inline memoryview memoryview::from_buffer(
     size_t ndim = shape->size();
     if (ndim != strides->size())
         pybind11_fail("memoryview: shape length doesn't match strides length");
-    ssize_t size = ndim ? 1 : 0;
+    ssize_t size = ndim != 0u ? 1 : 0;
     for (size_t i = 0; i < ndim; ++i)
         size *= (*shape)[i];
     Py_buffer view;
@@ -1506,18 +1790,22 @@ inline memoryview memoryview::from_buffer(
         throw error_already_set();
     return memoryview(object(obj, stolen_t{}));
+/// @endcond
 /// @} pytypes
 /// \addtogroup python_builtins
 /// @{
+/// Get the length of a Python object.
 inline size_t len(handle h) {
     ssize_t result = PyObject_Length(h.ptr());
     if (result < 0)
-        pybind11_fail("Unable to compute length of object");
+        throw error_already_set();
     return (size_t) result;
+/// Get the length hint of a Python object.
+/// Returns 0 when this cannot be determined.
 inline size_t len_hint(handle h) {
 #if PY_VERSION_HEX >= 0x03040000
     ssize_t result = PyObject_LengthHint(h.ptr(), 0);
@@ -1580,8 +1868,7 @@ template <typename D>
 str_attr_accessor object_api<D>::doc() const { return attr("__doc__"); }
 template <typename D>
-PYBIND11_DEPRECATED("Use py::type::of(h) instead of h.get_type()")
-handle object_api<D>::get_type() const { return type::handle_of(*this); }
+handle object_api<D>::get_type() const { return type::handle_of(derived()); }
 template <typename D>
 bool object_api<D>::rich_compare(object_api const &other, int value) const {
diff --git a/wrap/pybind11/include/pybind11/stl.h b/wrap/pybind11/include/pybind11/stl.h
index 721bb669f0..4303494827 100644
--- a/wrap/pybind11/include/pybind11/stl.h
+++ b/wrap/pybind11/include/pybind11/stl.h
@@ -9,6 +9,7 @@
 #pragma once
+#include "detail/common.h"
 #include "pybind11.h"
 #include <set>
 #include <unordered_set>
@@ -19,33 +20,15 @@
 #include <deque>
 #include <valarray>
-#if defined(_MSC_VER)
-#pragma warning(push)
-#pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+// See `detail/common.h` for implementation of these guards.
+#if defined(PYBIND11_HAS_OPTIONAL)
+#  include <optional>
+#elif defined(PYBIND11_HAS_EXP_OPTIONAL)
+#  include <experimental/optional>
-#ifdef __has_include
-// std::optional (but including it in c++14 mode isn't allowed)
-#  if defined(PYBIND11_CPP17) && __has_include(<optional>)
-#    include <optional>
-#    define PYBIND11_HAS_OPTIONAL 1
-#  endif
-// std::experimental::optional (but not allowed in c++11 mode)
-#  if defined(PYBIND11_CPP14) && (__has_include(<experimental/optional>) && \
-                                 !__has_include(<optional>))
-#    include <experimental/optional>
-#    define PYBIND11_HAS_EXP_OPTIONAL 1
-#  endif
-// std::variant
-#  if defined(PYBIND11_CPP17) && __has_include(<variant>)
-#    include <variant>
-#    define PYBIND11_HAS_VARIANT 1
-#  endif
-#elif defined(_MSC_VER) && defined(PYBIND11_CPP17)
-#  include <optional>
+#if defined(PYBIND11_HAS_VARIANT)
 #  include <variant>
-#  define PYBIND11_HAS_OPTIONAL 1
-#  define PYBIND11_HAS_VARIANT 1
@@ -95,7 +78,7 @@ template <typename Type, typename Key> struct set_caster {
         return s.release();
-    PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]"));
+    PYBIND11_TYPE_CASTER(type, const_name("Set[") + key_conv::name + const_name("]"));
 template <typename Type, typename Key, typename Value> struct map_caster {
@@ -137,14 +120,14 @@ template <typename Type, typename Key, typename Value> struct map_caster {
         return d.release();
-    PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name + _(", ") + value_conv::name + _("]"));
+    PYBIND11_TYPE_CASTER(Type, const_name("Dict[") + key_conv::name + const_name(", ") + value_conv::name + const_name("]"));
 template <typename Type, typename Value> struct list_caster {
     using value_conv = make_caster<Value>;
     bool load(handle src, bool convert) {
-        if (!isinstance<sequence>(src) || isinstance<str>(src))
+        if (!isinstance<sequence>(src) || isinstance<bytes>(src) || isinstance<str>(src))
             return false;
         auto s = reinterpret_borrow<sequence>(src);
@@ -159,10 +142,13 @@ template <typename Type, typename Value> struct list_caster {
-    template <typename T = Type,
-              enable_if_t<std::is_same<decltype(std::declval<T>().reserve(0)), void>::value, int> = 0>
-    void reserve_maybe(sequence s, Type *) { value.reserve(s.size()); }
-    void reserve_maybe(sequence, void *) { }
+    template <
+        typename T                                                                          = Type,
+        enable_if_t<std::is_same<decltype(std::declval<T>().reserve(0)), void>::value, int> = 0>
+    void reserve_maybe(const sequence &s, Type *) {
+        value.reserve(s.size());
+    }
+    void reserve_maybe(const sequence &, void *) {}
     template <typename T>
@@ -170,17 +156,17 @@ template <typename Type, typename Value> struct list_caster {
         if (!std::is_lvalue_reference<T>::value)
             policy = return_value_policy_override<Value>::policy(policy);
         list l(src.size());
-        size_t index = 0;
+        ssize_t index = 0;
         for (auto &&value : src) {
             auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
             if (!value_)
                 return handle();
-            PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
+            PyList_SET_ITEM(l.ptr(), index++, value_.release().ptr()); // steals a reference
         return l.release();
-    PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]"));
+    PYBIND11_TYPE_CASTER(Type, const_name("List[") + value_conv::name + const_name("]"));
 template <typename Type, typename Alloc> struct type_caster<std::vector<Type, Alloc>>
@@ -227,17 +213,17 @@ template <typename ArrayType, typename Value, bool Resizable, size_t Size = 0> s
     template <typename T>
     static handle cast(T &&src, return_value_policy policy, handle parent) {
         list l(src.size());
-        size_t index = 0;
+        ssize_t index = 0;
         for (auto &&value : src) {
             auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
             if (!value_)
                 return handle();
-            PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
+            PyList_SET_ITEM(l.ptr(), index++, value_.release().ptr()); // steals a reference
         return l.release();
-    PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name + _<Resizable>(_(""), _("[") + _<Size>() + _("]")) + _("]"));
+    PYBIND11_TYPE_CASTER(ArrayType, const_name("List[") + value_conv::name + const_name<Resizable>(const_name(""), const_name("[") + const_name<Size>() + const_name("]")) + const_name("]"));
 template <typename Type, size_t Size> struct type_caster<std::array<Type, Size>>
@@ -259,34 +245,35 @@ template <typename Key, typename Value, typename Hash, typename Equal, typename
   : map_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>, Key, Value> { };
 // This type caster is intended to be used for std::optional and std::experimental::optional
-template<typename T> struct optional_caster {
-    using value_conv = make_caster<typename T::value_type>;
+template<typename Type, typename Value = typename Type::value_type> struct optional_caster {
+    using value_conv = make_caster<Value>;
-    template <typename T_>
-    static handle cast(T_ &&src, return_value_policy policy, handle parent) {
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
         if (!src)
             return none().inc_ref();
         if (!std::is_lvalue_reference<T>::value) {
-            policy = return_value_policy_override<T>::policy(policy);
+            policy = return_value_policy_override<Value>::policy(policy);
-        return value_conv::cast(*std::forward<T_>(src), policy, parent);
+        return value_conv::cast(*std::forward<T>(src), policy, parent);
     bool load(handle src, bool convert) {
         if (!src) {
             return false;
-        } else if (src.is_none()) {
+        }
+        if (src.is_none()) {
             return true;  // default-constructed value is already empty
         value_conv inner_caster;
         if (!inner_caster.load(src, convert))
             return false;
-        value.emplace(cast_op<typename T::value_type &&>(std::move(inner_caster)));
+        value.emplace(cast_op<Value &&>(std::move(inner_caster)));
         return true;
-    PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]"));
+    PYBIND11_TYPE_CASTER(Type, const_name("Optional[") + value_conv::name + const_name("]"));
 #if defined(PYBIND11_HAS_OPTIONAL)
@@ -366,7 +353,7 @@ struct variant_caster<V<Ts...>> {
     using Type = V<Ts...>;
-    PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster<Ts>::name...) + _("]"));
+    PYBIND11_TYPE_CASTER(Type, const_name("Union[") + detail::concat(make_caster<Ts>::name...) + const_name("]"));
 #if defined(PYBIND11_HAS_VARIANT)
@@ -377,12 +364,12 @@ struct type_caster<std::variant<Ts...>> : variant_caster<std::variant<Ts...>> {
 inline std::ostream &operator<<(std::ostream &os, const handle &obj) {
+    os << str(obj).cast<std::string_view>();
     os << (std::string) str(obj);
     return os;
-#if defined(_MSC_VER)
-#pragma warning(pop)
diff --git a/wrap/pybind11/include/pybind11/stl/filesystem.h b/wrap/pybind11/include/pybind11/stl/filesystem.h
new file mode 100644
index 0000000000..a9a6c8512c
--- /dev/null
+++ b/wrap/pybind11/include/pybind11/stl/filesystem.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2021 The Pybind Development Team.
+// All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+#pragma once
+#include "../cast.h"
+#include "../pybind11.h"
+#include "../pytypes.h"
+#include "../detail/common.h"
+#include "../detail/descr.h"
+#include <string>
+#ifdef __has_include
+#  if defined(PYBIND11_CPP17) && __has_include(<filesystem>) && \
+      PY_VERSION_HEX >= 0x03060000
+#    include <filesystem>
+#    define PYBIND11_HAS_FILESYSTEM 1
+#  endif
+#    error                                                                                        \
+        "#include <filesystem> is not available. (Use -DPYBIND11_HAS_FILESYSTEM_IS_OPTIONAL to ignore.)"
+template<typename T> struct path_caster {
+    static PyObject* unicode_from_fs_native(const std::string& w) {
+#if !defined(PYPY_VERSION)
+        return PyUnicode_DecodeFSDefaultAndSize(w.c_str(), ssize_t(w.size()));
+        // PyPy mistakenly declares the first parameter as non-const.
+        return PyUnicode_DecodeFSDefaultAndSize(
+            const_cast<char*>(w.c_str()), ssize_t(w.size()));
+    }
+    static PyObject* unicode_from_fs_native(const std::wstring& w) {
+        return PyUnicode_FromWideChar(w.c_str(), ssize_t(w.size()));
+    }
+    static handle cast(const T& path, return_value_policy, handle) {
+        if (auto py_str = unicode_from_fs_native(path.native())) {
+            return module_::import("pathlib").attr("Path")(reinterpret_steal<object>(py_str))
+                   .release();
+        }
+        return nullptr;
+    }
+    bool load(handle handle, bool) {
+        // PyUnicode_FSConverter and PyUnicode_FSDecoder normally take care of
+        // calling PyOS_FSPath themselves, but that's broken on PyPy (PyPy
+        // issue #3168) so we do it ourselves instead.
+        PyObject* buf = PyOS_FSPath(handle.ptr());
+        if (!buf) {
+            PyErr_Clear();
+            return false;
+        }
+        PyObject* native = nullptr;
+        if constexpr (std::is_same_v<typename T::value_type, char>) {
+            if (PyUnicode_FSConverter(buf, &native) != 0) {
+                if (auto c_str = PyBytes_AsString(native)) {
+                    // AsString returns a pointer to the internal buffer, which
+                    // must not be free'd.
+                    value = c_str;
+                }
+            }
+        } else if constexpr (std::is_same_v<typename T::value_type, wchar_t>) {
+            if (PyUnicode_FSDecoder(buf, &native) != 0) {
+                if (auto c_str = PyUnicode_AsWideCharString(native, nullptr)) {
+                    // AsWideCharString returns a new string that must be free'd.
+                    value = c_str;  // Copies the string.
+                    PyMem_Free(c_str);
+                }
+            }
+        }
+        Py_XDECREF(native);
+        Py_DECREF(buf);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+            return false;
+        }
+        return true;
+    }
+    PYBIND11_TYPE_CASTER(T, const_name("os.PathLike"));
+template<> struct type_caster<std::filesystem::path>
+    : public path_caster<std::filesystem::path> {};
diff --git a/wrap/pybind11/include/pybind11/stl_bind.h b/wrap/pybind11/include/pybind11/stl_bind.h
index 9d8ed0c825..050be83cc5 100644
--- a/wrap/pybind11/include/pybind11/stl_bind.h
+++ b/wrap/pybind11/include/pybind11/stl_bind.h
@@ -128,11 +128,11 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
            "Add an item to the end of the list");
-    cl.def(init([](iterable it) {
+    cl.def(init([](const iterable &it) {
         auto v = std::unique_ptr<Vector>(new Vector());
         for (handle h : it)
-           v->push_back(h.cast<T>());
+            v->push_back(h.cast<T>());
         return v.release();
@@ -151,27 +151,28 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
        "Extend the list by appending all the items in the given list"
-    cl.def("extend",
-       [](Vector &v, iterable it) {
-           const size_t old_size = v.size();
-           v.reserve(old_size + len_hint(it));
-           try {
-               for (handle h : it) {
-                   v.push_back(h.cast<T>());
-               }
-           } catch (const cast_error &) {
-               v.erase(v.begin() + static_cast<typename Vector::difference_type>(old_size), v.end());
-               try {
-                   v.shrink_to_fit();
-               } catch (const std::exception &) {
-                   // Do nothing
-               }
-               throw;
-           }
-       },
-       arg("L"),
-       "Extend the list by appending all the items in the given list"
-    );
+    cl.def(
+        "extend",
+        [](Vector &v, const iterable &it) {
+            const size_t old_size = v.size();
+            v.reserve(old_size + len_hint(it));
+            try {
+                for (handle h : it) {
+                    v.push_back(h.cast<T>());
+                }
+            } catch (const cast_error &) {
+                v.erase(v.begin() + static_cast<typename Vector::difference_type>(old_size),
+                        v.end());
+                try {
+                    v.shrink_to_fit();
+                } catch (const std::exception &) {
+                    // Do nothing
+                }
+                throw;
+            }
+        },
+        arg("L"),
+        "Extend the list by appending all the items in the given list");
         [](Vector &v, DiffType i, const T &x) {
@@ -190,7 +191,7 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
         [](Vector &v) {
             if (v.empty())
                 throw index_error();
-            T t = v.back();
+            T t = std::move(v.back());
             return t;
@@ -200,8 +201,8 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
         [wrap_i](Vector &v, DiffType i) {
             i = wrap_i(i, v.size());
-            T t = v[(SizeType) i];
-            v.erase(v.begin() + i);
+            T t = std::move(v[(SizeType) i]);
+            v.erase(std::next(v.begin(), i));
             return t;
@@ -216,9 +217,10 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
     /// Slicing protocol
-    cl.def("__getitem__",
+    cl.def(
+        "__getitem__",
         [](const Vector &v, slice slice) -> Vector * {
-            size_t start, stop, step, slicelength;
+            size_t start = 0, stop = 0, step = 0, slicelength = 0;
             if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
                 throw error_already_set();
@@ -233,12 +235,12 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
             return seq;
-        "Retrieve list elements using a slice object"
-    );
+        "Retrieve list elements using a slice object");
-    cl.def("__setitem__",
-        [](Vector &v, slice slice,  const Vector &value) {
-            size_t start, stop, step, slicelength;
+    cl.def(
+        "__setitem__",
+        [](Vector &v, slice slice, const Vector &value) {
+            size_t start = 0, stop = 0, step = 0, slicelength = 0;
             if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
                 throw error_already_set();
@@ -250,8 +252,7 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
                 start += step;
-        "Assign list elements using a slice object"
-    );
+        "Assign list elements using a slice object");
         [wrap_i](Vector &v, DiffType i) {
@@ -261,9 +262,10 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
         "Delete the list elements at index ``i``"
-    cl.def("__delitem__",
+    cl.def(
+        "__delitem__",
         [](Vector &v, slice slice) {
-            size_t start, stop, step, slicelength;
+            size_t start = 0, stop = 0, step = 0, slicelength = 0;
             if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
                 throw error_already_set();
@@ -277,9 +279,7 @@ void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_t
-        "Delete list elements using a slice object"
-    );
+        "Delete list elements using a slice object");
 // If the type has an operator[] that doesn't return a reference (most notably std::vector<bool>),
@@ -375,10 +375,20 @@ struct vector_has_data_and_format : std::false_type {};
 template <typename Vector>
 struct vector_has_data_and_format<Vector, enable_if_t<std::is_same<decltype(format_descriptor<typename Vector::value_type>::format(), std::declval<Vector>().data()), typename Vector::value_type*>::value>> : std::true_type {};
+// [workaround(intel)] Separate function required here
+// Workaround as the Intel compiler does not compile the enable_if_t part below
+// (tested with icc (ICC) 2021.1 Beta 20200827)
+template <typename... Args>
+constexpr bool args_any_are_buffer() {
+    return detail::any_of<std::is_same<Args, buffer_protocol>...>::value;
+// [workaround(intel)] Separate function required here
+// [workaround(msvc)] Can't use constexpr bool in return type
 // Add the buffer interface to a vector
 template <typename Vector, typename Class_, typename... Args>
-enable_if_t<detail::any_of<std::is_same<Args, buffer_protocol>...>::value>
-vector_buffer(Class_& cl) {
+void vector_buffer_impl(Class_& cl, std::true_type) {
     using T = typename Vector::value_type;
     static_assert(vector_has_data_and_format<Vector>::value, "There is not an appropriate format descriptor for this vector");
@@ -390,7 +400,7 @@ vector_buffer(Class_& cl) {
         return buffer_info(v.data(), static_cast<ssize_t>(sizeof(T)), format_descriptor<T>::format(), 1, {v.size()}, {sizeof(T)});
-    cl.def(init([](buffer buf) {
+    cl.def(init([](const buffer &buf) {
         auto info = buf.request();
         if (info.ndim != 1 || info.strides[0] % static_cast<ssize_t>(sizeof(T)))
             throw type_error("Only valid 1D buffers can be copied to a vector");
@@ -403,20 +413,24 @@ vector_buffer(Class_& cl) {
         if (step == 1) {
             return Vector(p, end);
-        else {
-            Vector vec;
-            vec.reserve((size_t) info.shape[0]);
-            for (; p != end; p += step)
-                vec.push_back(*p);
-            return vec;
-        }
+        Vector vec;
+        vec.reserve((size_t) info.shape[0]);
+        for (; p != end; p += step)
+            vec.push_back(*p);
+        return vec;
 template <typename Vector, typename Class_, typename... Args>
-enable_if_t<!detail::any_of<std::is_same<Args, buffer_protocol>...>::value> vector_buffer(Class_&) {}
+void vector_buffer_impl(Class_&, std::false_type) {}
+template <typename Vector, typename Class_, typename... Args>
+void vector_buffer(Class_& cl) {
+    vector_buffer_impl<Vector, Class_, Args...>(cl, detail::any_of<std::is_same<Args, buffer_protocol>...>{});
@@ -581,6 +595,23 @@ template <typename Map, typename Class_> auto map_if_insertion_operator(Class_ &
+template<typename Map>
+struct keys_view
+    Map &map;
+template<typename Map>
+struct values_view
+    Map &map;
+template<typename Map>
+struct items_view
+    Map &map;
@@ -588,6 +619,9 @@ template <typename Map, typename holder_type = std::unique_ptr<Map>, typename...
 class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&... args) {
     using KeyType = typename Map::key_type;
     using MappedType = typename Map::mapped_type;
+    using KeysView = detail::keys_view<Map>;
+    using ValuesView = detail::values_view<Map>;
+    using ItemsView = detail::items_view<Map>;
     using Class_ = class_<Map, holder_type>;
     // If either type is a non-module-local bound type then make the map binding non-local as well;
@@ -601,6 +635,12 @@ class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&.
     Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
+    class_<KeysView> keys_view(
+        scope, ("KeysView[" + name + "]").c_str(), pybind11::module_local(local));
+    class_<ValuesView> values_view(
+        scope, ("ValuesView[" + name + "]").c_str(), pybind11::module_local(local));
+    class_<ItemsView> items_view(
+        scope, ("ItemsView[" + name + "]").c_str(), pybind11::module_local(local));
@@ -614,12 +654,22 @@ class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&.
            [](Map &m) { return make_key_iterator(m.begin(), m.end()); },
-           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+           keep_alive<0, 1>() /* Essential: keep map alive while iterator exists */
+    );
+    cl.def("keys",
+           [](Map &m) { return KeysView{m}; },
+           keep_alive<0, 1>() /* Essential: keep map alive while view exists */
+    );
+    cl.def("values",
+           [](Map &m) { return ValuesView{m}; },
+           keep_alive<0, 1>() /* Essential: keep map alive while view exists */
-           [](Map &m) { return make_iterator(m.begin(), m.end()); },
-           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+           [](Map &m) { return ItemsView{m}; },
+           keep_alive<0, 1>() /* Essential: keep map alive while view exists */
@@ -640,6 +690,8 @@ class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&.
            return true;
+    // Fallback for when the object is not of the key type
+    cl.def("__contains__", [](Map &, const object &) -> bool { return false; });
     // Assignment provided only if the type is copyable
     detail::map_assignment<Map, Class_>(cl);
@@ -655,6 +707,40 @@ class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&.
     cl.def("__len__", &Map::size);
+    keys_view.def("__len__", [](KeysView &view) { return view.map.size(); });
+    keys_view.def("__iter__",
+        [](KeysView &view) {
+            return make_key_iterator(view.map.begin(), view.map.end());
+        },
+        keep_alive<0, 1>() /* Essential: keep view alive while iterator exists */
+    );
+    keys_view.def("__contains__",
+        [](KeysView &view, const KeyType &k) -> bool {
+            auto it = view.map.find(k);
+            if (it == view.map.end())
+                return false;
+            return true;
+        }
+    );
+    // Fallback for when the object is not of the key type
+    keys_view.def("__contains__", [](KeysView &, const object &) -> bool { return false; });
+    values_view.def("__len__", [](ValuesView &view) { return view.map.size(); });
+    values_view.def("__iter__",
+        [](ValuesView &view) {
+            return make_value_iterator(view.map.begin(), view.map.end());
+        },
+        keep_alive<0, 1>() /* Essential: keep view alive while iterator exists */
+    );
+    items_view.def("__len__", [](ItemsView &view) { return view.map.size(); });
+    items_view.def("__iter__",
+        [](ItemsView &view) {
+            return make_iterator(view.map.begin(), view.map.end());
+        },
+        keep_alive<0, 1>() /* Essential: keep view alive while iterator exists */
+    );
     return cl;
diff --git a/wrap/pybind11/noxfile.py b/wrap/pybind11/noxfile.py
new file mode 100644
index 0000000000..4adffac2ec
--- /dev/null
+++ b/wrap/pybind11/noxfile.py
@@ -0,0 +1,93 @@
+import nox
+nox.options.sessions = ["lint", "tests", "tests_packaging"]
+PYTHON_VERISONS = ["2.7", "3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11"]
+def lint(session: nox.Session) -> None:
+    """
+    Lint the codebase (except for clang-format/tidy).
+    """
+    session.install("pre-commit")
+    session.run("pre-commit", "run", "-a")
+def tests(session: nox.Session) -> None:
+    """
+    Run the tests (requires a compiler).
+    """
+    tmpdir = session.create_tmp()
+    session.install("cmake")
+    session.install("-r", "tests/requirements.txt")
+    session.run(
+        "cmake",
+        "-S",
+        ".",
+        "-B",
+        tmpdir,
+        "-DPYBIND11_WERROR=ON",
+        *session.posargs
+    )
+    session.run("cmake", "--build", tmpdir)
+    session.run("cmake", "--build", tmpdir, "--config=Release", "--target", "check")
+def tests_packaging(session: nox.Session) -> None:
+    """
+    Run the packaging tests.
+    """
+    session.install("-r", "tests/requirements.txt", "--prefer-binary")
+    session.run("pytest", "tests/extra_python_package")
+def docs(session: nox.Session) -> None:
+    """
+    Build the docs. Pass "serve" to serve.
+    """
+    session.install("-r", "docs/requirements.txt")
+    session.chdir("docs")
+    if "pdf" in session.posargs:
+        session.run("sphinx-build", "-b", "latexpdf", ".", "_build")
+        return
+    session.run("sphinx-build", "-b", "html", ".", "_build")
+    if "serve" in session.posargs:
+        session.log("Launching docs at http://localhost:8000/ - use Ctrl-C to quit")
+        session.run("python", "-m", "http.server", "8000", "-d", "_build/html")
+    elif session.posargs:
+        session.error("Unsupported argument to docs")
+def make_changelog(session: nox.Session) -> None:
+    """
+    Inspect the closed issues and make entries for a changelog.
+    """
+    session.install("ghapi", "rich")
+    session.run("python", "tools/make_changelog.py")
+def build(session: nox.Session) -> None:
+    """
+    Build SDists and wheels.
+    """
+    session.install("build")
+    session.log("Building normal files")
+    session.run("python", "-m", "build", *session.posargs)
+    session.log("Building pybind11-global files (PYBIND11_GLOBAL_SDIST=1)")
+    session.run(
+        "python", "-m", "build", *session.posargs, env={"PYBIND11_GLOBAL_SDIST": "1"}
+    )
diff --git a/wrap/pybind11/pybind11/__init__.py b/wrap/pybind11/pybind11/__init__.py
index ad65420893..64e999ba06 100644
--- a/wrap/pybind11/pybind11/__init__.py
+++ b/wrap/pybind11/pybind11/__init__.py
@@ -1,8 +1,7 @@
 # -*- coding: utf-8 -*-
-from ._version import version_info, __version__
-from .commands import get_include, get_cmake_dir
+from ._version import __version__, version_info
+from .commands import get_cmake_dir, get_include
 __all__ = (
diff --git a/wrap/pybind11/pybind11/__main__.py b/wrap/pybind11/pybind11/__main__.py
index f4d5437836..3235747bed 100644
--- a/wrap/pybind11/pybind11/__main__.py
+++ b/wrap/pybind11/pybind11/__main__.py
@@ -5,10 +5,11 @@
 import sys
 import sysconfig
-from .commands import get_include, get_cmake_dir
+from .commands import get_cmake_dir, get_include
 def print_includes():
+    # type: () -> None
     dirs = [
@@ -18,13 +19,15 @@ def print_includes():
     # Make unique but preserve order
     unique_dirs = []
     for d in dirs:
-        if d not in unique_dirs:
+        if d and d not in unique_dirs:
     print(" ".join("-I" + d for d in unique_dirs))
 def main():
+    # type: () -> None
     parser = argparse.ArgumentParser()
diff --git a/wrap/pybind11/pybind11/_version.py b/wrap/pybind11/pybind11/_version.py
index ca84c262c9..9d39b77a41 100644
--- a/wrap/pybind11/pybind11/_version.py
+++ b/wrap/pybind11/pybind11/_version.py
@@ -8,5 +8,5 @@ def _to_int(s):
         return s
-__version__ = "2.6.0.dev1"
+__version__ = "2.9.1"
 version_info = tuple(_to_int(s) for s in __version__.split("."))
diff --git a/wrap/pybind11/pybind11/_version.pyi b/wrap/pybind11/pybind11/_version.pyi
new file mode 100644
index 0000000000..d45e5dc907
--- /dev/null
+++ b/wrap/pybind11/pybind11/_version.pyi
@@ -0,0 +1,6 @@
+from typing import Tuple, Union
+def _to_int(s: str) -> Union[int, str]: ...
+__version__: str
+version_info: Tuple[Union[int, str], ...]
diff --git a/wrap/pybind11/pybind11/commands.py b/wrap/pybind11/pybind11/commands.py
index fa7eac3ccd..11f81d2d6d 100644
--- a/wrap/pybind11/pybind11/commands.py
+++ b/wrap/pybind11/pybind11/commands.py
@@ -1,17 +1,18 @@
 # -*- coding: utf-8 -*-
 import os
 DIR = os.path.abspath(os.path.dirname(__file__))
 def get_include(user=False):
+    # type: (bool) -> str
     installed_path = os.path.join(DIR, "include")
     source_path = os.path.join(os.path.dirname(DIR), "include")
     return installed_path if os.path.exists(installed_path) else source_path
 def get_cmake_dir():
+    # type: () -> str
     cmake_installed_path = os.path.join(DIR, "share", "cmake", "pybind11")
     if os.path.exists(cmake_installed_path):
         return cmake_installed_path
diff --git a/wrap/pybind11/pybind11/py.typed b/wrap/pybind11/pybind11/py.typed
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/wrap/pybind11/pybind11/setup_helpers.py b/wrap/pybind11/pybind11/setup_helpers.py
index 041e22689f..5b7c9aab1c 100644
--- a/wrap/pybind11/pybind11/setup_helpers.py
+++ b/wrap/pybind11/pybind11/setup_helpers.py
@@ -33,25 +33,34 @@
+# IMPORTANT: If you change this file in the pybind11 repo, also review
+# setup_helpers.pyi for matching changes.
+# If you copy this file in, you don't
+# need the .pyi file; it's just an interface file for static type checkers.
 import contextlib
 import os
+import platform
+import shlex
 import shutil
 import sys
+import sysconfig
 import tempfile
 import threading
 import warnings
-    from setuptools.command.build_ext import build_ext as _build_ext
     from setuptools import Extension as _Extension
+    from setuptools.command.build_ext import build_ext as _build_ext
 except ImportError:
     from distutils.command.build_ext import build_ext as _build_ext
     from distutils.extension import Extension as _Extension
+import distutils.ccompiler
 import distutils.errors
-WIN = sys.platform.startswith("win32")
+WIN = sys.platform.startswith("win32") and "mingw" not in sysconfig.get_platform()
 PY2 = sys.version_info[0] < 3
 MACOS = sys.platform.startswith("darwin")
 STD_TMPL = "/std:c++{}" if WIN else "-std=c++{}"
@@ -76,7 +85,7 @@ class Pybind11Extension(_Extension):
     * ``stdlib=libc++`` on macOS
     * ``visibility=hidden`` and ``-g0`` on Unix
-    Finally, you can set ``cxx_std`` via constructor or afterwords to enable
+    Finally, you can set ``cxx_std`` via constructor or afterwards to enable
     flags for C++ std, and a few extra helper flags related to the C++ standard
     level. It is _highly_ recommended you either set this, or use the provided
     ``build_ext``, which will search for the highest supported extension for
@@ -91,15 +100,14 @@ class Pybind11Extension(_Extension):
     this is an ugly old-style class due to Distutils.
-    def _add_cflags(self, *flags):
-        for flag in flags:
-            if flag not in self.extra_compile_args:
-                self.extra_compile_args.append(flag)
+    # flags are prepended, so that they can be further overridden, e.g. by
+    # ``extra_compile_args=["-g"]``.
-    def _add_lflags(self, *flags):
-        for flag in flags:
-            if flag not in self.extra_compile_args:
-                self.extra_link_args.append(flag)
+    def _add_cflags(self, flags):
+        self.extra_compile_args[:0] = flags
+    def _add_ldflags(self, flags):
+        self.extra_link_args[:0] = flags
     def __init__(self, *args, **kwargs):
@@ -131,13 +139,22 @@ def __init__(self, *args, **kwargs):
         # Have to use the accessor manually to support Python 2 distutils
         Pybind11Extension.cxx_std.__set__(self, cxx_std)
+        cflags = []
+        ldflags = []
         if WIN:
-            self._add_cflags("/EHsc", "/bigobj")
+            cflags += ["/EHsc", "/bigobj"]
-            self._add_cflags("-fvisibility=hidden", "-g0")
+            cflags += ["-fvisibility=hidden"]
+            env_cflags = os.environ.get("CFLAGS", "")
+            env_cppflags = os.environ.get("CPPFLAGS", "")
+            c_cpp_flags = shlex.split(env_cflags) + shlex.split(env_cppflags)
+            if not any(opt.startswith("-g") for opt in c_cpp_flags):
+                cflags += ["-g0"]
             if MACOS:
-                self._add_cflags("-stdlib=libc++")
-                self._add_lflags("-stdlib=libc++")
+                cflags += ["-stdlib=libc++"]
+                ldflags += ["-stdlib=libc++"]
+        self._add_cflags(cflags)
+        self._add_ldflags(ldflags)
     def cxx_std(self):
@@ -156,7 +173,8 @@ def cxx_std(self, level):
         if self._cxx_level:
             warnings.warn("You cannot safely change the cxx_level after setting it!")
-        # MSVC 2015 Update 3 and later only have 14 (and later 17) modes
+        # MSVC 2015 Update 3 and later only have 14 (and later 17) modes, so
+        # force a valid flag here.
         if WIN and level == 11:
             level = 14
@@ -165,19 +183,34 @@ def cxx_std(self, level):
         if not level:
-        self.extra_compile_args.append(STD_TMPL.format(level))
+        cflags = [STD_TMPL.format(level)]
+        ldflags = []
         if MACOS and "MACOSX_DEPLOYMENT_TARGET" not in os.environ:
-            # C++17 requires a higher min version of macOS
-            macosx_min = "-mmacosx-version-min=" + ("10.9" if level < 17 else "10.14")
-            self.extra_compile_args.append(macosx_min)
-            self.extra_link_args.append(macosx_min)
+            # C++17 requires a higher min version of macOS. An earlier version
+            # (10.12 or 10.13) can be set manually via environment variable if
+            # you are careful in your feature usage, but 10.14 is the safest
+            # setting for general use. However, never set higher than the
+            # current macOS version!
+            current_macos = tuple(int(x) for x in platform.mac_ver()[0].split(".")[:2])
+            desired_macos = (10, 9) if level < 17 else (10, 14)
+            macos_string = ".".join(str(x) for x in min(current_macos, desired_macos))
+            macosx_min = "-mmacosx-version-min=" + macos_string
+            cflags += [macosx_min]
+            ldflags += [macosx_min]
         if PY2:
-            if level >= 17:
-                self.extra_compile_args.append("/wd503" if WIN else "-Wno-register")
-            elif not WIN and level >= 14:
-                self.extra_compile_args.append("-Wno-deprecated-register")
+            if WIN:
+                # Will be ignored on MSVC 2015, where C++17 is not supported so
+                # this flag is not valid.
+                cflags += ["/wd5033"]
+            elif level >= 17:
+                cflags += ["-Wno-register"]
+            elif level >= 14:
+                cflags += ["-Wno-deprecated-register"]
+        self._add_cflags(cflags)
+        self._add_ldflags(ldflags)
 # Just in case someone clever tries to multithread
@@ -212,7 +245,8 @@ def has_flag(compiler, flag):
     with tmp_chdir():
         fname = "flagcheck.cpp"
         with open(fname, "w") as f:
-            f.write("int main (int argc, char **argv) { return 0; }")
+            # Don't trigger -Wunused-parameter.
+            f.write("int main (int, char **) { return 0; }")
             compiler.compile([fname], extra_postargs=[flag])
@@ -227,9 +261,12 @@ def has_flag(compiler, flag):
 def auto_cpp_level(compiler):
-    Return the max supported C++ std level (17, 14, or 11).
+    Return the max supported C++ std level (17, 14, or 11). Returns latest on Windows.
+    if WIN:
+        return "latest"
     global cpp_flag_cache
     # If this has been previously calculated with the same args, return that
@@ -237,7 +274,7 @@ def auto_cpp_level(compiler):
         if cpp_flag_cache:
             return cpp_flag_cache
-    levels = [17, 14] + ([] if WIN else [11])
+    levels = [17, 14, 11]
     for level in levels:
         if has_flag(compiler, STD_TMPL.format(level)):
@@ -252,7 +289,8 @@ def auto_cpp_level(compiler):
 class build_ext(_build_ext):  # noqa: N801
     Customized build_ext that allows an auto-search for the highest supported
-    C++ level for Pybind11Extension.
+    C++ level for Pybind11Extension. This is only needed for the auto-search
+    for now, and is completely optional otherwise.
     def build_extensions(self):
@@ -268,3 +306,189 @@ def build_extensions(self):
         # Python 2 doesn't allow super here, since distutils uses old-style
         # classes!
+def intree_extensions(paths, package_dir=None):
+    """
+    Generate Pybind11Extensions from source files directly located in a Python
+    source tree.
+    ``package_dir`` behaves as in ``setuptools.setup``.  If unset, the Python
+    package root parent is determined as the first parent directory that does
+    not contain an ``__init__.py`` file.
+    """
+    exts = []
+    for path in paths:
+        if package_dir is None:
+            parent, _ = os.path.split(path)
+            while os.path.exists(os.path.join(parent, "__init__.py")):
+                parent, _ = os.path.split(parent)
+            relname, _ = os.path.splitext(os.path.relpath(path, parent))
+            qualified_name = relname.replace(os.path.sep, ".")
+            exts.append(Pybind11Extension(qualified_name, [path]))
+        else:
+            found = False
+            for prefix, parent in package_dir.items():
+                if path.startswith(parent):
+                    found = True
+                    relname, _ = os.path.splitext(os.path.relpath(path, parent))
+                    qualified_name = relname.replace(os.path.sep, ".")
+                    if prefix:
+                        qualified_name = prefix + "." + qualified_name
+                    exts.append(Pybind11Extension(qualified_name, [path]))
+            if not found:
+                raise ValueError(
+                    "path {} is not a child of any of the directories listed "
+                    "in 'package_dir' ({})".format(path, package_dir)
+                )
+    return exts
+def naive_recompile(obj, src):
+    """
+    This will recompile only if the source file changes. It does not check
+    header files, so a more advanced function or Ccache is better if you have
+    editable header files in your package.
+    """
+    return os.stat(obj).st_mtime < os.stat(src).st_mtime
+def no_recompile(obg, src):
+    """
+    This is the safest but slowest choice (and is the default) - will always
+    recompile sources.
+    """
+    return True
+# Optional parallel compile utility
+# inspired by: http://stackoverflow.com/questions/11013851/speeding-up-build-process-with-distutils
+# and: https://github.com/tbenthompson/cppimport/blob/stable/cppimport/build_module.py
+# and NumPy's parallel distutils module:
+#              https://github.com/numpy/numpy/blob/master/numpy/distutils/ccompiler.py
+class ParallelCompile(object):
+    """
+    Make a parallel compile function. Inspired by
+    numpy.distutils.ccompiler.CCompiler_compile and cppimport.
+    This takes several arguments that allow you to customize the compile
+    function created:
+    envvar:
+        Set an environment variable to control the compilation threads, like
+    default:
+        0 will automatically multithread, or 1 will only multithread if the
+        envvar is set.
+    max:
+        The limit for automatic multithreading if non-zero
+    needs_recompile:
+        A function of (obj, src) that returns True when recompile is needed.  No
+        effect in isolated mode; use ccache instead, see
+        https://github.com/matplotlib/matplotlib/issues/1507/
+    To use::
+        ParallelCompile("NPY_NUM_BUILD_JOBS").install()
+    or::
+        with ParallelCompile("NPY_NUM_BUILD_JOBS"):
+            setup(...)
+    By default, this assumes all files need to be recompiled. A smarter
+    function can be provided via needs_recompile.  If the output has not yet
+    been generated, the compile will always run, and this function is not
+    called.
+    """
+    __slots__ = ("envvar", "default", "max", "_old", "needs_recompile")
+    def __init__(self, envvar=None, default=0, max=0, needs_recompile=no_recompile):
+        self.envvar = envvar
+        self.default = default
+        self.max = max
+        self.needs_recompile = needs_recompile
+        self._old = []
+    def function(self):
+        """
+        Builds a function object usable as distutils.ccompiler.CCompiler.compile.
+        """
+        def compile_function(
+            compiler,
+            sources,
+            output_dir=None,
+            macros=None,
+            include_dirs=None,
+            debug=0,
+            extra_preargs=None,
+            extra_postargs=None,
+            depends=None,
+        ):
+            # These lines are directly from distutils.ccompiler.CCompiler
+            macros, objects, extra_postargs, pp_opts, build = compiler._setup_compile(
+                output_dir, macros, include_dirs, sources, depends, extra_postargs
+            )
+            cc_args = compiler._get_cc_args(pp_opts, debug, extra_preargs)
+            # The number of threads; start with default.
+            threads = self.default
+            # Determine the number of compilation threads, unless set by an environment variable.
+            if self.envvar is not None:
+                threads = int(os.environ.get(self.envvar, self.default))
+            def _single_compile(obj):
+                try:
+                    src, ext = build[obj]
+                except KeyError:
+                    return
+                if not os.path.exists(obj) or self.needs_recompile(obj, src):
+                    compiler._compile(obj, src, ext, cc_args, extra_postargs, pp_opts)
+            try:
+                # Importing .synchronize checks for platforms that have some multiprocessing
+                # capabilities but lack semaphores, such as AWS Lambda and Android Termux.
+                import multiprocessing.synchronize
+                from multiprocessing.pool import ThreadPool
+            except ImportError:
+                threads = 1
+            if threads == 0:
+                try:
+                    threads = multiprocessing.cpu_count()
+                    threads = self.max if self.max and self.max < threads else threads
+                except NotImplementedError:
+                    threads = 1
+            if threads > 1:
+                pool = ThreadPool(threads)
+                # In Python 2, ThreadPool can't be used as a context manager.
+                # Once we are no longer supporting it, this can be 'with pool:'
+                try:
+                    for _ in pool.imap_unordered(_single_compile, objects):
+                        pass
+                finally:
+                    pool.terminate()
+            else:
+                for ob in objects:
+                    _single_compile(ob)
+            return objects
+        return compile_function
+    def install(self):
+        distutils.ccompiler.CCompiler.compile = self.function()
+        return self
+    def __enter__(self):
+        self._old.append(distutils.ccompiler.CCompiler.compile)
+        return self.install()
+    def __exit__(self, *args):
+        distutils.ccompiler.CCompiler.compile = self._old.pop()
diff --git a/wrap/pybind11/pybind11/setup_helpers.pyi b/wrap/pybind11/pybind11/setup_helpers.pyi
new file mode 100644
index 0000000000..074744eb82
--- /dev/null
+++ b/wrap/pybind11/pybind11/setup_helpers.pyi
@@ -0,0 +1,63 @@
+# IMPORTANT: Should stay in sync with setup_helpers.py (mostly checked by CI /
+# pre-commit).
+import contextlib
+import distutils.ccompiler
+from distutils.command.build_ext import build_ext as _build_ext  # type: ignore
+from distutils.extension import Extension as _Extension
+from types import TracebackType
+from typing import Any, Callable, Dict, Iterator, List, Optional, Type, TypeVar, Union
+WIN: bool
+PY2: bool
+MACOS: bool
+STD_TMPL: str
+class Pybind11Extension(_Extension):
+    def _add_cflags(self, *flags: str) -> None: ...
+    def _add_lflags(self, *flags: str) -> None: ...
+    def __init__(
+        self, *args: Any, cxx_std: int = 0, language: str = "c++", **kwargs: Any
+    ) -> None: ...
+    @property
+    def cxx_std(self) -> int: ...
+    @cxx_std.setter
+    def cxx_std(self, level: int) -> None: ...
+def tmp_chdir() -> Iterator[str]: ...
+def has_flag(compiler: distutils.ccompiler.CCompiler, flag: str) -> bool: ...
+def auto_cpp_level(compiler: distutils.ccompiler.CCompiler) -> Union[int, str]: ...
+class build_ext(_build_ext):  # type: ignore
+    def build_extensions(self) -> None: ...
+def intree_extensions(
+    paths: Iterator[str], package_dir: Optional[Dict[str, str]] = None
+) -> List[Pybind11Extension]: ...
+def no_recompile(obj: str, src: str) -> bool: ...
+def naive_recompile(obj: str, src: str) -> bool: ...
+T = TypeVar("T", bound="ParallelCompile")
+class ParallelCompile:
+    envvar: Optional[str]
+    default: int
+    max: int
+    needs_recompile: Callable[[str, str], bool]
+    def __init__(
+        self,
+        envvar: Optional[str] = None,
+        default: int = 0,
+        max: int = 0,
+        needs_recompile: Callable[[str, str], bool] = no_recompile,
+    ) -> None: ...
+    def function(self) -> Any: ...
+    def install(self: T) -> T: ...
+    def __enter__(self: T) -> T: ...
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> None: ...
diff --git a/wrap/pybind11/pyproject.toml b/wrap/pybind11/pyproject.toml
index 3bab1c1a28..7d7a1c8213 100644
--- a/wrap/pybind11/pyproject.toml
+++ b/wrap/pybind11/pyproject.toml
@@ -1,3 +1,41 @@
-requires = ["setuptools", "wheel", "cmake==3.18.0", "ninja"]
+requires = ["setuptools>=42", "wheel", "cmake>=3.18", "ninja"]
 build-backend = "setuptools.build_meta"
+ignore = [
+    "tests/**",
+    "docs/**",
+    "tools/**",
+    "include/**",
+    ".*",
+    "pybind11/include/**",
+    "pybind11/share/**",
+    "CMakeLists.txt",
+    "noxfile.py",
+# Needs the compiled .so modules and env.py from tests
+known_first_party = "env,pybind11_cross_module_tests,pybind11_tests,"
+# For black compatibility
+profile = "black"
+files = "pybind11"
+python_version = "2.7"
+warn_unused_configs = true
+disallow_any_generics = true
+disallow_subclassing_any = true
+disallow_untyped_calls = true
+disallow_untyped_defs = true
+disallow_incomplete_defs = true
+check_untyped_defs = true
+disallow_untyped_decorators = true
+no_implicit_optional = true
+warn_redundant_casts = true
+warn_unused_ignores = true
+warn_return_any = true
+no_implicit_reexport = true
+strict_equality = true
diff --git a/wrap/pybind11/setup.cfg b/wrap/pybind11/setup.cfg
index ca0d59a4d2..317c44bbf6 100644
--- a/wrap/pybind11/setup.cfg
+++ b/wrap/pybind11/setup.cfg
@@ -1,10 +1,10 @@
-long_description = file: README.md
-long_description_content_type = text/markdown
+long_description = file: README.rst
+long_description_content_type = text/x-rst
 description = Seamless operability between C++11 and Python
 author = Wenzel Jakob
-author_email = "wenzel.jakob@epfl.ch"
-url = "https://github.com/pybind/pybind11"
+author_email = wenzel.jakob@epfl.ch
+url = https://github.com/pybind/pybind11
 license = BSD
 classifiers =
@@ -19,6 +19,8 @@ classifiers =
     Programming Language :: Python :: 3.6
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
     License :: OSI Approved :: BSD License
     Programming Language :: Python :: Implementation :: PyPy
     Programming Language :: Python :: Implementation :: CPython
@@ -29,29 +31,20 @@ keywords =
     Python bindings
+project_urls =
+    Documentation = https://pybind11.readthedocs.io/
+    Bug Tracker = https://github.com/pybind/pybind11/issues
+    Discussions = https://github.com/pybind/pybind11/discussions
+    Changelog = https://pybind11.readthedocs.io/en/latest/changelog.html
+    Chat = https://gitter.im/pybind/Lobby
-python_requires = >=2.7, !=3.0, !=3.1, !=3.2, !=3.3, !=3.4
+python_requires = >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
 zip_safe = False
-ignore =
-    tests/**
-    docs/**
-    tools/**
-    include/**
-    .appveyor.yml
-    .cmake-format.yaml
-    .gitmodules
-    .pre-commit-config.yaml
-    .readthedocs.yml
-    .clang-tidy
-    pybind11/include/**
-    pybind11/share/**
-    CMakeLists.txt
 max-line-length = 99
@@ -64,3 +57,7 @@ ignore =
     # Black conflict
     W503, E203
+timeout = 300
diff --git a/wrap/pybind11/setup.py b/wrap/pybind11/setup.py
index c9ba77d6d8..0e73489820 100644
--- a/wrap/pybind11/setup.py
+++ b/wrap/pybind11/setup.py
@@ -4,6 +4,7 @@
 # Setup script for PyPI; use CMakeFile.txt to build extension modules
 import contextlib
+import io
 import os
 import re
 import shutil
@@ -19,6 +20,36 @@
     r"^\s*#\s*define\s+PYBIND11_VERSION_([A-Z]+)\s+(.*)$", re.MULTILINE
+def build_expected_version_hex(matches):
+    patch_level_serial = matches["PATCH"]
+    serial = None
+    try:
+        major = int(matches["MAJOR"])
+        minor = int(matches["MINOR"])
+        flds = patch_level_serial.split(".")
+        if flds:
+            patch = int(flds[0])
+            level = None
+            if len(flds) == 1:
+                level = "0"
+                serial = 0
+            elif len(flds) == 2:
+                level_serial = flds[1]
+                for level in ("a", "b", "c", "dev"):
+                    if level_serial.startswith(level):
+                        serial = int(level_serial[len(level) :])
+                        break
+    except ValueError:
+        pass
+    if serial is None:
+        msg = 'Invalid PYBIND11_VERSION_PATCH: "{}"'.format(patch_level_serial)
+        raise RuntimeError(msg)
+    return "0x{:02x}{:02x}{:02x}{}{:x}".format(
+        major, minor, patch, level[:1].upper(), serial
+    )
 # PYBIND11_GLOBAL_SDIST will build a different sdist, with the python-headers
 # files, and the sys.prefix files (CMake and headers).
@@ -35,12 +66,12 @@
 # Read the listed version
 with open("pybind11/_version.py") as f:
     code = compile(f.read(), "pybind11/_version.py", "exec")
-    loc = {}
-    exec(code, loc)
-    version = loc["__version__"]
+loc = {}
+exec(code, loc)
+version = loc["__version__"]
 # Verify that the version matches the one in C++
-with open("include/pybind11/detail/common.h") as f:
+with io.open("include/pybind11/detail/common.h", encoding="utf8") as f:
     matches = dict(VERSION_REGEX.findall(f.read()))
 cpp_version = "{MAJOR}.{MINOR}.{PATCH}".format(**matches)
 if version != cpp_version:
@@ -49,6 +80,15 @@
     raise RuntimeError(msg)
+version_hex = matches.get("HEX", "MISSING")
+expected_version_hex = build_expected_version_hex(matches)
+if version_hex != expected_version_hex:
+    msg = "PYBIND11_VERSION_HEX {} does not match expected value {}!".format(
+        version_hex,
+        expected_version_hex,
+    )
+    raise RuntimeError(msg)
 def get_and_replace(filename, binary=False, **opts):
     with open(filename, "rb" if binary else "r") as f:
@@ -106,6 +146,13 @@ def remove_output(*sources):
+        if "CMAKE_ARGS" in os.environ:
+            fcommand = [
+                c
+                for c in os.environ["CMAKE_ARGS"].split()
+                if "DCMAKE_INSTALL_PREFIX" not in c
+            ]
+            cmd += fcommand
         cmake_opts = dict(cwd=DIR, stdout=sys.stdout, stderr=sys.stderr)
         subprocess.check_call(cmd, **cmake_opts)
         subprocess.check_call(["cmake", "--install", tmpdir], **cmake_opts)
diff --git a/wrap/pybind11/tests/CMakeLists.txt b/wrap/pybind11/tests/CMakeLists.txt
index 45e094b080..9040cf8c06 100644
--- a/wrap/pybind11/tests/CMakeLists.txt
+++ b/wrap/pybind11/tests/CMakeLists.txt
@@ -10,27 +10,34 @@ cmake_minimum_required(VERSION 3.4)
 # The `cmake_minimum_required(VERSION 3.4...3.18)` syntax does not work with
 # some versions of VS that have a patched CMake 3.11. This forces us to emulate
 # the behavior using the following workaround:
-  cmake_policy(VERSION 3.18)
+  cmake_policy(VERSION 3.21)
 # Only needed for CMake < 3.5 support
-# Filter out items; print an optional message if any items filtered
+# Filter out items; print an optional message if any items filtered. This ignores extensions.
 # Usage:
 #   pybind11_filter_tests(LISTNAME file1.cpp file2.cpp ... MESSAGE "")
+macro(pybind11_filter_tests LISTNAME)
   cmake_parse_arguments(ARG "" "MESSAGE" "" ${ARGN})
+  # Make a list of the test without any extensions, for easier filtering.
+  set(_TMP_ACTUAL_LIST "${${LISTNAME}};") # enforce ';' at the end to allow matching last item.
-    list(FIND ${LISTNAME} ${filename} _FILE_FOUND)
+    string(REGEX REPLACE "\\.[^.]*$" "" filename_no_ext ${filename})
+    # Search in the list without extensions.
+    list(FIND LIST_WITHOUT_EXTENSIONS ${filename_no_ext} _FILE_FOUND)
+      list(REMOVE_AT ${LISTNAME} ${_FILE_FOUND}) # And remove from the list with extensions.
+      )# And our search list, to ensure it is in sync.
@@ -39,6 +46,26 @@ macro(PYBIND11_FILTER_TESTS LISTNAME)
+  foreach(VARNAME ${ARGN})
+      set("${VARNAME}" "")
+    endif()
+  endforeach()
+# Function to add additional targets if any of the provided tests are found.
+# Needles; Specifies the test names to look for.
+# Additions; Specifies the additional test targets to add when any of the needles are found.
+macro(tests_extra_targets needles additions)
+  # Add the index for this relation to the index extra targets map.
+  # Add the test names to look for, and the associated test target additions.
 # New Python support
   find_package(pybind11 REQUIRED CONFIG)
   message(STATUS "Setting tests build type to MinSizeRel as none was specified")
@@ -84,52 +111,67 @@ if(PYBIND11_CUDA_TESTS)
-# Full set of test files (you can override these; see below)
+# Full set of test files (you can override these; see below, overrides ignore extension)
+# Any test that has no extension is both .py and .cpp, so 'foo' will add 'foo.cpp' and 'foo.py'.
+# Any test that has an extension is exclusively that and handled as such.
-    test_async.cpp
-    test_buffers.cpp
-    test_builtin_casters.cpp
-    test_call_policies.cpp
-    test_callbacks.cpp
-    test_chrono.cpp
-    test_class.cpp
-    test_constants_and_functions.cpp
-    test_copy_move.cpp
-    test_custom_type_casters.cpp
-    test_docstring_options.cpp
-    test_eigen.cpp
-    test_enum.cpp
-    test_eval.cpp
-    test_exceptions.cpp
-    test_factory_constructors.cpp
-    test_gil_scoped.cpp
-    test_iostream.cpp
-    test_kwargs_and_defaults.cpp
-    test_local_bindings.cpp
-    test_methods_and_attributes.cpp
-    test_modules.cpp
-    test_multiple_inheritance.cpp
-    test_numpy_array.cpp
-    test_numpy_dtypes.cpp
-    test_numpy_vectorize.cpp
-    test_opaque_types.cpp
-    test_operator_overloading.cpp
-    test_pickling.cpp
-    test_pytypes.cpp
-    test_sequences_and_iterators.cpp
-    test_smart_ptr.cpp
-    test_stl.cpp
-    test_stl_binders.cpp
-    test_tagbased_polymorphic.cpp
-    test_union.cpp
-    test_virtual_functions.cpp)
+    test_async
+    test_buffers
+    test_builtin_casters
+    test_call_policies
+    test_callbacks
+    test_chrono
+    test_class
+    test_const_name
+    test_constants_and_functions
+    test_copy_move
+    test_custom_type_casters
+    test_custom_type_setup
+    test_docstring_options
+    test_eigen
+    test_enum
+    test_eval
+    test_exceptions
+    test_factory_constructors
+    test_gil_scoped
+    test_iostream
+    test_kwargs_and_defaults
+    test_local_bindings
+    test_methods_and_attributes
+    test_modules
+    test_multiple_inheritance
+    test_numpy_array
+    test_numpy_dtypes
+    test_numpy_vectorize
+    test_opaque_types
+    test_operator_overloading
+    test_pickling
+    test_pytypes
+    test_sequences_and_iterators
+    test_smart_ptr
+    test_stl
+    test_stl_binders
+    test_tagbased_polymorphic
+    test_thread
+    test_union
+    test_virtual_functions)
 # Invoking cmake with something like:
 #     cmake -DPYBIND11_TEST_OVERRIDE="test_callbacks.cpp;test_pickling.cpp" ..
 # lets you override the tests that get compiled and run.  You can restore to all tests with:
 #     cmake -DPYBIND11_TEST_OVERRIDE= ..
+  # Instead of doing a direct override here, we iterate over the overrides without extension and
+  # match them against entries from the PYBIND11_TEST_FILES, anything that not matches goes into the filter list.
+  string(REGEX REPLACE "\\.[^.;]*;" ";" TEST_FILES_NO_EXT "${PYBIND11_TEST_FILES};")
+  # This allows the override to be done with extensions, preserving backwards compatibility.
+  foreach(test_name ${TEST_FILES_NO_EXT})
+    if(NOT ${test_name} IN_LIST TEST_OVERRIDE_NO_EXT
+    )# If not in the whitelist, add to be filtered out.
+      list(APPEND PYBIND11_TEST_FILTER ${test_name})
+    endif()
+  endforeach()
 # You can also filter tests:
@@ -151,15 +193,46 @@ if(PYBIND11_CUDA_TESTS)
     "Skipping test_constants_and_functions due to incompatible exception specifications")
+# Now that the test filtering is complete, we need to split the list into the test for PYTEST
+# and the list for the cpp targets.
+foreach(test_name ${PYBIND11_TEST_FILES})
+  if(test_name MATCHES "\\.py$") # Ends in .py, purely python test.
+    list(APPEND PYBIND11_PYTEST_FILES ${test_name})
+  elseif(test_name MATCHES "\\.cpp$") # Ends in .cpp, purely cpp test.
+    list(APPEND PYBIND11_CPPTEST_FILES ${test_name})
+  elseif(NOT test_name MATCHES "\\.") # No extension specified, assume both, add extension.
+    list(APPEND PYBIND11_PYTEST_FILES ${test_name}.py)
+    list(APPEND PYBIND11_CPPTEST_FILES ${test_name}.cpp)
+  else()
+    message(WARNING "Unhanded test extension in test: ${test_name}")
+  endif()
 # Contains the set of test files that require pybind11_cross_module_tests to be
 # built; if none of these are built (i.e. because TEST_OVERRIDE is used and
 # doesn't include them) the second module doesn't get built.
-set(PYBIND11_CROSS_MODULE_TESTS test_exceptions.py test_local_bindings.py test_stl.py
-                                test_stl_binders.py)
+                    "pybind11_cross_module_tests")
-set(PYBIND11_CROSS_MODULE_GIL_TESTS test_gil_scoped.py)
+# And add additional targets for other tests.
+tests_extra_targets("test_gil_scoped.py" "cross_module_gil_utils")
+    "https://gitlab.com/libeigen/eigen.git"
+    CACHE STRING "Eigen repository to use for tests")
+# Always use a hash for reconfigure speed and security reasons
+# Include the version number for pretty printing (keep in sync)
+    "3.4.0;929bc0e191d0927b1735b9a1ddc0e8b77e3a25ec"
+    CACHE STRING "Eigen version to use for tests, format: VERSION;HASH")
 # Check if Eigen is available; if not, remove from PYBIND11_TEST_FILES (but
 # keep it in PYBIND11_PYTEST_FILES, so that we get the "eigen is not installed"
@@ -174,22 +247,26 @@ if(PYBIND11_TEST_FILES_EIGEN_I GREATER -1)
       message(FATAL_ERROR "CMake 3.11+ required when using DOWNLOAD_EIGEN")
-    set(EIGEN3_VERSION_STRING "3.3.7")
-      GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git
     if(NOT eigen_POPULATED)
-      message(STATUS "Downloading Eigen")
+      message(
+        STATUS
+      )
     set(EIGEN3_INCLUDE_DIR ${eigen_SOURCE_DIR})
+    # When getting locally, the version is not visible from a superprojet,
+    # so just force it.
     find_package(Eigen3 3.2.7 QUIET CONFIG)
@@ -217,7 +294,8 @@ if(PYBIND11_TEST_FILES_EIGEN_I GREATER -1)
     message(STATUS "Building tests with Eigen v${EIGEN3_VERSION}")
-    message(STATUS "Building tests WITHOUT Eigen, use -DDOWNLOAD_EIGEN on CMake 3.11+ to download")
+    message(
+      STATUS "Building tests WITHOUT Eigen, use -DDOWNLOAD_EIGEN=ON on CMake 3.11+ to download")
@@ -226,25 +304,69 @@ find_package(Boost 1.56)
   if(NOT TARGET Boost::headers)
+    add_library(Boost::headers IMPORTED INTERFACE)
     if(TARGET Boost::boost)
       # Classic FindBoost
-      add_library(Boost::headers ALIAS Boost::boost)
+      set_property(TARGET Boost::boost PROPERTY INTERFACE_LINK_LIBRARIES Boost::boost)
       # Very old FindBoost, or newer Boost than CMake in older CMakes
-      add_library(Boost::headers IMPORTED INTERFACE)
+# Check if we need to add -lstdc++fs or -lc++fs or nothing
+  file(
+    "#include <filesystem>\nint main(int argc, char ** argv) {\n  std::filesystem::path p(argv[0]);\n  return p.string().length();\n}"
+  )
+  try_compile(
+    COMPILE_DEFINITIONS -std=c++17)
+  try_compile(
+    LINK_LIBRARIES stdc++fs)
+  try_compile(
+    LINK_LIBRARIES c++fs)
+  set(STD_FS_LIB stdc++fs)
+  set(STD_FS_LIB c++fs)
+  set(STD_FS_LIB "")
+  message(WARNING "Unknown C++17 compiler - not passing -lstdc++fs")
+  set(STD_FS_LIB "")
 # Compile with compiler warnings turned on
 function(pybind11_enable_warnings target_name)
     target_compile_options(${target_name} PRIVATE /W4)
-    target_compile_options(${target_name} PRIVATE -Wall -Wextra -Wconversion -Wcast-qual
-                                                  -Wdeprecated -Wundef)
+    target_compile_options(
+      ${target_name}
+      PRIVATE -Wall
+              -Wextra
+              -Wconversion
+              -Wcast-qual
+              -Wdeprecated
+              -Wundef
+              -Wnon-virtual-dtor)
@@ -252,12 +374,22 @@ function(pybind11_enable_warnings target_name)
       target_compile_options(${target_name} PRIVATE /WX)
       target_compile_options(${target_name} PRIVATE "SHELL:-Werror all-warnings")
-    elseif(CMAKE_CXX_COMPILER_ID MATCHES "(GNU|Intel|Clang)")
       target_compile_options(${target_name} PRIVATE -Werror)
+      if(CMAKE_CXX_STANDARD EQUAL 17) # See PR #3570
+        target_compile_options(${target_name} PRIVATE -Wno-conversion)
+      endif()
+      target_compile_options(
+        ${target_name}
+        PRIVATE
+          -Werror-all
+          # "Inlining inhibited by limit max-size", "Inlining inhibited by limit max-total-size"
+          -diag-disable 11074,11076)
-  # Needs to be readded since the ordering requires these to be after the ones above
+  # Needs to be re-added since the ordering requires these to be after the ones above
@@ -271,21 +403,17 @@ endfunction()
 set(test_targets pybind11_tests)
-# Build pybind11_cross_module_tests if any test_whatever.py are being built that require it
-  list(FIND PYBIND11_PYTEST_FILES ${t} i)
-  if(i GREATER -1)
-    list(APPEND test_targets pybind11_cross_module_tests)
-    break()
-  endif()
-  list(FIND PYBIND11_PYTEST_FILES ${t} i)
-  if(i GREATER -1)
-    list(APPEND test_targets cross_module_gil_utils)
-    break()
-  endif()
+# Check if any tests need extra targets by iterating through the mappings registered.
+  foreach(needle ${PYBIND11_TEST_EXTRA_TARGETS_NEEDLES_${i}})
+      # Add all the additional targets to the test list. List join in newer cmake.
+      foreach(extra_target ${PYBIND11_TEST_EXTRA_TARGETS_ADDITION_${i}})
+        list(APPEND test_targets ${extra_target})
+      endforeach()
+      break() # Breaks out of the needle search, continues with the next mapping.
+    endif()
+  endforeach()
 # Support CUDA testing by forcing the target file to compile with NVCC
@@ -334,38 +462,34 @@ foreach(target ${test_targets})
     target_compile_definitions(${target} PRIVATE -DPYBIND11_TEST_BOOST)
+  target_link_libraries(${target} PRIVATE ${STD_FS_LIB})
   # Always write the output file directly into the 'tests' directory (even on MSVC)
     set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY
-    foreach(config ${CMAKE_CONFIGURATION_TYPES})
-      string(TOUPPER ${config} config)
-      set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${config}
-                                                 "${CMAKE_CURRENT_BINARY_DIR}")
-    endforeach()
+      foreach(config ${CMAKE_CONFIGURATION_TYPES})
+        string(TOUPPER ${config} config)
+        set_target_properties(${target} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${config}
+                                                   "${CMAKE_CURRENT_BINARY_DIR}")
+      endforeach()
+    endif()
-# Make sure pytest is found or produce a warning
-  execute_process(
-    COMMAND ${PYTHON_EXECUTABLE} -c "import pytest; print(pytest.__version__)"
-    RESULT_VARIABLE pytest_not_found
-    OUTPUT_VARIABLE pytest_version
-  if(pytest_not_found)
-    message(WARNING "Running the tests requires pytest. Please install it manually"
-                    " (try: ${PYTHON_EXECUTABLE} -m pip install pytest)")
-  elseif(pytest_version VERSION_LESS 3.1)
-    message(WARNING "Running the tests requires pytest >= 3.1. Found: ${pytest_version}"
-                    "Please update it (try: ${PYTHON_EXECUTABLE} -m pip install -U pytest)")
-  else()
-        TRUE
-        CACHE INTERNAL "")
-  endif()
+# Provide nice organisation in IDEs
+  source_group(
+    TREE "${CMAKE_CURRENT_SOURCE_DIR}/../include"
+    PREFIX "Header Files"
+# Make sure pytest is found or produce a warning
+pybind11_find_import(pytest VERSION 3.1)
   # This is not used later in the build, so it's okay to regenerate each time.
   configure_file("${CMAKE_CURRENT_SOURCE_DIR}/pytest.ini" "${CMAKE_CURRENT_BINARY_DIR}/pytest.ini"
@@ -377,15 +501,20 @@ endif()
 # cmake 3.12 added list(transform <list> prepend
 # but we can't use it yet
+    ""
+    CACHE STRING "Put this before pytest, use for checkers and such")
 # A single command to compile and run the tests
   DEPENDS ${test_targets}
@@ -396,6 +525,27 @@ if(PYBIND11_TEST_OVERRIDE)
             "Note: not all tests run: -DPYBIND11_TEST_OVERRIDE is in effect")
+# cmake-format: off
+  memcheck
+    PYTHONMALLOC=malloc
+    valgrind
+    --leak-check=full
+    --show-leak-kinds=definite,indirect
+    --errors-for-leak-kinds=definite,indirect
+    --error-exitcode=1
+    --read-var-info=yes
+    --track-origins=yes
+    --suppressions="${CMAKE_CURRENT_SOURCE_DIR}/valgrind-python.supp"
+    --suppressions="${CMAKE_CURRENT_SOURCE_DIR}/valgrind-numpy-scipy.supp"
+    --gen-suppressions=all
+  DEPENDS ${test_targets}
+# cmake-format: on
 # Add a check target to run all the tests, starting with pytest (we add dependencies to this below)
 add_custom_target(check DEPENDS pytest)
diff --git a/wrap/pybind11/tests/conftest.py b/wrap/pybind11/tests/conftest.py
index a2350d041f..362eb80691 100644
--- a/wrap/pybind11/tests/conftest.py
+++ b/wrap/pybind11/tests/conftest.py
@@ -18,9 +18,9 @@
 # Early diagnostic for failed imports
 import pybind11_tests  # noqa: F401
-_unicode_marker = re.compile(r'u(\'[^\']*\')')
-_long_marker = re.compile(r'([0-9])L')
-_hexadecimal = re.compile(r'0x[0-9a-fA-F]+')
+_unicode_marker = re.compile(r"u(\'[^\']*\')")
+_long_marker = re.compile(r"([0-9])L")
+_hexadecimal = re.compile(r"0x[0-9a-fA-F]+")
 # Avoid collecting Python3 only files
 collect_ignore = []
@@ -30,7 +30,7 @@
 def _strip_and_dedent(s):
     """For triple-quote strings"""
-    return textwrap.dedent(s.lstrip('\n').rstrip())
+    return textwrap.dedent(s.lstrip("\n").rstrip())
 def _split_and_sort(s):
@@ -40,11 +40,14 @@ def _split_and_sort(s):
 def _make_explanation(a, b):
     """Explanation for a failed assert -- the a and b arguments are List[str]"""
-    return ["--- actual / +++ expected"] + [line.strip('\n') for line in difflib.ndiff(a, b)]
+    return ["--- actual / +++ expected"] + [
+        line.strip("\n") for line in difflib.ndiff(a, b)
+    ]
 class Output(object):
     """Basic output post-processing and comparison"""
     def __init__(self, string):
         self.string = string
         self.explanation = []
@@ -54,7 +57,11 @@ def __str__(self):
     def __eq__(self, other):
         # Ignore constructor/destructor output which is prefixed with "###"
-        a = [line for line in self.string.strip().splitlines() if not line.startswith("###")]
+        a = [
+            line
+            for line in self.string.strip().splitlines()
+            if not line.startswith("###")
+        ]
         b = _strip_and_dedent(other).splitlines()
         if a == b:
             return True
@@ -65,6 +72,7 @@ def __eq__(self, other):
 class Unordered(Output):
     """Custom comparison for output without strict line ordering"""
     def __eq__(self, other):
         a = _split_and_sort(self.string)
         b = _split_and_sort(other)
@@ -175,7 +183,7 @@ def msg():
 # noinspection PyUnusedLocal
 def pytest_assertrepr_compare(op, left, right):
     """Hook to insert custom failure explanation"""
-    if hasattr(left, 'explanation'):
+    if hasattr(left, "explanation"):
         return left.explanation
@@ -189,8 +197,8 @@ def suppress(exception):
 def gc_collect():
-    ''' Run the garbage collector twice (needed when running
-    reference counting tests with PyPy) '''
+    """Run the garbage collector twice (needed when running
+    reference counting tests with PyPy)"""
diff --git a/wrap/pybind11/tests/constructor_stats.h b/wrap/pybind11/tests/constructor_stats.h
index abfaf91614..805968a09b 100644
--- a/wrap/pybind11/tests/constructor_stats.h
+++ b/wrap/pybind11/tests/constructor_stats.h
@@ -120,7 +120,7 @@ class ConstructorStats {
             throw py::error_already_set();
-        py::module::import("gc").attr("collect")();
+        py::module_::import("gc").attr("collect")();
diff --git a/wrap/pybind11/tests/env.py b/wrap/pybind11/tests/env.py
index 5cded44127..6172b451b3 100644
--- a/wrap/pybind11/tests/env.py
+++ b/wrap/pybind11/tests/env.py
@@ -2,6 +2,8 @@
 import platform
 import sys
+import pytest
 LINUX = sys.platform.startswith("linux")
 MACOS = sys.platform.startswith("darwin")
 WIN = sys.platform.startswith("win32") or sys.platform.startswith("cygwin")
@@ -12,3 +14,20 @@
 PY2 = sys.version_info.major == 2
 PY = sys.version_info
+def deprecated_call():
+    """
+    pytest.deprecated_call() seems broken in pytest<3.9.x; concretely, it
+    doesn't work on CPython 3.8.0 with pytest==3.3.2 on Ubuntu 18.04 (#2922).
+    This is a narrowed reimplementation of the following PR :(
+    https://github.com/pytest-dev/pytest/pull/4104
+    """
+    # TODO: Remove this when testing requires pytest>=3.9.
+    pieces = pytest.__version__.split(".")
+    pytest_major_minor = (int(pieces[0]), int(pieces[1]))
+    if pytest_major_minor < (3, 9):
+        return pytest.warns((DeprecationWarning, PendingDeprecationWarning))
+    else:
+        return pytest.deprecated_call()
diff --git a/wrap/pybind11/tests/extra_python_package/test_files.py b/wrap/pybind11/tests/extra_python_package/test_files.py
index ac8ca1f97b..337a72dfea 100644
--- a/wrap/pybind11/tests/extra_python_package/test_files.py
+++ b/wrap/pybind11/tests/extra_python_package/test_files.py
@@ -25,6 +25,7 @@
+    "include/pybind11/gil.h",
@@ -41,9 +42,14 @@
+    "include/pybind11/detail/type_caster_base.h",
+stl_headers = {
+    "include/pybind11/stl/filesystem.h",
 cmake_files = {
@@ -58,11 +64,14 @@
+    "_version.pyi",
+    "py.typed",
+    "setup_helpers.pyi",
-headers = main_headers | detail_headers
+headers = main_headers | detail_headers | stl_headers
 src_files = headers | cmake_files
 all_files = src_files | py_files
@@ -72,6 +81,7 @@
+    "pybind11/include/pybind11/stl",
@@ -80,7 +90,7 @@
-    "README.md",
+    "README.rst",
@@ -116,7 +126,7 @@ def test_build_sdist(monkeypatch, tmpdir):
     with tarfile.open(str(sdist)) as tar:
         start = tar.getnames()[0] + "/"
         version = start[9:-1]
-        simpler = set(n.split("/", 1)[-1] for n in tar.getnames()[1:])
+        simpler = {n.split("/", 1)[-1] for n in tar.getnames()[1:]}
         with contextlib.closing(
             tar.extractfile(tar.getmember(start + "setup.py"))
@@ -128,9 +138,19 @@ def test_build_sdist(monkeypatch, tmpdir):
         ) as f:
             pyproject_toml = f.read()
-    files = set("pybind11/{}".format(n) for n in all_files)
+        with contextlib.closing(
+            tar.extractfile(
+                tar.getmember(
+                    start + "pybind11/share/cmake/pybind11/pybind11Config.cmake"
+                )
+            )
+        ) as f:
+            contents = f.read().decode("utf8")
+        assert 'set(pybind11_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/include")' in contents
+    files = {"pybind11/{}".format(n) for n in all_files}
     files |= sdist_files
-    files |= set("pybind11{}".format(n) for n in local_sdist_files)
+    files |= {"pybind11{}".format(n) for n in local_sdist_files}
     assert simpler == files
@@ -141,11 +161,11 @@ def test_build_sdist(monkeypatch, tmpdir):
             .substitute(version=version, extra_cmd="")
-        assert setup_py == contents
+    assert setup_py == contents
     with open(os.path.join(MAIN_DIR, "tools", "pyproject.toml"), "rb") as f:
         contents = f.read()
-        assert pyproject_toml == contents
+    assert pyproject_toml == contents
 def test_build_global_dist(monkeypatch, tmpdir):
@@ -171,7 +191,7 @@ def test_build_global_dist(monkeypatch, tmpdir):
     with tarfile.open(str(sdist)) as tar:
         start = tar.getnames()[0] + "/"
         version = start[16:-1]
-        simpler = set(n.split("/", 1)[-1] for n in tar.getnames()[1:])
+        simpler = {n.split("/", 1)[-1] for n in tar.getnames()[1:]}
         with contextlib.closing(
             tar.extractfile(tar.getmember(start + "setup.py"))
@@ -183,9 +203,9 @@ def test_build_global_dist(monkeypatch, tmpdir):
         ) as f:
             pyproject_toml = f.read()
-    files = set("pybind11/{}".format(n) for n in all_files)
+    files = {"pybind11/{}".format(n) for n in all_files}
     files |= sdist_files
-    files |= set("pybind11_global{}".format(n) for n in local_sdist_files)
+    files |= {"pybind11_global{}".format(n) for n in local_sdist_files}
     assert simpler == files
     with open(os.path.join(MAIN_DIR, "tools", "setup_global.py.in"), "rb") as f:
@@ -210,7 +230,7 @@ def tests_build_wheel(monkeypatch, tmpdir):
     (wheel,) = tmpdir.visit("*.whl")
-    files = set("pybind11/{}".format(n) for n in all_files)
+    files = {"pybind11/{}".format(n) for n in all_files}
     files |= {
@@ -223,10 +243,10 @@ def tests_build_wheel(monkeypatch, tmpdir):
     with zipfile.ZipFile(str(wheel)) as z:
         names = z.namelist()
-    trimmed = set(n for n in names if "dist-info" not in n)
-    trimmed |= set(
+    trimmed = {n for n in names if "dist-info" not in n}
+    trimmed |= {
         "dist-info/{}".format(n.split("/", 1)[-1]) for n in names if "dist-info" in n
-    )
+    }
     assert files == trimmed
@@ -240,8 +260,8 @@ def tests_build_global_wheel(monkeypatch, tmpdir):
     (wheel,) = tmpdir.visit("*.whl")
-    files = set("data/data/{}".format(n) for n in src_files)
-    files |= set("data/headers/{}".format(n[8:]) for n in headers)
+    files = {"data/data/{}".format(n) for n in src_files}
+    files |= {"data/headers/{}".format(n[8:]) for n in headers}
     files |= {
@@ -254,6 +274,6 @@ def tests_build_global_wheel(monkeypatch, tmpdir):
         names = z.namelist()
     beginning = names[0].split("/", 1)[0].rsplit(".", 1)[0]
-    trimmed = set(n[len(beginning) + 1 :] for n in names)
+    trimmed = {n[len(beginning) + 1 :] for n in names}
     assert files == trimmed
diff --git a/wrap/pybind11/tests/extra_setuptools/test_setuphelper.py b/wrap/pybind11/tests/extra_setuptools/test_setuphelper.py
index de0b516a9f..788f368b14 100644
--- a/wrap/pybind11/tests/extra_setuptools/test_setuphelper.py
+++ b/wrap/pybind11/tests/extra_setuptools/test_setuphelper.py
@@ -1,17 +1,19 @@
 # -*- coding: utf-8 -*-
 import os
-import sys
 import subprocess
+import sys
 from textwrap import dedent
 import pytest
 DIR = os.path.abspath(os.path.dirname(__file__))
 MAIN_DIR = os.path.dirname(os.path.dirname(DIR))
+WIN = sys.platform.startswith("win32") or sys.platform.startswith("cygwin")
+@pytest.mark.parametrize("parallel", [False, True])
 @pytest.mark.parametrize("std", [11, 0])
-def test_simple_setup_py(monkeypatch, tmpdir, std):
+def test_simple_setup_py(monkeypatch, tmpdir, parallel, std):
@@ -39,13 +41,18 @@ def test_simple_setup_py(monkeypatch, tmpdir, std):
                 cmdclass["build_ext"] = build_ext
+            parallel = {parallel}
+            if parallel:
+                from pybind11.setup_helpers import ParallelCompile
+                ParallelCompile().install()
-        ).format(MAIN_DIR=MAIN_DIR, std=std),
+        ).format(MAIN_DIR=MAIN_DIR, std=std, parallel=parallel),
@@ -65,13 +72,20 @@ def test_simple_setup_py(monkeypatch, tmpdir, std):
-    subprocess.check_call(
+    out = subprocess.check_output(
         [sys.executable, "setup.py", "build_ext", "--inplace"],
-        stdout=sys.stdout,
-        stderr=sys.stderr,
+    if not WIN:
+        assert b"-g0" in out
+    out = subprocess.check_output(
+        [sys.executable, "setup.py", "build_ext", "--inplace", "--force"],
+        env=dict(os.environ, CFLAGS="-g"),
+    )
+    if not WIN:
+        assert b"-g0" not in out
     # Debug helper printout, normally hidden
+    print(out)
     for item in tmpdir.listdir():
@@ -93,3 +107,45 @@ def test_simple_setup_py(monkeypatch, tmpdir, std):
         [sys.executable, "test.py"], stdout=sys.stdout, stderr=sys.stderr
+def test_intree_extensions(monkeypatch, tmpdir):
+    monkeypatch.syspath_prepend(MAIN_DIR)
+    from pybind11.setup_helpers import intree_extensions
+    monkeypatch.chdir(tmpdir)
+    root = tmpdir
+    root.ensure_dir()
+    subdir = root / "dir"
+    subdir.ensure_dir()
+    src = subdir / "ext.cpp"
+    src.ensure()
+    (ext,) = intree_extensions([src.relto(tmpdir)])
+    assert ext.name == "ext"
+    subdir.ensure("__init__.py")
+    (ext,) = intree_extensions([src.relto(tmpdir)])
+    assert ext.name == "dir.ext"
+def test_intree_extensions_package_dir(monkeypatch, tmpdir):
+    monkeypatch.syspath_prepend(MAIN_DIR)
+    from pybind11.setup_helpers import intree_extensions
+    monkeypatch.chdir(tmpdir)
+    root = tmpdir / "src"
+    root.ensure_dir()
+    subdir = root / "dir"
+    subdir.ensure_dir()
+    src = subdir / "ext.cpp"
+    src.ensure()
+    (ext,) = intree_extensions([src.relto(tmpdir)], package_dir={"": "src"})
+    assert ext.name == "dir.ext"
+    (ext,) = intree_extensions([src.relto(tmpdir)], package_dir={"foo": "src"})
+    assert ext.name == "foo.dir.ext"
+    subdir.ensure("__init__.py")
+    (ext,) = intree_extensions([src.relto(tmpdir)], package_dir={"": "src"})
+    assert ext.name == "dir.ext"
+    (ext,) = intree_extensions([src.relto(tmpdir)], package_dir={"foo": "src"})
+    assert ext.name == "foo.dir.ext"
diff --git a/wrap/pybind11/tests/local_bindings.h b/wrap/pybind11/tests/local_bindings.h
index 22537b13ad..4c936c19a5 100644
--- a/wrap/pybind11/tests/local_bindings.h
+++ b/wrap/pybind11/tests/local_bindings.h
@@ -1,10 +1,12 @@
 #pragma once
+#include <utility>
 #include "pybind11_tests.h"
 /// Simple class used to test py::local:
 template <int> class LocalBase {
-    LocalBase(int i) : i(i) { }
+    explicit LocalBase(int i) : i(i) { }
     int i = -1;
@@ -33,6 +35,25 @@ using NonLocalVec2 = std::vector<NonLocal2>;
 using NonLocalMap = std::unordered_map<std::string, NonLocalType>;
 using NonLocalMap2 = std::unordered_map<std::string, uint8_t>;
+// Exception that will be caught via the module local translator.
+class LocalException : public std::exception {
+    explicit LocalException(const char * m) : message{m} {}
+    const char * what() const noexcept override {return message.c_str();}
+    std::string message = "";
+// Exception that will be registered with register_local_exception_translator
+class LocalSimpleException : public std::exception {
+    explicit LocalSimpleException(const char * m) : message{m} {}
+    const char * what() const noexcept override {return message.c_str();}
+    std::string message = "";
@@ -54,11 +75,11 @@ py::class_<T> bind_local(Args && ...args) {
 namespace pets {
 class Pet {
-    Pet(std::string name) : name_(name) {}
+    explicit Pet(std::string name) : name_(std::move(name)) {}
     std::string name_;
-    const std::string &name() { return name_; }
+    const std::string &name() const { return name_; }
 } // namespace pets
-struct MixGL { int i; MixGL(int i) : i{i} {} };
-struct MixGL2 { int i; MixGL2(int i) : i{i} {} };
+struct MixGL { int i; explicit MixGL(int i) : i{i} {} };
+struct MixGL2 { int i; explicit MixGL2(int i) : i{i} {} };
diff --git a/wrap/pybind11/tests/object.h b/wrap/pybind11/tests/object.h
index 9fbbc69f0f..df34c2bad0 100644
--- a/wrap/pybind11/tests/object.h
+++ b/wrap/pybind11/tests/object.h
@@ -64,7 +64,7 @@ template <typename T> class ref {
     ref() : m_ptr(nullptr) { print_default_created(this); track_default_created((ref_tag*) this); }
     /// Construct a reference from a pointer
-    ref(T *ptr) : m_ptr(ptr) {
+    explicit ref(T *ptr) : m_ptr(ptr) {
         if (m_ptr) ((Object *) m_ptr)->incRef();
         print_created(this, "from pointer", m_ptr); track_created((ref_tag*) this, "from pointer");
@@ -80,7 +80,7 @@ template <typename T> class ref {
     /// Move constructor
-    ref(ref &&r) : m_ptr(r.m_ptr) {
+    ref(ref &&r) noexcept : m_ptr(r.m_ptr) {
         r.m_ptr = nullptr;
         print_move_created(this, "with pointer", m_ptr); track_move_created((ref_tag*) this);
@@ -95,7 +95,7 @@ template <typename T> class ref {
     /// Move another reference into the current one
-    ref& operator=(ref&& r) {
+    ref &operator=(ref &&r) noexcept {
         print_move_assigned(this, "pointer", r.m_ptr); track_move_assigned((ref_tag*) this);
         if (*this == r)
@@ -109,7 +109,11 @@ template <typename T> class ref {
     /// Overwrite this reference with another reference
     ref& operator=(const ref& r) {
-        print_copy_assigned(this, "pointer", r.m_ptr); track_copy_assigned((ref_tag*) this);
+        if (this == &r) {
+            return *this;
+        }
+        print_copy_assigned(this, "pointer", r.m_ptr);
+        track_copy_assigned((ref_tag *) this);
         if (m_ptr == r.m_ptr)
             return *this;
@@ -160,7 +164,7 @@ template <typename T> class ref {
     const T& operator*() const { return *m_ptr; }
     /// Return a pointer to the referenced object
-    operator T* () { return m_ptr; }
+    explicit operator T* () { return m_ptr; }
     /// Return a const pointer to the referenced object
     T* get_ptr() { return m_ptr; }
diff --git a/wrap/pybind11/tests/pybind11_cross_module_tests.cpp b/wrap/pybind11/tests/pybind11_cross_module_tests.cpp
index f705e31061..5838cb2746 100644
--- a/wrap/pybind11/tests/pybind11_cross_module_tests.cpp
+++ b/wrap/pybind11/tests/pybind11_cross_module_tests.cpp
@@ -9,8 +9,12 @@
 #include "pybind11_tests.h"
 #include "local_bindings.h"
+#include "test_exceptions.h"
 #include <pybind11/stl_bind.h>
 #include <numeric>
+#include <utility>
 PYBIND11_MODULE(pybind11_cross_module_tests, m) {
     m.doc() = "pybind11 cross-module test module";
@@ -25,11 +29,32 @@ PYBIND11_MODULE(pybind11_cross_module_tests, m) {
     bind_local<ExternalType2>(m, "ExternalType2", py::module_local());
     // test_exceptions.py
+    py::register_local_exception<LocalSimpleException>(m, "LocalSimpleException");
     m.def("raise_runtime_error", []() { PyErr_SetString(PyExc_RuntimeError, "My runtime error"); throw py::error_already_set(); });
     m.def("raise_value_error", []() { PyErr_SetString(PyExc_ValueError, "My value error"); throw py::error_already_set(); });
     m.def("throw_pybind_value_error", []() { throw py::value_error("pybind11 value error"); });
     m.def("throw_pybind_type_error", []() { throw py::type_error("pybind11 type error"); });
     m.def("throw_stop_iteration", []() { throw py::stop_iteration(); });
+    m.def("throw_local_error", []() { throw LocalException("just local"); });
+    m.def("throw_local_simple_error", []() { throw LocalSimpleException("external mod"); });
+    py::register_exception_translator([](std::exception_ptr p) {
+      try {
+          if (p) std::rethrow_exception(p);
+      } catch (const shared_exception &e) {
+          PyErr_SetString(PyExc_KeyError, e.what());
+      }
+    });
+    // translate the local exception into a key error but only in this module
+    py::register_local_exception_translator([](std::exception_ptr p) {
+      try {
+          if (p) {
+            std::rethrow_exception(p);
+          }
+      } catch (const LocalException &e) {
+        PyErr_SetString(PyExc_KeyError, e.what());
+      }
+    });
     // test_local_bindings.py
     // Local to both:
@@ -83,7 +108,7 @@ PYBIND11_MODULE(pybind11_cross_module_tests, m) {
     m.def("get_mixed_lg", [](int i) { return MixedLocalGlobal(i); });
     // test_internal_locals_differ
-    m.def("local_cpp_types_addr", []() { return (uintptr_t) &py::detail::registered_local_types_cpp(); });
+    m.def("local_cpp_types_addr", []() { return (uintptr_t) &py::detail::get_local_internals().registered_types_cpp; });
     // test_stl_caster_vs_stl_bind
     py::bind_vector<std::vector<int>>(m, "VectorInt");
@@ -96,7 +121,10 @@ PYBIND11_MODULE(pybind11_cross_module_tests, m) {
     m.def("return_self", [](LocalVec *v) { return v; });
     m.def("return_copy", [](const LocalVec &v) { return LocalVec(v); });
-    class Dog : public pets::Pet { public: Dog(std::string name) : Pet(name) {}; };
+    class Dog : public pets::Pet {
+    public:
+        explicit Dog(std::string name) : Pet(std::move(name)) {}
+    };
     py::class_<pets::Pet>(m, "Pet", py::module_local())
         .def("name", &pets::Pet::name);
     // Binding for local extending class:
@@ -118,6 +146,6 @@ PYBIND11_MODULE(pybind11_cross_module_tests, m) {
     // test_missing_header_message
     // The main module already includes stl.h, but we need to test the error message
     // which appears when this header is missing.
-    m.def("missing_header_arg", [](std::vector<float>) { });
+    m.def("missing_header_arg", [](const std::vector<float> &) {});
     m.def("missing_header_return", []() { return std::vector<float>(); });
diff --git a/wrap/pybind11/tests/pybind11_tests.cpp b/wrap/pybind11/tests/pybind11_tests.cpp
index 24b65df6ff..439cd40129 100644
--- a/wrap/pybind11/tests/pybind11_tests.cpp
+++ b/wrap/pybind11/tests/pybind11_tests.cpp
@@ -26,8 +26,8 @@ productively.
 Instead, see the "How can I reduce the build time?" question in the "Frequently asked questions"
 section of the documentation for good practice on splitting binding code over multiple files.
-std::list<std::function<void(py::module &)>> &initializers() {
-    static std::list<std::function<void(py::module &)>> inits;
+std::list<std::function<void(py::module_ &)>> &initializers() {
+    static std::list<std::function<void(py::module_ &)>> inits;
     return inits;
@@ -36,13 +36,13 @@ test_initializer::test_initializer(Initializer init) {
 test_initializer::test_initializer(const char *submodule_name, Initializer init) {
-    initializers().emplace_back([=](py::module &parent) {
+    initializers().emplace_back([=](py::module_ &parent) {
         auto m = parent.def_submodule(submodule_name);
-void bind_ConstructorStats(py::module &m) {
+void bind_ConstructorStats(py::module_ &m) {
     py::class_<ConstructorStats>(m, "ConstructorStats")
         .def("alive", &ConstructorStats::alive)
         .def("values", &ConstructorStats::values)
diff --git a/wrap/pybind11/tests/pybind11_tests.h b/wrap/pybind11/tests/pybind11_tests.h
index 1e47416270..9b99923237 100644
--- a/wrap/pybind11/tests/pybind11_tests.h
+++ b/wrap/pybind11/tests/pybind11_tests.h
@@ -1,27 +1,29 @@
 #pragma once
 #include <pybind11/pybind11.h>
+#include <pybind11/eval.h>
 #if defined(_MSC_VER) && _MSC_VER < 1910
 // We get some really long type names here which causes MSVC 2015 to emit warnings
-#  pragma warning(disable: 4503) // warning C4503: decorated name length exceeded, name was truncated
+#    pragma warning(                                                                              \
+        disable : 4503) // warning C4503: decorated name length exceeded, name was truncated
 namespace py = pybind11;
 using namespace pybind11::literals;
 class test_initializer {
-    using Initializer = void (*)(py::module &);
+    using Initializer = void (*)(py::module_ &);
-    test_initializer(Initializer init);
+    explicit test_initializer(Initializer init);
     test_initializer(const char *submodule_name, Initializer init);
-#define TEST_SUBMODULE(name, variable)                   \
-    void test_submodule_##name(py::module &);            \
-    test_initializer name(#name, test_submodule_##name); \
-    void test_submodule_##name(py::module &variable)
+#define TEST_SUBMODULE(name, variable)                                                            \
+    void test_submodule_##name(py::module_ &);                                                    \
+    test_initializer name(#name, test_submodule_##name);                                          \
+    void test_submodule_##name(py::module_ &(variable))
 /// Dummy type which is not exported anywhere -- something to trigger a conversion error
 struct UnregisteredType { };
@@ -30,7 +32,7 @@ struct UnregisteredType { };
 class UserType {
     UserType() = default;
-    UserType(int i) : i(i) { }
+    explicit UserType(int i) : i(i) { }
     int value() const { return i; }
     void set(int set) { i = set; }
@@ -50,6 +52,12 @@ class IncType : public UserType {
     IncType &operator=(IncType &&) = delete;
+/// A simple union for basic testing
+union IntFloat {
+    int i;
+    float f;
 /// Custom cast-only type that casts to a string "rvalue" or "lvalue" depending on the cast context.
 /// Used to test recursive casters (e.g. std::tuple, stl containers).
 struct RValueCaster {};
@@ -57,9 +65,21 @@ PYBIND11_NAMESPACE_BEGIN(pybind11)
 template<> class type_caster<RValueCaster> {
-    PYBIND11_TYPE_CASTER(RValueCaster, _("RValueCaster"));
+    PYBIND11_TYPE_CASTER(RValueCaster, const_name("RValueCaster"));
     static handle cast(RValueCaster &&, return_value_policy, handle) { return py::str("rvalue").release(); }
     static handle cast(const RValueCaster &, return_value_policy, handle) { return py::str("lvalue").release(); }
+template <typename F>
+void ignoreOldStyleInitWarnings(F &&body) {
+    py::exec(R"(
+    message = "pybind11-bound class '.+' is using an old-style placement-new '(?:__init__|__setstate__)' which has been deprecated"
+    import warnings
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message=message, category=FutureWarning)
+        body()
+    )", py::dict(py::arg("body") = py::cpp_function(body)));
diff --git a/wrap/pybind11/tests/pytest.ini b/wrap/pybind11/tests/pytest.ini
index c47cbe9c1e..a3871d6c3a 100644
--- a/wrap/pybind11/tests/pytest.ini
+++ b/wrap/pybind11/tests/pytest.ini
@@ -7,11 +7,11 @@ addopts =
     # capture only Python print and C++ py::print, but not C output (low-level Python errors)
-    # enable all warnings
-    -Wa
 filterwarnings =
     # make warnings into errors but ignore certain third-party extension issues
+    # somehow, some DeprecationWarnings do not get turned into errors
+    always::DeprecationWarning
     # importing scipy submodules on some version of Python
     # bogus numpy ABI warning (see numpy/#432)
diff --git a/wrap/pybind11/tests/requirements.txt b/wrap/pybind11/tests/requirements.txt
index 39bd57a1c7..98ca46d28a 100644
--- a/wrap/pybind11/tests/requirements.txt
+++ b/wrap/pybind11/tests/requirements.txt
@@ -1,8 +1,12 @@
---extra-index-url https://antocuni.github.io/pypy-wheels/manylinux2010/
-numpy==1.16.6; python_version<"3.6"
-numpy==1.18.0; platform_python_implementation=="PyPy" and sys_platform=="darwin" and python_version>="3.6"
-numpy==1.19.1; (platform_python_implementation!="PyPy" or sys_platform!="darwin") and python_version>="3.6" and python_version<"3.9"
+numpy==1.16.6; python_version<"3.6" and sys_platform!="win32" and platform_python_implementation!="PyPy"
+numpy==1.19.0; platform_python_implementation=="PyPy" and sys_platform=="linux" and python_version=="3.6"
+numpy==1.20.0; platform_python_implementation=="PyPy" and sys_platform=="linux" and python_version=="3.7"
+numpy==1.19.3; platform_python_implementation!="PyPy" and python_version=="3.6"
+numpy==1.21.3; platform_python_implementation!="PyPy" and python_version>="3.7" and python_version<"3.11"
+py @ git+https://github.com/pytest-dev/py; python_version>="3.11"
 pytest==4.6.9; python_version<"3.5"
-pytest==5.4.3; python_version>="3.5"
-scipy==1.2.3; (platform_python_implementation!="PyPy" or sys_platform!="darwin") and python_version<"3.6"
-scipy==1.5.2; (platform_python_implementation!="PyPy" or sys_platform!="darwin") and python_version>="3.6" and python_version<"3.9"
+pytest==6.1.2; python_version=="3.5"
+pytest==6.2.4; python_version>="3.6"
+scipy==1.2.3; platform_python_implementation!="PyPy" and python_version<"3.6"
+scipy==1.5.4; platform_python_implementation!="PyPy" and python_version>="3.6" and python_version<"3.10"
diff --git a/wrap/pybind11/tests/test_async.cpp b/wrap/pybind11/tests/test_async.cpp
index f0ad0d5350..e6e01d72c9 100644
--- a/wrap/pybind11/tests/test_async.cpp
+++ b/wrap/pybind11/tests/test_async.cpp
@@ -18,7 +18,7 @@ TEST_SUBMODULE(async_module, m) {
         .def("__await__", [](const SupportsAsync& self) -> py::object {
-            py::object loop = py::module::import("asyncio.events").attr("get_event_loop")();
+            py::object loop = py::module_::import("asyncio.events").attr("get_event_loop")();
             py::object f = loop.attr("create_future")();
             return f.attr("__await__")();
diff --git a/wrap/pybind11/tests/test_buffers.cpp b/wrap/pybind11/tests/test_buffers.cpp
index 1bc67ff7b6..3a8e3e7b75 100644
--- a/wrap/pybind11/tests/test_buffers.cpp
+++ b/wrap/pybind11/tests/test_buffers.cpp
@@ -9,12 +9,13 @@
 #include "pybind11_tests.h"
 #include "constructor_stats.h"
+#include <pybind11/stl.h>
 TEST_SUBMODULE(buffers, m) {
     // test_from_python / test_to_python:
     class Matrix {
-        Matrix(ssize_t rows, ssize_t cols) : m_rows(rows), m_cols(cols) {
+        Matrix(py::ssize_t rows, py::ssize_t cols) : m_rows(rows), m_cols(cols) {
             print_created(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
             m_data = new float[(size_t) (rows*cols)];
             memset(m_data, 0, sizeof(float) * (size_t) (rows * cols));
@@ -26,7 +27,7 @@ TEST_SUBMODULE(buffers, m) {
             memcpy(m_data, s.m_data, sizeof(float) * (size_t) (m_rows * m_cols));
-        Matrix(Matrix &&s) : m_rows(s.m_rows), m_cols(s.m_cols), m_data(s.m_data) {
+        Matrix(Matrix &&s) noexcept : m_rows(s.m_rows), m_cols(s.m_cols), m_data(s.m_data) {
             s.m_rows = 0;
             s.m_cols = 0;
@@ -39,7 +40,11 @@ TEST_SUBMODULE(buffers, m) {
         Matrix &operator=(const Matrix &s) {
-            print_copy_assigned(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
+            if (this == &s) {
+                return *this;
+            }
+            print_copy_assigned(this,
+                                std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
             delete[] m_data;
             m_rows = s.m_rows;
             m_cols = s.m_cols;
@@ -48,7 +53,7 @@ TEST_SUBMODULE(buffers, m) {
             return *this;
-        Matrix &operator=(Matrix &&s) {
+        Matrix &operator=(Matrix &&s) noexcept {
             print_move_assigned(this, std::to_string(m_rows) + "x" + std::to_string(m_cols) + " matrix");
             if (&s != this) {
                 delete[] m_data;
@@ -58,27 +63,27 @@ TEST_SUBMODULE(buffers, m) {
             return *this;
-        float operator()(ssize_t i, ssize_t j) const {
+        float operator()(py::ssize_t i, py::ssize_t j) const {
             return m_data[(size_t) (i*m_cols + j)];
-        float &operator()(ssize_t i, ssize_t j) {
+        float &operator()(py::ssize_t i, py::ssize_t j) {
             return m_data[(size_t) (i*m_cols + j)];
         float *data() { return m_data; }
-        ssize_t rows() const { return m_rows; }
-        ssize_t cols() const { return m_cols; }
+        py::ssize_t rows() const { return m_rows; }
+        py::ssize_t cols() const { return m_cols; }
-        ssize_t m_rows;
-        ssize_t m_cols;
+        py::ssize_t m_rows;
+        py::ssize_t m_cols;
         float *m_data;
     py::class_<Matrix>(m, "Matrix", py::buffer_protocol())
-        .def(py::init<ssize_t, ssize_t>())
+        .def(py::init<py::ssize_t, py::ssize_t>())
         /// Construct from a buffer
-        .def(py::init([](py::buffer const b) {
+        .def(py::init([](const py::buffer &b) {
             py::buffer_info info = b.request();
             if (info.format != py::format_descriptor<float>::format() || info.ndim != 2)
                 throw std::runtime_error("Incompatible buffer format!");
@@ -88,40 +93,40 @@ TEST_SUBMODULE(buffers, m) {
             return v;
-       .def("rows", &Matrix::rows)
-       .def("cols", &Matrix::cols)
+        .def("rows", &Matrix::rows)
+        .def("cols", &Matrix::cols)
         /// Bare bones interface
-       .def("__getitem__", [](const Matrix &m, std::pair<ssize_t, ssize_t> i) {
-            if (i.first >= m.rows() || i.second >= m.cols())
-                throw py::index_error();
-            return m(i.first, i.second);
-        })
-       .def("__setitem__", [](Matrix &m, std::pair<ssize_t, ssize_t> i, float v) {
-            if (i.first >= m.rows() || i.second >= m.cols())
-                throw py::index_error();
-            m(i.first, i.second) = v;
-        })
-       /// Provide buffer access
-       .def_buffer([](Matrix &m) -> py::buffer_info {
+        .def("__getitem__",
+             [](const Matrix &m, std::pair<py::ssize_t, py::ssize_t> i) {
+                 if (i.first >= m.rows() || i.second >= m.cols())
+                     throw py::index_error();
+                 return m(i.first, i.second);
+             })
+        .def("__setitem__",
+             [](Matrix &m, std::pair<py::ssize_t, py::ssize_t> i, float v) {
+                 if (i.first >= m.rows() || i.second >= m.cols())
+                     throw py::index_error();
+                 m(i.first, i.second) = v;
+             })
+        /// Provide buffer access
+        .def_buffer([](Matrix &m) -> py::buffer_info {
             return py::buffer_info(
                 m.data(),                               /* Pointer to buffer */
                 { m.rows(), m.cols() },                 /* Buffer dimensions */
                 { sizeof(float) * size_t(m.cols()),     /* Strides (in bytes) for each index */
                   sizeof(float) }
-        })
-        ;
+        });
     // test_inherited_protocol
     class SquareMatrix : public Matrix {
-        SquareMatrix(ssize_t n) : Matrix(n, n) { }
+        explicit SquareMatrix(py::ssize_t n) : Matrix(n, n) {}
     // Derived classes inherit the buffer protocol and the buffer access function
     py::class_<SquareMatrix, Matrix>(m, "SquareMatrix")
-        .def(py::init<ssize_t>());
+        .def(py::init<py::ssize_t>());
     // test_pointer_to_member_fn
@@ -153,7 +158,7 @@ TEST_SUBMODULE(buffers, m) {
                                    py::format_descriptor<int32_t>::format(), 1);
-        ConstBuffer() : value(new int32_t{0}) { };
+        ConstBuffer() : value(new int32_t{0}) {}
     py::class_<ConstBuffer>(m, "ConstBuffer", py::buffer_protocol())
@@ -168,7 +173,7 @@ TEST_SUBMODULE(buffers, m) {
     struct BufferReadOnly {
         const uint8_t value = 0;
-        BufferReadOnly(uint8_t value): value(value) {}
+        explicit BufferReadOnly(uint8_t value) : value(value) {}
         py::buffer_info get_buffer_info() {
             return py::buffer_info(&value, 1);
@@ -192,4 +197,20 @@ TEST_SUBMODULE(buffers, m) {
         .def_readwrite("readonly", &BufferReadOnlySelect::readonly)
+    // Expose buffer_info for testing.
+    py::class_<py::buffer_info>(m, "buffer_info")
+        .def(py::init<>())
+        .def_readonly("itemsize", &py::buffer_info::itemsize)
+        .def_readonly("size", &py::buffer_info::size)
+        .def_readonly("format", &py::buffer_info::format)
+        .def_readonly("ndim", &py::buffer_info::ndim)
+        .def_readonly("shape", &py::buffer_info::shape)
+        .def_readonly("strides", &py::buffer_info::strides)
+        .def_readonly("readonly", &py::buffer_info::readonly)
+        .def("__repr__", [](py::handle self) {
+             return py::str("itemsize={0.itemsize!r}, size={0.size!r}, format={0.format!r}, ndim={0.ndim!r}, shape={0.shape!r}, strides={0.strides!r}, readonly={0.readonly!r}").format(self);
+        })
+        ;
+    m.def("get_buffer_info", [](const py::buffer &buffer) { return buffer.request(); });
diff --git a/wrap/pybind11/tests/test_buffers.py b/wrap/pybind11/tests/test_buffers.py
index d6adaf1f5e..0d5bf16c3d 100644
--- a/wrap/pybind11/tests/test_buffers.py
+++ b/wrap/pybind11/tests/test_buffers.py
@@ -1,13 +1,13 @@
 # -*- coding: utf-8 -*-
+import ctypes
 import io
 import struct
 import pytest
-import env  # noqa: F401
-from pybind11_tests import buffers as m
+import env
 from pybind11_tests import ConstructorStats
+from pybind11_tests import buffers as m
 np = pytest.importorskip("numpy")
@@ -36,6 +36,10 @@ def test_from_python():
 # https://foss.heptapod.net/pypy/pypy/-/issues/2444
+# TODO: fix on recent PyPy
+    env.PYPY, reason="PyPy 7.3.7 doesn't clear this anymore", strict=False
 def test_to_python():
     mat = m.Matrix(5, 4)
     assert memoryview(mat).shape == (5, 4)
@@ -45,8 +49,8 @@ def test_to_python():
     mat[3, 2] = 7.0
     assert mat[2, 3] == 4
     assert mat[3, 2] == 7
-    assert struct.unpack_from('f', mat, (3 * 4 + 2) * 4) == (7, )
-    assert struct.unpack_from('f', mat, (2 * 4 + 3) * 4) == (4, )
+    assert struct.unpack_from("f", mat, (3 * 4 + 2) * 4) == (7,)
+    assert struct.unpack_from("f", mat, (2 * 4 + 3) * 4) == (4,)
     mat2 = np.array(mat, copy=False)
     assert mat2.shape == (5, 4)
@@ -82,28 +86,82 @@ def test_pointer_to_member_fn():
     for cls in [m.Buffer, m.ConstBuffer, m.DerivedBuffer]:
         buf = cls()
         buf.value = 0x12345678
-        value = struct.unpack('i', bytearray(buf))[0]
+        value = struct.unpack("i", bytearray(buf))[0]
         assert value == 0x12345678
 def test_readonly_buffer():
     buf = m.BufferReadOnly(0x64)
     view = memoryview(buf)
-    assert view[0] == b'd' if env.PY2 else 0x64
+    assert view[0] == b"d" if env.PY2 else 0x64
     assert view.readonly
+    with pytest.raises(TypeError):
+        view[0] = b"\0" if env.PY2 else 0
 def test_selective_readonly_buffer():
     buf = m.BufferReadOnlySelect()
-    memoryview(buf)[0] = b'd' if env.PY2 else 0x64
+    memoryview(buf)[0] = b"d" if env.PY2 else 0x64
     assert buf.value == 0x64
-    io.BytesIO(b'A').readinto(buf)
-    assert buf.value == ord(b'A')
+    io.BytesIO(b"A").readinto(buf)
+    assert buf.value == ord(b"A")
     buf.readonly = True
     with pytest.raises(TypeError):
-        memoryview(buf)[0] = b'\0' if env.PY2 else 0
+        memoryview(buf)[0] = b"\0" if env.PY2 else 0
     with pytest.raises(TypeError):
-        io.BytesIO(b'1').readinto(buf)
+        io.BytesIO(b"1").readinto(buf)
+def test_ctypes_array_1d():
+    char1d = (ctypes.c_char * 10)()
+    int1d = (ctypes.c_int * 15)()
+    long1d = (ctypes.c_long * 7)()
+    for carray in (char1d, int1d, long1d):
+        info = m.get_buffer_info(carray)
+        assert info.itemsize == ctypes.sizeof(carray._type_)
+        assert info.size == len(carray)
+        assert info.ndim == 1
+        assert info.shape == [info.size]
+        assert info.strides == [info.itemsize]
+        assert not info.readonly
+def test_ctypes_array_2d():
+    char2d = ((ctypes.c_char * 10) * 4)()
+    int2d = ((ctypes.c_int * 15) * 3)()
+    long2d = ((ctypes.c_long * 7) * 2)()
+    for carray in (char2d, int2d, long2d):
+        info = m.get_buffer_info(carray)
+        assert info.itemsize == ctypes.sizeof(carray[0]._type_)
+        assert info.size == len(carray) * len(carray[0])
+        assert info.ndim == 2
+        assert info.shape == [len(carray), len(carray[0])]
+        assert info.strides == [info.itemsize * len(carray[0]), info.itemsize]
+        assert not info.readonly
+    "env.PYPY and env.PY2", reason="PyPy2 bytes buffer not reported as readonly"
+def test_ctypes_from_buffer():
+    test_pystr = b"0123456789"
+    for pyarray in (test_pystr, bytearray(test_pystr)):
+        pyinfo = m.get_buffer_info(pyarray)
+        if pyinfo.readonly:
+            cbytes = (ctypes.c_char * len(pyarray)).from_buffer_copy(pyarray)
+            cinfo = m.get_buffer_info(cbytes)
+        else:
+            cbytes = (ctypes.c_char * len(pyarray)).from_buffer(pyarray)
+            cinfo = m.get_buffer_info(cbytes)
+        assert cinfo.size == pyinfo.size
+        assert cinfo.ndim == pyinfo.ndim
+        assert cinfo.shape == pyinfo.shape
+        assert cinfo.strides == pyinfo.strides
+        assert not cinfo.readonly
diff --git a/wrap/pybind11/tests/test_builtin_casters.cpp b/wrap/pybind11/tests/test_builtin_casters.cpp
index acc9f8fb36..4a9f338378 100644
--- a/wrap/pybind11/tests/test_builtin_casters.cpp
+++ b/wrap/pybind11/tests/test_builtin_casters.cpp
@@ -10,10 +10,64 @@
 #include "pybind11_tests.h"
 #include <pybind11/complex.h>
-#if defined(_MSC_VER)
-#  pragma warning(push)
-#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+struct ConstRefCasted {
+  int tag;
+template <>
+class type_caster<ConstRefCasted> {
+ public:
+  static constexpr auto name = const_name<ConstRefCasted>();
+  // Input is unimportant, a new value will always be constructed based on the
+  // cast operator.
+  bool load(handle, bool) { return true; }
+  explicit operator ConstRefCasted &&() {
+      value = {1};
+      // NOLINTNEXTLINE(performance-move-const-arg)
+      return std::move(value);
+  }
+  explicit operator ConstRefCasted &() {
+      value = {2};
+      return value;
+  }
+  explicit operator ConstRefCasted *() {
+      value = {3};
+      return &value;
+  }
+  explicit operator const ConstRefCasted &() {
+      value = {4};
+      return value;
+  }
+  explicit operator const ConstRefCasted *() {
+      value = {5};
+      return &value;
+  }
+  // custom cast_op to explicitly propagate types to the conversion operators.
+  template <typename T_>
+  using cast_op_type =
+      /// const
+      conditional_t<
+          std::is_same<remove_reference_t<T_>, const ConstRefCasted*>::value, const ConstRefCasted*,
+      conditional_t<
+          std::is_same<T_, const ConstRefCasted&>::value, const ConstRefCasted&,
+      /// non-const
+      conditional_t<
+          std::is_same<remove_reference_t<T_>, ConstRefCasted*>::value, ConstRefCasted*,
+      conditional_t<
+          std::is_same<T_, ConstRefCasted&>::value, ConstRefCasted&,
+          /* else */ConstRefCasted&&>>>>;
+ private:
+  ConstRefCasted value = {0};
 TEST_SUBMODULE(builtin_casters, m) {
     // test_simple_string
@@ -26,7 +80,7 @@ TEST_SUBMODULE(builtin_casters, m) {
     std::wstring wstr;
     wstr.push_back(0x61); // a
     wstr.push_back(0x2e18); // ⸘
-    if (sizeof(wchar_t) == 2) { wstr.push_back(mathbfA16_1); wstr.push_back(mathbfA16_2); } // 𝐀, utf16
+    if (PYBIND11_SILENCE_MSVC_C4127(sizeof(wchar_t) == 2)) { wstr.push_back(mathbfA16_1); wstr.push_back(mathbfA16_2); } // 𝐀, utf16
     else { wstr.push_back((wchar_t) mathbfA32); } // 𝐀, utf32
     wstr.push_back(0x7a); // z
@@ -36,11 +90,12 @@ TEST_SUBMODULE(builtin_casters, m) {
     m.def("good_wchar_string", [=]() { return wstr; }); // a‽𝐀z
     m.def("bad_utf8_string", []()  { return std::string("abc\xd0" "def"); });
     m.def("bad_utf16_string", [=]() { return std::u16string({ b16, char16_t(0xd800), z16 }); });
     // Under Python 2.7, invalid unicode UTF-32 characters don't appear to trigger UnicodeDecodeError
-    if (PY_MAJOR_VERSION >= 3)
-        m.def("bad_utf32_string", [=]() { return std::u32string({ a32, char32_t(0xd800), z32 }); });
-    if (PY_MAJOR_VERSION >= 3 || sizeof(wchar_t) == 2)
+    m.def("bad_utf32_string", [=]() { return std::u32string({ a32, char32_t(0xd800), z32 }); });
+    if (PYBIND11_SILENCE_MSVC_C4127(sizeof(wchar_t) == 2))
         m.def("bad_wchar_string", [=]() { return std::wstring({ wchar_t(0x61), wchar_t(0xd800) }); });
     m.def("u8_Z", []() -> char { return 'Z'; });
     m.def("u8_eacute", []() -> char { return '\xe9'; });
     m.def("u16_ibang", [=]() -> char16_t { return ib16; });
@@ -58,7 +113,7 @@ TEST_SUBMODULE(builtin_casters, m) {
     // test_bytes_to_string
     m.def("strlen", [](char *s) { return strlen(s); });
-    m.def("string_length", [](std::string s) { return s.length(); });
+    m.def("string_length", [](const std::string &s) { return s.length(); });
     m.attr("has_u8string") = true;
@@ -85,11 +140,35 @@ TEST_SUBMODULE(builtin_casters, m) {
     m.def("string_view16_return", []() { return std::u16string_view(u"utf16 secret \U0001f382"); });
     m.def("string_view32_return", []() { return std::u32string_view(U"utf32 secret \U0001f382"); });
+    // The inner lambdas here are to also test implicit conversion
+    using namespace std::literals;
+    m.def("string_view_bytes", []() { return [](py::bytes b) { return b; }("abc \x80\x80 def"sv); });
+    m.def("string_view_str",   []() { return [](py::str s) { return s; }("abc \342\200\275 def"sv); });
+    m.def("string_view_from_bytes", [](const py::bytes &b) { return [](std::string_view s) { return s; }(b); });
+    m.def("string_view_memoryview", []() {
+        static constexpr auto val = "Have some \360\237\216\202"sv;
+        return py::memoryview::from_memory(val);
+    });
 #   ifdef PYBIND11_HAS_U8STRING
     m.def("string_view8_print",  [](std::u8string_view s) { py::print(s, s.size()); });
     m.def("string_view8_chars",  [](std::u8string_view s) { py::list l; for (auto c : s) l.append((std::uint8_t) c); return l; });
     m.def("string_view8_return", []() { return std::u8string_view(u8"utf8 secret \U0001f382"); });
+    m.def("string_view8_str",    []() { return py::str{std::u8string_view{u8"abc ‽ def"}}; });
 #   endif
+    struct TypeWithBothOperatorStringAndStringView {
+        // NOLINTNEXTLINE(google-explicit-constructor)
+        operator std::string() const { return "success"; }
+        // NOLINTNEXTLINE(google-explicit-constructor)
+        operator std::string_view() const { return "failure"; }
+    };
+    m.def("bytes_from_type_with_both_operator_string_and_string_view",
+          []() { return py::bytes(TypeWithBothOperatorStringAndStringView()); });
+    m.def("str_from_type_with_both_operator_string_and_string_view",
+          []() { return py::str(TypeWithBothOperatorStringAndStringView()); });
     // test_integer_casting
@@ -98,10 +177,17 @@ TEST_SUBMODULE(builtin_casters, m) {
     m.def("i64_str", [](std::int64_t v) { return std::to_string(v); });
     m.def("u64_str", [](std::uint64_t v) { return std::to_string(v); });
+    // test_int_convert
+    m.def("int_passthrough", [](int arg) { return arg; });
+    m.def("int_passthrough_noconvert", [](int arg) { return arg; }, py::arg{}.noconvert());
     // test_tuple
-    m.def("pair_passthrough", [](std::pair<bool, std::string> input) {
-        return std::make_pair(input.second, input.first);
-    }, "Return a pair in reversed order");
+    m.def(
+        "pair_passthrough",
+        [](const std::pair<bool, std::string> &input) {
+            return std::make_pair(input.second, input.first);
+        },
+        "Return a pair in reversed order");
     m.def("tuple_passthrough", [](std::tuple<bool, std::string, int> input) {
         return std::make_tuple(std::get<2>(input), std::get<1>(input), std::get<0>(input));
     }, "Return a triple in reversed order");
@@ -130,23 +216,45 @@ TEST_SUBMODULE(builtin_casters, m) {
     // test_none_deferred
     m.def("defer_none_cstring", [](char *) { return false; });
-    m.def("defer_none_cstring", [](py::none) { return true; });
+    m.def("defer_none_cstring", [](const py::none &) { return true; });
     m.def("defer_none_custom", [](UserType *) { return false; });
-    m.def("defer_none_custom", [](py::none) { return true; });
+    m.def("defer_none_custom", [](const py::none &) { return true; });
     m.def("nodefer_none_void", [](void *) { return true; });
-    m.def("nodefer_none_void", [](py::none) { return false; });
+    m.def("nodefer_none_void", [](const py::none &) { return false; });
     // test_void_caster
     m.def("load_nullptr_t", [](std::nullptr_t) {}); // not useful, but it should still compile
     m.def("cast_nullptr_t", []() { return std::nullptr_t{}; });
+    // [workaround(intel)] ICC 20/21 breaks with py::arg().stuff, using py::arg{}.stuff works.
     // test_bool_caster
     m.def("bool_passthrough", [](bool arg) { return arg; });
-    m.def("bool_passthrough_noconvert", [](bool arg) { return arg; }, py::arg().noconvert());
+    m.def("bool_passthrough_noconvert", [](bool arg) { return arg; }, py::arg{}.noconvert());
+    // TODO: This should be disabled and fixed in future Intel compilers
+#if !defined(__INTEL_COMPILER)
+    // Test "bool_passthrough_noconvert" again, but using () instead of {} to construct py::arg
+    // When compiled with the Intel compiler, this results in segmentation faults when importing
+    // the module. Tested with icc (ICC) 2021.1 Beta 20200827, this should be tested again when
+    // a newer version of icc is available.
+    m.def("bool_passthrough_noconvert2", [](bool arg) { return arg; }, py::arg().noconvert());
     // test_reference_wrapper
     m.def("refwrap_builtin", [](std::reference_wrapper<int> p) { return 10 * p.get(); });
     m.def("refwrap_usertype", [](std::reference_wrapper<UserType> p) { return p.get().value(); });
+    m.def("refwrap_usertype_const", [](std::reference_wrapper<const UserType> p) { return p.get().value(); });
+    m.def("refwrap_lvalue", []() -> std::reference_wrapper<UserType> {
+        static UserType x(1);
+        return std::ref(x);
+    });
+    m.def("refwrap_lvalue_const", []() -> std::reference_wrapper<const UserType> {
+        static UserType x(1);
+        return std::cref(x);
+    });
     // Not currently supported (std::pair caster has return-by-value cast operator);
     // triggers static_assert failure.
     //m.def("refwrap_pair", [](std::reference_wrapper<std::pair<int, int>>) { });
@@ -162,7 +270,7 @@ TEST_SUBMODULE(builtin_casters, m) {
     }, "copy"_a);
     m.def("refwrap_iiw", [](const IncType &w) { return w.value(); });
-    m.def("refwrap_call_iiw", [](IncType &w, py::function f) {
+    m.def("refwrap_call_iiw", [](IncType &w, const py::function &f) {
         py::list l;
@@ -189,4 +297,14 @@ TEST_SUBMODULE(builtin_casters, m) {
         py::object o = py::cast(v);
         return py::cast<void *>(o) == v;
+    // Tests const/non-const propagation in cast_op.
+    m.def("takes", [](ConstRefCasted x) { return x.tag; });
+    m.def("takes_move", [](ConstRefCasted&& x) { return x.tag; });
+    m.def("takes_ptr", [](ConstRefCasted* x) { return x->tag; });
+    m.def("takes_ref", [](ConstRefCasted& x) { return x.tag; });
+    m.def("takes_ref_wrap", [](std::reference_wrapper<ConstRefCasted> x) { return x.get().tag; });
+    m.def("takes_const_ptr", [](const ConstRefCasted* x) { return x->tag; });
+    m.def("takes_const_ref", [](const ConstRefCasted& x) { return x.tag; });
+    m.def("takes_const_ref_wrap", [](std::reference_wrapper<const ConstRefCasted> x) { return x.get().tag; });
diff --git a/wrap/pybind11/tests/test_builtin_casters.py b/wrap/pybind11/tests/test_builtin_casters.py
index 08d38bc154..b1f1e395a7 100644
--- a/wrap/pybind11/tests/test_builtin_casters.py
+++ b/wrap/pybind11/tests/test_builtin_casters.py
@@ -1,10 +1,9 @@
 # -*- coding: utf-8 -*-
 import pytest
-import env  # noqa: F401
+import env
+from pybind11_tests import IncType, UserType
 from pybind11_tests import builtin_casters as m
-from pybind11_tests import UserType, IncType
 def test_simple_string():
@@ -37,79 +36,85 @@ def test_unicode_conversion():
         with pytest.raises(UnicodeDecodeError):
-    assert m.u8_Z() == 'Z'
-    assert m.u8_eacute() == u'é'
-    assert m.u16_ibang() == u'‽'
-    assert m.u32_mathbfA() == u'𝐀'
-    assert m.wchar_heart() == u'♥'
+    assert m.u8_Z() == "Z"
+    assert m.u8_eacute() == u"é"
+    assert m.u16_ibang() == u"‽"
+    assert m.u32_mathbfA() == u"𝐀"
+    assert m.wchar_heart() == u"♥"
     if hasattr(m, "has_u8string"):
-        assert m.u8_char8_Z() == 'Z'
+        assert m.u8_char8_Z() == "Z"
 def test_single_char_arguments():
     """Tests failures for passing invalid inputs to char-accepting functions"""
     def toobig_message(r):
-        return "Character code point not in range({0:#x})".format(r)
+        return "Character code point not in range({:#x})".format(r)
     toolong_message = "Expected a character, but multi-character string found"
-    assert m.ord_char(u'a') == 0x61  # simple ASCII
-    assert m.ord_char_lv(u'b') == 0x62
-    assert m.ord_char(u'é') == 0xE9  # requires 2 bytes in utf-8, but can be stuffed in a char
+    assert m.ord_char(u"a") == 0x61  # simple ASCII
+    assert m.ord_char_lv(u"b") == 0x62
+    assert (
+        m.ord_char(u"é") == 0xE9
+    )  # requires 2 bytes in utf-8, but can be stuffed in a char
     with pytest.raises(ValueError) as excinfo:
-        assert m.ord_char(u'Ā') == 0x100  # requires 2 bytes, doesn't fit in a char
+        assert m.ord_char(u"Ā") == 0x100  # requires 2 bytes, doesn't fit in a char
     assert str(excinfo.value) == toobig_message(0x100)
     with pytest.raises(ValueError) as excinfo:
-        assert m.ord_char(u'ab')
+        assert m.ord_char(u"ab")
     assert str(excinfo.value) == toolong_message
-    assert m.ord_char16(u'a') == 0x61
-    assert m.ord_char16(u'é') == 0xE9
-    assert m.ord_char16_lv(u'ê') == 0xEA
-    assert m.ord_char16(u'Ā') == 0x100
-    assert m.ord_char16(u'‽') == 0x203d
-    assert m.ord_char16(u'♥') == 0x2665
-    assert m.ord_char16_lv(u'♡') == 0x2661
+    assert m.ord_char16(u"a") == 0x61
+    assert m.ord_char16(u"é") == 0xE9
+    assert m.ord_char16_lv(u"ê") == 0xEA
+    assert m.ord_char16(u"Ā") == 0x100
+    assert m.ord_char16(u"‽") == 0x203D
+    assert m.ord_char16(u"♥") == 0x2665
+    assert m.ord_char16_lv(u"♡") == 0x2661
     with pytest.raises(ValueError) as excinfo:
-        assert m.ord_char16(u'🎂') == 0x1F382  # requires surrogate pair
+        assert m.ord_char16(u"🎂") == 0x1F382  # requires surrogate pair
     assert str(excinfo.value) == toobig_message(0x10000)
     with pytest.raises(ValueError) as excinfo:
-        assert m.ord_char16(u'aa')
+        assert m.ord_char16(u"aa")
     assert str(excinfo.value) == toolong_message
-    assert m.ord_char32(u'a') == 0x61
-    assert m.ord_char32(u'é') == 0xE9
-    assert m.ord_char32(u'Ā') == 0x100
-    assert m.ord_char32(u'‽') == 0x203d
-    assert m.ord_char32(u'♥') == 0x2665
-    assert m.ord_char32(u'🎂') == 0x1F382
+    assert m.ord_char32(u"a") == 0x61
+    assert m.ord_char32(u"é") == 0xE9
+    assert m.ord_char32(u"Ā") == 0x100
+    assert m.ord_char32(u"‽") == 0x203D
+    assert m.ord_char32(u"♥") == 0x2665
+    assert m.ord_char32(u"🎂") == 0x1F382
     with pytest.raises(ValueError) as excinfo:
-        assert m.ord_char32(u'aa')
+        assert m.ord_char32(u"aa")
     assert str(excinfo.value) == toolong_message
-    assert m.ord_wchar(u'a') == 0x61
-    assert m.ord_wchar(u'é') == 0xE9
-    assert m.ord_wchar(u'Ā') == 0x100
-    assert m.ord_wchar(u'‽') == 0x203d
-    assert m.ord_wchar(u'♥') == 0x2665
+    assert m.ord_wchar(u"a") == 0x61
+    assert m.ord_wchar(u"é") == 0xE9
+    assert m.ord_wchar(u"Ā") == 0x100
+    assert m.ord_wchar(u"‽") == 0x203D
+    assert m.ord_wchar(u"♥") == 0x2665
     if m.wchar_size == 2:
         with pytest.raises(ValueError) as excinfo:
-            assert m.ord_wchar(u'🎂') == 0x1F382  # requires surrogate pair
+            assert m.ord_wchar(u"🎂") == 0x1F382  # requires surrogate pair
         assert str(excinfo.value) == toobig_message(0x10000)
-        assert m.ord_wchar(u'🎂') == 0x1F382
+        assert m.ord_wchar(u"🎂") == 0x1F382
     with pytest.raises(ValueError) as excinfo:
-        assert m.ord_wchar(u'aa')
+        assert m.ord_wchar(u"aa")
     assert str(excinfo.value) == toolong_message
     if hasattr(m, "has_u8string"):
-        assert m.ord_char8(u'a') == 0x61  # simple ASCII
-        assert m.ord_char8_lv(u'b') == 0x62
-        assert m.ord_char8(u'é') == 0xE9  # requires 2 bytes in utf-8, but can be stuffed in a char
+        assert m.ord_char8(u"a") == 0x61  # simple ASCII
+        assert m.ord_char8_lv(u"b") == 0x62
+        assert (
+            m.ord_char8(u"é") == 0xE9
+        )  # requires 2 bytes in utf-8, but can be stuffed in a char
         with pytest.raises(ValueError) as excinfo:
-            assert m.ord_char8(u'Ā') == 0x100  # requires 2 bytes, doesn't fit in a char
+            assert m.ord_char8(u"Ā") == 0x100  # requires 2 bytes, doesn't fit in a char
         assert str(excinfo.value) == toobig_message(0x100)
         with pytest.raises(ValueError) as excinfo:
-            assert m.ord_char8(u'ab')
+            assert m.ord_char8(u"ab")
         assert str(excinfo.value) == toolong_message
@@ -129,19 +134,19 @@ def to_bytes(s):
     assert m.strlen(to_bytes("a\x00b")) == 1  # C-string limitation
     # passing in a utf8 encoded string should work
-    assert m.string_length(u'💩'.encode("utf8")) == 4
+    assert m.string_length(u"💩".encode("utf8")) == 4
 @pytest.mark.skipif(not hasattr(m, "has_string_view"), reason="no <string_view>")
 def test_string_view(capture):
     """Tests support for C++17 string_view arguments and return values"""
     assert m.string_view_chars("Hi") == [72, 105]
-    assert m.string_view_chars("Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
-    assert m.string_view16_chars(u"Hi 🎂") == [72, 105, 32, 0xd83c, 0xdf82]
+    assert m.string_view_chars("Hi 🎂") == [72, 105, 32, 0xF0, 0x9F, 0x8E, 0x82]
+    assert m.string_view16_chars(u"Hi 🎂") == [72, 105, 32, 0xD83C, 0xDF82]
     assert m.string_view32_chars(u"Hi 🎂") == [72, 105, 32, 127874]
     if hasattr(m, "has_u8string"):
         assert m.string_view8_chars("Hi") == [72, 105]
-        assert m.string_view8_chars(u"Hi 🎂") == [72, 105, 32, 0xf0, 0x9f, 0x8e, 0x82]
+        assert m.string_view8_chars(u"Hi 🎂") == [72, 105, 32, 0xF0, 0x9F, 0x8E, 0x82]
     assert m.string_view_return() == u"utf8 secret 🎂"
     assert m.string_view16_return() == u"utf16 secret 🎂"
@@ -154,40 +159,63 @@ def test_string_view(capture):
         m.string_view_print("utf8 🎂")
         m.string_view16_print(u"utf16 🎂")
         m.string_view32_print(u"utf32 🎂")
-    assert capture == u"""
+    assert (
+        capture
+        == u"""
         Hi 2
         utf8 🎂 9
         utf16 🎂 8
         utf32 🎂 7
+    )
     if hasattr(m, "has_u8string"):
         with capture:
             m.string_view8_print(u"utf8 🎂")
-        assert capture == u"""
+        assert (
+            capture
+            == u"""
             Hi 2
             utf8 🎂 9
+        )
     with capture:
         m.string_view_print("Hi, ascii")
         m.string_view_print("Hi, utf8 🎂")
         m.string_view16_print(u"Hi, utf16 🎂")
         m.string_view32_print(u"Hi, utf32 🎂")
-    assert capture == u"""
+    assert (
+        capture
+        == u"""
         Hi, ascii 9
         Hi, utf8 🎂 13
         Hi, utf16 🎂 12
         Hi, utf32 🎂 11
+    )
     if hasattr(m, "has_u8string"):
         with capture:
             m.string_view8_print("Hi, ascii")
             m.string_view8_print(u"Hi, utf8 🎂")
-        assert capture == u"""
+        assert (
+            capture
+            == u"""
             Hi, ascii 9
             Hi, utf8 🎂 13
+        )
+    assert m.string_view_bytes() == b"abc \x80\x80 def"
+    assert m.string_view_str() == u"abc ‽ def"
+    assert m.string_view_from_bytes(u"abc ‽ def".encode("utf-8")) == u"abc ‽ def"
+    if hasattr(m, "has_u8string"):
+        assert m.string_view8_str() == u"abc ‽ def"
+    if not env.PY2:
+        assert m.string_view_memoryview() == "Have some 🎂".encode()
+    assert m.bytes_from_type_with_both_operator_string_and_string_view() == b"success"
+    assert m.str_from_type_with_both_operator_string_and_string_view() == "success"
 def test_integer_casting():
@@ -199,8 +227,14 @@ def test_integer_casting():
     if env.PY2:
         assert m.i32_str(long(-1)) == "-1"  # noqa: F821 undefined name 'long'
         assert m.i64_str(long(-1)) == "-1"  # noqa: F821 undefined name 'long'
-        assert m.i64_str(long(-999999999999)) == "-999999999999"  # noqa: F821 undefined name
-        assert m.u64_str(long(999999999999)) == "999999999999"  # noqa: F821 undefined name 'long'
+        assert (
+            m.i64_str(long(-999999999999))  # noqa: F821 undefined name 'long'
+            == "-999999999999"
+        )
+        assert (
+            m.u64_str(long(999999999999))  # noqa: F821 undefined name 'long'
+            == "999999999999"
+        )
         assert m.i64_str(-999999999999) == "-999999999999"
         assert m.u64_str(999999999999) == "999999999999"
@@ -227,6 +261,101 @@ def test_integer_casting():
         assert "incompatible function arguments" in str(excinfo.value)
+def test_int_convert():
+    class Int(object):
+        def __int__(self):
+            return 42
+    class NotInt(object):
+        pass
+    class Float(object):
+        def __float__(self):
+            return 41.99999
+    class Index(object):
+        def __index__(self):
+            return 42
+    class IntAndIndex(object):
+        def __int__(self):
+            return 42
+        def __index__(self):
+            return 0
+    class RaisingTypeErrorOnIndex(object):
+        def __index__(self):
+            raise TypeError
+        def __int__(self):
+            return 42
+    class RaisingValueErrorOnIndex(object):
+        def __index__(self):
+            raise ValueError
+        def __int__(self):
+            return 42
+    convert, noconvert = m.int_passthrough, m.int_passthrough_noconvert
+    def requires_conversion(v):
+        pytest.raises(TypeError, noconvert, v)
+    def cant_convert(v):
+        pytest.raises(TypeError, convert, v)
+    assert convert(7) == 7
+    assert noconvert(7) == 7
+    cant_convert(3.14159)
+    # TODO: Avoid DeprecationWarning in `PyLong_AsLong` (and similar)
+    # TODO: PyPy 3.8 does not behave like CPython 3.8 here yet (7.3.7)
+    if (3, 8) <= env.PY < (3, 10) and env.CPYTHON:
+        with env.deprecated_call():
+            assert convert(Int()) == 42
+    else:
+        assert convert(Int()) == 42
+    requires_conversion(Int())
+    cant_convert(NotInt())
+    cant_convert(Float())
+    # Before Python 3.8, `PyLong_AsLong` does not pick up on `obj.__index__`,
+    # but pybind11 "backports" this behavior.
+    assert convert(Index()) == 42
+    assert noconvert(Index()) == 42
+    assert convert(IntAndIndex()) == 0  # Fishy; `int(DoubleThought)` == 42
+    assert noconvert(IntAndIndex()) == 0
+    assert convert(RaisingTypeErrorOnIndex()) == 42
+    requires_conversion(RaisingTypeErrorOnIndex())
+    assert convert(RaisingValueErrorOnIndex()) == 42
+    requires_conversion(RaisingValueErrorOnIndex())
+def test_numpy_int_convert():
+    np = pytest.importorskip("numpy")
+    convert, noconvert = m.int_passthrough, m.int_passthrough_noconvert
+    def require_implicit(v):
+        pytest.raises(TypeError, noconvert, v)
+    # `np.intc` is an alias that corresponds to a C++ `int`
+    assert convert(np.intc(42)) == 42
+    assert noconvert(np.intc(42)) == 42
+    # The implicit conversion from np.float32 is undesirable but currently accepted.
+    # TODO: Avoid DeprecationWarning in `PyLong_AsLong` (and similar)
+    # TODO: PyPy 3.8 does not behave like CPython 3.8 here yet (7.3.7)
+    # https://github.com/pybind/pybind11/issues/3408
+    if (3, 8) <= env.PY < (3, 10) and env.CPYTHON:
+        with env.deprecated_call():
+            assert convert(np.float32(3.14159)) == 3
+    else:
+        assert convert(np.float32(3.14159)) == 3
+    require_implicit(np.float32(3.14159))
 def test_tuple(doc):
     """std::pair <-> tuple & std::tuple <-> tuple"""
     assert m.pair_passthrough((True, "test")) == ("test", True)
@@ -236,16 +365,22 @@ def test_tuple(doc):
     assert m.tuple_passthrough([True, "test", 5]) == (5, "test", True)
     assert m.empty_tuple() == ()
-    assert doc(m.pair_passthrough) == """
+    assert (
+        doc(m.pair_passthrough)
+        == """
         pair_passthrough(arg0: Tuple[bool, str]) -> Tuple[str, bool]
         Return a pair in reversed order
-    assert doc(m.tuple_passthrough) == """
+    )
+    assert (
+        doc(m.tuple_passthrough)
+        == """
         tuple_passthrough(arg0: Tuple[bool, str, int]) -> Tuple[int, str, bool]
         Return a triple in reversed order
+    )
     assert m.rvalue_pair() == ("rvalue", "rvalue")
     assert m.lvalue_pair() == ("lvalue", "lvalue")
@@ -285,6 +420,7 @@ def test_reference_wrapper():
     """std::reference_wrapper for builtin and user types"""
     assert m.refwrap_builtin(42) == 420
     assert m.refwrap_usertype(UserType(42)) == 42
+    assert m.refwrap_usertype_const(UserType(42)) == 42
     with pytest.raises(TypeError) as excinfo:
@@ -294,6 +430,9 @@ def test_reference_wrapper():
     assert "incompatible function arguments" in str(excinfo.value)
+    assert m.refwrap_lvalue().value == 1
+    assert m.refwrap_lvalue_const().value == 1
     a1 = m.refwrap_list(copy=True)
     a2 = m.refwrap_list(copy=True)
     assert [x.value for x in a1] == [2, 3]
@@ -372,7 +511,7 @@ def cant_convert(v):
     assert convert(np.bool_(False)) is False
     assert noconvert(np.bool_(True)) is True
     assert noconvert(np.bool_(False)) is False
-    cant_convert(np.zeros(2, dtype='int'))
+    cant_convert(np.zeros(2, dtype="int"))
 def test_int_long():
@@ -382,7 +521,8 @@ def test_int_long():
     import sys
-    must_be_long = type(getattr(sys, 'maxint', 1) + 1)
+    must_be_long = type(getattr(sys, "maxint", 1) + 1)
     assert isinstance(m.int_cast(), int)
     assert isinstance(m.long_cast(), int)
     assert isinstance(m.longlong_cast(), must_be_long)
@@ -390,3 +530,21 @@ def test_int_long():
 def test_void_caster_2():
     assert m.test_void_caster()
+def test_const_ref_caster():
+    """Verifies that const-ref is propagated through type_caster cast_op.
+    The returned ConstRefCasted type is a minimal type that is constructed to
+    reference the casting mode used.
+    """
+    x = False
+    assert m.takes(x) == 1
+    assert m.takes_move(x) == 1
+    assert m.takes_ptr(x) == 3
+    assert m.takes_ref(x) == 2
+    assert m.takes_ref_wrap(x) == 2
+    assert m.takes_const_ptr(x) == 5
+    assert m.takes_const_ref(x) == 4
+    assert m.takes_const_ref_wrap(x) == 4
diff --git a/wrap/pybind11/tests/test_call_policies.cpp b/wrap/pybind11/tests/test_call_policies.cpp
index 26c83f81b0..7cb98d0d86 100644
--- a/wrap/pybind11/tests/test_call_policies.cpp
+++ b/wrap/pybind11/tests/test_call_policies.cpp
@@ -51,6 +51,7 @@ TEST_SUBMODULE(call_policies, m) {
         void addChild(Child *) { }
         Child *returnChild() { return new Child(); }
         Child *returnNullChild() { return nullptr; }
+        static Child *staticFunction(Parent*) { return new Child(); }
     py::class_<Parent>(m, "Parent")
@@ -60,7 +61,12 @@ TEST_SUBMODULE(call_policies, m) {
         .def("returnChild", &Parent::returnChild)
         .def("returnChildKeepAlive", &Parent::returnChild, py::keep_alive<1, 0>())
         .def("returnNullChildKeepAliveChild", &Parent::returnNullChild, py::keep_alive<1, 0>())
-        .def("returnNullChildKeepAliveParent", &Parent::returnNullChild, py::keep_alive<0, 1>());
+        .def("returnNullChildKeepAliveParent", &Parent::returnNullChild, py::keep_alive<0, 1>())
+        .def_static(
+            "staticFunction", &Parent::staticFunction, py::keep_alive<1, 0>());
+    m.def("free_function", [](Parent*, Child*) {}, py::keep_alive<1, 2>());
+    m.def("invalid_arg_index", []{}, py::keep_alive<0, 1>());
 #if !defined(PYPY_VERSION)
     // test_alive_gc
diff --git a/wrap/pybind11/tests/test_call_policies.py b/wrap/pybind11/tests/test_call_policies.py
index ec005c132f..3599cf81af 100644
--- a/wrap/pybind11/tests/test_call_policies.py
+++ b/wrap/pybind11/tests/test_call_policies.py
@@ -2,9 +2,8 @@
 import pytest
 import env  # noqa: F401
-from pybind11_tests import call_policies as m
 from pybind11_tests import ConstructorStats
+from pybind11_tests import call_policies as m
 @pytest.mark.xfail("env.PYPY", reason="sometimes comes out 1 off on PyPy", strict=False)
@@ -16,10 +15,13 @@ def test_keep_alive_argument(capture):
     with capture:
         assert ConstructorStats.detail_reg_inst() == n_inst + 1
-    assert capture == """
+    assert (
+        capture
+        == """
         Allocating child.
         Releasing child.
+    )
     with capture:
         del p
         assert ConstructorStats.detail_reg_inst() == n_inst
@@ -35,10 +37,26 @@ def test_keep_alive_argument(capture):
     with capture:
         del p
         assert ConstructorStats.detail_reg_inst() == n_inst
-    assert capture == """
+    assert (
+        capture
+        == """
         Releasing parent.
         Releasing child.
+    )
+    p = m.Parent()
+    c = m.Child()
+    assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    m.free_function(p, c)
+    del c
+    assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    del p
+    assert ConstructorStats.detail_reg_inst() == n_inst
+    with pytest.raises(RuntimeError) as excinfo:
+        m.invalid_arg_index()
+    assert str(excinfo.value) == "Could not activate keep_alive!"
 def test_keep_alive_return_value(capture):
@@ -49,10 +67,13 @@ def test_keep_alive_return_value(capture):
     with capture:
         assert ConstructorStats.detail_reg_inst() == n_inst + 1
-    assert capture == """
+    assert (
+        capture
+        == """
         Allocating child.
         Releasing child.
+    )
     with capture:
         del p
         assert ConstructorStats.detail_reg_inst() == n_inst
@@ -68,10 +89,30 @@ def test_keep_alive_return_value(capture):
     with capture:
         del p
         assert ConstructorStats.detail_reg_inst() == n_inst
-    assert capture == """
+    assert (
+        capture
+        == """
+        Releasing parent.
+        Releasing child.
+    """
+    )
+    p = m.Parent()
+    assert ConstructorStats.detail_reg_inst() == n_inst + 1
+    with capture:
+        m.Parent.staticFunction(p)
+        assert ConstructorStats.detail_reg_inst() == n_inst + 2
+    assert capture == "Allocating child."
+    with capture:
+        del p
+        assert ConstructorStats.detail_reg_inst() == n_inst
+    assert (
+        capture
+        == """
         Releasing parent.
         Releasing child.
+    )
 # https://foss.heptapod.net/pypy/pypy/-/issues/2447
@@ -82,14 +123,17 @@ def test_alive_gc(capture):
     assert ConstructorStats.detail_reg_inst() == n_inst + 2
     lst = [p]
-    lst.append(lst)   # creates a circular reference
+    lst.append(lst)  # creates a circular reference
     with capture:
         del p, lst
         assert ConstructorStats.detail_reg_inst() == n_inst
-    assert capture == """
+    assert (
+        capture
+        == """
         Releasing parent.
         Releasing child.
+    )
 def test_alive_gc_derived(capture):
@@ -101,14 +145,17 @@ class Derived(m.Parent):
     assert ConstructorStats.detail_reg_inst() == n_inst + 2
     lst = [p]
-    lst.append(lst)   # creates a circular reference
+    lst.append(lst)  # creates a circular reference
     with capture:
         del p, lst
         assert ConstructorStats.detail_reg_inst() == n_inst
-    assert capture == """
+    assert (
+        capture
+        == """
         Releasing parent.
         Releasing child.
+    )
 def test_alive_gc_multi_derived(capture):
@@ -123,15 +170,18 @@ def __init__(self):
     # +3 rather than +2 because Derived corresponds to two registered instances
     assert ConstructorStats.detail_reg_inst() == n_inst + 3
     lst = [p]
-    lst.append(lst)   # creates a circular reference
+    lst.append(lst)  # creates a circular reference
     with capture:
         del p, lst
         assert ConstructorStats.detail_reg_inst() == n_inst
-    assert capture == """
+    assert (
+        capture
+        == """
         Releasing parent.
         Releasing child.
         Releasing child.
+    )
 def test_return_none(capture):
@@ -167,17 +217,23 @@ def test_keep_alive_constructor(capture):
     with capture:
         p = m.Parent(m.Child())
         assert ConstructorStats.detail_reg_inst() == n_inst + 2
-    assert capture == """
+    assert (
+        capture
+        == """
         Allocating child.
         Allocating parent.
+    )
     with capture:
         del p
         assert ConstructorStats.detail_reg_inst() == n_inst
-    assert capture == """
+    assert (
+        capture
+        == """
         Releasing parent.
         Releasing child.
+    )
 def test_call_guard():
diff --git a/wrap/pybind11/tests/test_callbacks.cpp b/wrap/pybind11/tests/test_callbacks.cpp
index 71b88c44c7..58688b6e8b 100644
--- a/wrap/pybind11/tests/test_callbacks.cpp
+++ b/wrap/pybind11/tests/test_callbacks.cpp
@@ -17,8 +17,8 @@ int dummy_function(int i) { return i + 1; }
 TEST_SUBMODULE(callbacks, m) {
     // test_callbacks, test_function_signatures
-    m.def("test_callback1", [](py::object func) { return func(); });
-    m.def("test_callback2", [](py::object func) { return func("Hello", 'x', true, 5); });
+    m.def("test_callback1", [](const py::object &func) { return func(); });
+    m.def("test_callback2", [](const py::object &func) { return func("Hello", 'x', true, 5); });
     m.def("test_callback3", [](const std::function<int(int)> &func) {
         return "func(43) = " + std::to_string(func(43)); });
     m.def("test_callback4", []() -> std::function<int(int)> { return [](int i) { return i+1; }; });
@@ -27,51 +27,48 @@ TEST_SUBMODULE(callbacks, m) {
     // test_keyword_args_and_generalized_unpacking
-    m.def("test_tuple_unpacking", [](py::function f) {
+    m.def("test_tuple_unpacking", [](const py::function &f) {
         auto t1 = py::make_tuple(2, 3);
         auto t2 = py::make_tuple(5, 6);
         return f("positional", 1, *t1, 4, *t2);
-    m.def("test_dict_unpacking", [](py::function f) {
+    m.def("test_dict_unpacking", [](const py::function &f) {
         auto d1 = py::dict("key"_a="value", "a"_a=1);
         auto d2 = py::dict();
         auto d3 = py::dict("b"_a=2);
         return f("positional", 1, **d1, **d2, **d3);
-    m.def("test_keyword_args", [](py::function f) {
-        return f("x"_a=10, "y"_a=20);
-    });
+    m.def("test_keyword_args", [](const py::function &f) { return f("x"_a = 10, "y"_a = 20); });
-    m.def("test_unpacking_and_keywords1", [](py::function f) {
+    m.def("test_unpacking_and_keywords1", [](const py::function &f) {
         auto args = py::make_tuple(2);
         auto kwargs = py::dict("d"_a=4);
         return f(1, *args, "c"_a=3, **kwargs);
-    m.def("test_unpacking_and_keywords2", [](py::function f) {
+    m.def("test_unpacking_and_keywords2", [](const py::function &f) {
         auto kwargs1 = py::dict("a"_a=1);
         auto kwargs2 = py::dict("c"_a=3, "d"_a=4);
         return f("positional", *py::make_tuple(1), 2, *py::make_tuple(3, 4), 5,
                  "key"_a="value", **kwargs1, "b"_a=2, **kwargs2, "e"_a=5);
-    m.def("test_unpacking_error1", [](py::function f) {
+    m.def("test_unpacking_error1", [](const py::function &f) {
         auto kwargs = py::dict("x"_a=3);
         return f("x"_a=1, "y"_a=2, **kwargs); // duplicate ** after keyword
-    m.def("test_unpacking_error2", [](py::function f) {
+    m.def("test_unpacking_error2", [](const py::function &f) {
         auto kwargs = py::dict("x"_a=3);
         return f(**kwargs, "x"_a=1); // duplicate keyword after **
-    m.def("test_arg_conversion_error1", [](py::function f) {
-        f(234, UnregisteredType(), "kw"_a=567);
-    });
+    m.def("test_arg_conversion_error1",
+          [](const py::function &f) { f(234, UnregisteredType(), "kw"_a = 567); });
-    m.def("test_arg_conversion_error2", [](py::function f) {
+    m.def("test_arg_conversion_error2", [](const py::function &f) {
         f(234, "expected_name"_a=UnregisteredType(), "kw"_a=567);
@@ -80,23 +77,64 @@ TEST_SUBMODULE(callbacks, m) {
         Payload() { print_default_created(this); }
         ~Payload() { print_destroyed(this); }
         Payload(const Payload &) { print_copy_created(this); }
-        Payload(Payload &&) { print_move_created(this); }
+        Payload(Payload &&) noexcept { print_move_created(this); }
     // Export the payload constructor statistics for testing purposes:
     m.def("payload_cstats", &ConstructorStats::get<Payload>);
-    /* Test cleanup of lambda closure */
-    m.def("test_cleanup", []() -> std::function<void(void)> {
+    m.def("test_lambda_closure_cleanup", []() -> std::function<void()> {
         Payload p;
+        // In this situation, `Func` in the implementation of
+        // `cpp_function::initialize` is NOT trivially destructible.
         return [p]() {
             /* p should be cleaned up when the returned function is garbage collected */
             (void) p;
+    class CppCallable {
+    public:
+        CppCallable() { track_default_created(this); }
+        ~CppCallable() { track_destroyed(this); }
+        CppCallable(const CppCallable &) { track_copy_created(this); }
+        CppCallable(CppCallable &&) noexcept { track_move_created(this); }
+        void operator()() {}
+    };
+    m.def("test_cpp_callable_cleanup", []() {
+        // Related issue: https://github.com/pybind/pybind11/issues/3228
+        // Related PR: https://github.com/pybind/pybind11/pull/3229
+        py::list alive_counts;
+        ConstructorStats &stat = ConstructorStats::get<CppCallable>();
+        alive_counts.append(stat.alive());
+        {
+            CppCallable cpp_callable;
+            alive_counts.append(stat.alive());
+            {
+                // In this situation, `Func` in the implementation of
+                // `cpp_function::initialize` IS trivially destructible,
+                // only `capture` is not.
+                py::cpp_function py_func(cpp_callable);
+                py::detail::silence_unused_warnings(py_func);
+                alive_counts.append(stat.alive());
+            }
+            alive_counts.append(stat.alive());
+            {
+                py::cpp_function py_func(std::move(cpp_callable));
+                py::detail::silence_unused_warnings(py_func);
+                alive_counts.append(stat.alive());
+            }
+            alive_counts.append(stat.alive());
+        }
+        alive_counts.append(stat.alive());
+        return alive_counts;
+    });
     // test_cpp_function_roundtrip
     /* Test if passing a function pointer from C++ -> Python -> C++ yields the original pointer */
     m.def("dummy_function", &dummy_function);
+    m.def("dummy_function_overloaded", [](int i, int j) { return i + j; });
+    m.def("dummy_function_overloaded", &dummy_function);
     m.def("dummy_function2", [](int i, int j) { return i + j; });
     m.def("roundtrip", [](std::function<int(int)> f, bool expect_none = false) {
         if (expect_none && f)
@@ -109,16 +147,25 @@ TEST_SUBMODULE(callbacks, m) {
         if (!result) {
             auto r = f(1);
             return "can't convert to function pointer: eval(1) = " + std::to_string(r);
-        } else if (*result == dummy_function) {
+        }
+        if (*result == dummy_function) {
             auto r = (*result)(1);
             return "matches dummy_function: eval(1) = " + std::to_string(r);
-        } else {
-            return "argument does NOT match dummy_function. This should never happen!";
+        return "argument does NOT match dummy_function. This should never happen!";
-    class AbstractBase { public: virtual unsigned int func() = 0; };
-    m.def("func_accepting_func_accepting_base", [](std::function<double(AbstractBase&)>) { });
+    class AbstractBase {
+    public:
+        // [workaround(intel)] = default does not work here
+        // Defaulting this destructor results in linking errors with the Intel compiler
+        // (in Debug builds only, tested with icpc (ICC) 2021.1 Beta 20200827)
+        virtual ~AbstractBase() {} // NOLINT(modernize-use-equals-default)
+        virtual unsigned int func() = 0;
+    };
+    m.def("func_accepting_func_accepting_base",
+          [](const std::function<double(AbstractBase &)> &) {});
     struct MovableObject {
         bool valid = true;
@@ -126,8 +173,8 @@ TEST_SUBMODULE(callbacks, m) {
         MovableObject() = default;
         MovableObject(const MovableObject &) = default;
         MovableObject &operator=(const MovableObject &) = default;
-        MovableObject(MovableObject &&o) : valid(o.valid) { o.valid = false; }
-        MovableObject &operator=(MovableObject &&o) {
+        MovableObject(MovableObject &&o) noexcept : valid(o.valid) { o.valid = false; }
+        MovableObject &operator=(MovableObject &&o) noexcept {
             valid = o.valid;
             o.valid = false;
             return *this;
@@ -136,7 +183,7 @@ TEST_SUBMODULE(callbacks, m) {
     py::class_<MovableObject>(m, "MovableObject");
     // test_movable_object
-    m.def("callback_with_movable", [](std::function<void(MovableObject &)> f) {
+    m.def("callback_with_movable", [](const std::function<void(MovableObject &)> &f) {
         auto x = MovableObject();
         f(x); // lvalue reference shouldn't move out object
         return x.valid; // must still return `true`
@@ -148,9 +195,15 @@ TEST_SUBMODULE(callbacks, m) {
         .def("triple", [](CppBoundMethodTest &, int val) { return 3 * val; });
+    // This checks that builtin functions can be passed as callbacks
+    // rather than throwing RuntimeError due to trying to extract as capsule
+    m.def("test_sum_builtin", [](const std::function<double(py::iterable)> &sum_builtin, const py::iterable &i) {
+      return sum_builtin(i);
+    });
     // test async Python callbacks
     using callback_f = std::function<void(int)>;
-    m.def("test_async_callback", [](callback_f f, py::list work) {
+    m.def("test_async_callback", [](const callback_f &f, const py::list &work) {
         // make detached thread that calls `f` with piece of work after a little delay
         auto start_f = [f](int j) {
             auto invoke_f = [f, j] {
@@ -165,4 +218,10 @@ TEST_SUBMODULE(callbacks, m) {
         for (auto i : work)
+    m.def("callback_num_times", [](const py::function &f, std::size_t num) {
+        for (std::size_t i = 0; i < num; i++) {
+            f();
+        }
+    });
diff --git a/wrap/pybind11/tests/test_callbacks.py b/wrap/pybind11/tests/test_callbacks.py
index d5d0e045d2..f41ad86e7f 100644
--- a/wrap/pybind11/tests/test_callbacks.py
+++ b/wrap/pybind11/tests/test_callbacks.py
@@ -1,7 +1,11 @@
 # -*- coding: utf-8 -*-
+import time
+from threading import Thread
 import pytest
+import env  # noqa: F401
 from pybind11_tests import callbacks as m
-from threading import Thread
 def test_callbacks():
@@ -42,17 +46,19 @@ def double(self, val):
 def test_keyword_args_and_generalized_unpacking():
     def f(*args, **kwargs):
         return args, kwargs
     assert m.test_tuple_unpacking(f) == (("positional", 1, 2, 3, 4, 5, 6), {})
-    assert m.test_dict_unpacking(f) == (("positional", 1), {"key": "value", "a": 1, "b": 2})
+    assert m.test_dict_unpacking(f) == (
+        ("positional", 1),
+        {"key": "value", "a": 1, "b": 2},
+    )
     assert m.test_keyword_args(f) == ((), {"x": 10, "y": 20})
     assert m.test_unpacking_and_keywords1(f) == ((1, 2), {"c": 3, "d": 4})
     assert m.test_unpacking_and_keywords2(f) == (
         ("positional", 1, 2, 3, 4, 5),
-        {"key": "value", "a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
+        {"key": "value", "a": 1, "b": 2, "c": 3, "d": 4, "e": 5},
     with pytest.raises(TypeError) as excinfo:
@@ -73,22 +79,37 @@ def f(*args, **kwargs):
 def test_lambda_closure_cleanup():
-    m.test_cleanup()
+    m.test_lambda_closure_cleanup()
     cstats = m.payload_cstats()
     assert cstats.alive() == 0
     assert cstats.copy_constructions == 1
     assert cstats.move_constructions >= 1
+def test_cpp_callable_cleanup():
+    alive_counts = m.test_cpp_callable_cleanup()
+    assert alive_counts == [0, 1, 2, 1, 2, 1, 0]
 def test_cpp_function_roundtrip():
     """Test if passing a function pointer from C++ -> Python -> C++ yields the original pointer"""
-    assert m.test_dummy_function(m.dummy_function) == "matches dummy_function: eval(1) = 2"
-    assert (m.test_dummy_function(m.roundtrip(m.dummy_function)) ==
-            "matches dummy_function: eval(1) = 2")
+    assert (
+        m.test_dummy_function(m.dummy_function) == "matches dummy_function: eval(1) = 2"
+    )
+    assert (
+        m.test_dummy_function(m.roundtrip(m.dummy_function))
+        == "matches dummy_function: eval(1) = 2"
+    )
+    assert (
+        m.test_dummy_function(m.dummy_function_overloaded)
+        == "matches dummy_function: eval(1) = 2"
+    )
     assert m.roundtrip(None, expect_none=True) is None
-    assert (m.test_dummy_function(lambda x: x + 2) ==
-            "can't convert to function pointer: eval(1) = 3")
+    assert (
+        m.test_dummy_function(lambda x: x + 2)
+        == "can't convert to function pointer: eval(1) = 3"
+    )
     with pytest.raises(TypeError) as excinfo:
@@ -96,8 +117,10 @@ def test_cpp_function_roundtrip():
     with pytest.raises(TypeError) as excinfo:
         m.test_dummy_function(lambda x, y: x + y)
-    assert any(s in str(excinfo.value) for s in ("missing 1 required positional argument",
-                                                 "takes exactly 2 arguments"))
+    assert any(
+        s in str(excinfo.value)
+        for s in ("missing 1 required positional argument", "takes exactly 2 arguments")
+    )
 def test_function_signatures(doc):
@@ -109,6 +132,16 @@ def test_movable_object():
     assert m.callback_with_movable(lambda _: None) is True
+    "env.PYPY",
+    reason="PyPy segfaults on here. See discussion on #1413.",
+def test_python_builtins():
+    """Test if python builtins like sum() can be used as callbacks"""
+    assert m.test_sum_builtin(sum, [1, 2, 3]) == 6
+    assert m.test_sum_builtin(sum, []) == 0
 def test_async_callbacks():
     # serves as state for async callback
     class Item:
@@ -127,11 +160,43 @@ def gen_f():
     m.test_async_callback(gen_f(), work)
     # wait until work is done
     from time import sleep
-    assert sum(res) == sum([x + 3 for x in work])
+    assert sum(res) == sum(x + 3 for x in work)
 def test_async_async_callbacks():
     t = Thread(target=test_async_callbacks)
+def test_callback_num_times():
+    # Super-simple micro-benchmarking related to PR #2919.
+    # Example runtimes (Intel Xeon 2.2GHz, fully optimized):
+    #   num_millions  1, repeats  2:  0.1 secs
+    #   num_millions 20, repeats 10: 11.5 secs
+    one_million = 1000000
+    num_millions = 1  # Try 20 for actual micro-benchmarking.
+    repeats = 2  # Try 10.
+    rates = []
+    for rep in range(repeats):
+        t0 = time.time()
+        m.callback_num_times(lambda: None, num_millions * one_million)
+        td = time.time() - t0
+        rate = num_millions / td if td else 0
+        rates.append(rate)
+        if not rep:
+            print()
+        print(
+            "callback_num_times: {:d} million / {:.3f} seconds = {:.3f} million / second".format(
+                num_millions, td, rate
+            )
+        )
+    if len(rates) > 1:
+        print("Min    Mean   Max")
+        print(
+            "{:6.3f} {:6.3f} {:6.3f}".format(
+                min(rates), sum(rates) / len(rates), max(rates)
+            )
+        )
diff --git a/wrap/pybind11/tests/test_chrono.py b/wrap/pybind11/tests/test_chrono.py
index ae24b7dda2..fdd73d6908 100644
--- a/wrap/pybind11/tests/test_chrono.py
+++ b/wrap/pybind11/tests/test_chrono.py
@@ -1,9 +1,10 @@
 # -*- coding: utf-8 -*-
-from pybind11_tests import chrono as m
 import datetime
 import pytest
 import env  # noqa: F401
+from pybind11_tests import chrono as m
 def test_chrono_system_clock():
@@ -39,9 +40,7 @@ def test_chrono_system_clock_roundtrip():
     # They should be identical (no information lost on roundtrip)
     diff = abs(date1 - date2)
-    assert diff.days == 0
-    assert diff.seconds == 0
-    assert diff.microseconds == 0
+    assert diff == datetime.timedelta(0)
 def test_chrono_system_clock_roundtrip_date():
@@ -64,9 +63,7 @@ def test_chrono_system_clock_roundtrip_date():
     assert diff.microseconds == 0
     # Year, Month & Day should be the same after the round trip
-    assert date1.year == date2.year
-    assert date1.month == date2.month
-    assert date1.day == date2.day
+    assert date1 == date2
     # There should be no time information
     assert time2.hour == 0
@@ -80,22 +77,28 @@ def test_chrono_system_clock_roundtrip_date():
-@pytest.mark.parametrize("time1", [
-    datetime.datetime.today().time(),
-    datetime.time(0, 0, 0),
-    datetime.time(0, 0, 0, 1),
-    datetime.time(0, 28, 45, 109827),
-    datetime.time(0, 59, 59, 999999),
-    datetime.time(1, 0, 0),
-    datetime.time(5, 59, 59, 0),
-    datetime.time(5, 59, 59, 1),
-@pytest.mark.parametrize("tz", [
-    None,
-    pytest.param("Europe/Brussels", marks=SKIP_TZ_ENV_ON_WIN),
-    pytest.param("Asia/Pyongyang", marks=SKIP_TZ_ENV_ON_WIN),
-    pytest.param("America/New_York", marks=SKIP_TZ_ENV_ON_WIN),
+    "time1",
+    [
+        datetime.datetime.today().time(),
+        datetime.time(0, 0, 0),
+        datetime.time(0, 0, 0, 1),
+        datetime.time(0, 28, 45, 109827),
+        datetime.time(0, 59, 59, 999999),
+        datetime.time(1, 0, 0),
+        datetime.time(5, 59, 59, 0),
+        datetime.time(5, 59, 59, 1),
+    ],
+    "tz",
+    [
+        None,
+        pytest.param("Europe/Brussels", marks=SKIP_TZ_ENV_ON_WIN),
+        pytest.param("Asia/Pyongyang", marks=SKIP_TZ_ENV_ON_WIN),
+        pytest.param("America/New_York", marks=SKIP_TZ_ENV_ON_WIN),
+    ],
 def test_chrono_system_clock_roundtrip_time(time1, tz, monkeypatch):
     if tz is not None:
         monkeypatch.setenv("TZ", "/usr/share/zoneinfo/{}".format(tz))
@@ -111,10 +114,7 @@ def test_chrono_system_clock_roundtrip_time(time1, tz, monkeypatch):
     assert isinstance(time2, datetime.time)
     # Hour, Minute, Second & Microsecond should be the same after the round trip
-    assert time1.hour == time2.hour
-    assert time1.minute == time2.minute
-    assert time1.second == time2.second
-    assert time1.microsecond == time2.microsecond
+    assert time1 == time2
     # There should be no date information (i.e. date = python base date)
     assert date2.year == 1970
@@ -134,9 +134,13 @@ def test_chrono_duration_roundtrip():
     cpp_diff = m.test_chrono3(diff)
-    assert cpp_diff.days == diff.days
-    assert cpp_diff.seconds == diff.seconds
-    assert cpp_diff.microseconds == diff.microseconds
+    assert cpp_diff == diff
+    # Negative timedelta roundtrip
+    diff = datetime.timedelta(microseconds=-1)
+    cpp_diff = m.test_chrono3(diff)
+    assert cpp_diff == diff
 def test_chrono_duration_subtraction_equivalence():
@@ -147,9 +151,7 @@ def test_chrono_duration_subtraction_equivalence():
     diff = date2 - date1
     cpp_diff = m.test_chrono4(date2, date1)
-    assert cpp_diff.days == diff.days
-    assert cpp_diff.seconds == diff.seconds
-    assert cpp_diff.microseconds == diff.microseconds
+    assert cpp_diff == diff
 def test_chrono_duration_subtraction_equivalence_date():
@@ -160,9 +162,7 @@ def test_chrono_duration_subtraction_equivalence_date():
     diff = date2 - date1
     cpp_diff = m.test_chrono4(date2, date1)
-    assert cpp_diff.days == diff.days
-    assert cpp_diff.seconds == diff.seconds
-    assert cpp_diff.microseconds == diff.microseconds
+    assert cpp_diff == diff
 def test_chrono_steady_clock():
@@ -177,9 +177,7 @@ def test_chrono_steady_clock_roundtrip():
     assert isinstance(time2, datetime.timedelta)
     # They should be identical (no information lost on roundtrip)
-    assert time1.days == time2.days
-    assert time1.seconds == time2.seconds
-    assert time1.microseconds == time2.microseconds
+    assert time1 == time2
 def test_floating_point_duration():
@@ -199,7 +197,7 @@ def test_floating_point_duration():
 def test_nano_timepoint():
     time = datetime.datetime.now()
     time1 = m.test_nano_timepoint(time, datetime.timedelta(seconds=60))
-    assert(time1 == time + datetime.timedelta(seconds=60))
+    assert time1 == time + datetime.timedelta(seconds=60)
 def test_chrono_different_resolutions():
diff --git a/wrap/pybind11/tests/test_class.cpp b/wrap/pybind11/tests/test_class.cpp
index b0e3d3a4b6..52a41a3bc0 100644
--- a/wrap/pybind11/tests/test_class.cpp
+++ b/wrap/pybind11/tests/test_class.cpp
@@ -7,18 +7,27 @@
     BSD-style license that can be found in the LICENSE file.
+#if defined(__INTEL_COMPILER) && __cplusplus >= 201703L
+// Intel compiler requires a separate header file to support aligned new operators
+// and does not set the __cpp_aligned_new feature macro.
+// This header needs to be included before pybind11.
+#include <aligned_new>
 #include "pybind11_tests.h"
 #include "constructor_stats.h"
 #include "local_bindings.h"
 #include <pybind11/stl.h>
+#include <utility>
 #if defined(_MSC_VER)
 #  pragma warning(disable: 4324) // warning C4324: structure was padded due to alignment specifier
 // test_brace_initialization
 struct NoBraceInitialization {
-    NoBraceInitialization(std::vector<int> v) : vec{std::move(v)} {}
+    explicit NoBraceInitialization(std::vector<int> v) : vec{std::move(v)} {}
     template <typename T>
     NoBraceInitialization(std::initializer_list<T> l) : vec(l) {}
@@ -38,10 +47,26 @@ TEST_SUBMODULE(class_, m) {
         ~NoConstructor() { print_destroyed(this); }
+    struct NoConstructorNew {
+        NoConstructorNew() = default;
+        NoConstructorNew(const NoConstructorNew &) = default;
+        NoConstructorNew(NoConstructorNew &&) = default;
+        static NoConstructorNew *new_instance() {
+            auto *ptr = new NoConstructorNew();
+            print_created(ptr, "via new_instance");
+            return ptr;
+        }
+        ~NoConstructorNew() { print_destroyed(this); }
+    };
     py::class_<NoConstructor>(m, "NoConstructor")
         .def_static("new_instance", &NoConstructor::new_instance, "Return an instance");
+    py::class_<NoConstructorNew>(m, "NoConstructorNew")
+        .def(py::init([](const NoConstructorNew &self) { return self; })) // Need a NOOP __init__
+        .def_static("__new__",
+                    [](const py::object &) { return NoConstructorNew::new_instance(); });
     // test_inheritance
     class Pet {
@@ -56,18 +81,18 @@ TEST_SUBMODULE(class_, m) {
     class Dog : public Pet {
-        Dog(const std::string &name) : Pet(name, "dog") {}
+        explicit Dog(const std::string &name) : Pet(name, "dog") {}
         std::string bark() const { return "Woof!"; }
     class Rabbit : public Pet {
-        Rabbit(const std::string &name) : Pet(name, "parrot") {}
+        explicit Rabbit(const std::string &name) : Pet(name, "parrot") {}
     class Hamster : public Pet {
-        Hamster(const std::string &name) : Pet(name, "rodent") {}
+        explicit Hamster(const std::string &name) : Pet(name, "rodent") {}
     class Chimera : public Pet {
@@ -122,7 +147,7 @@ TEST_SUBMODULE(class_, m) {
     m.def("return_none", []() -> BaseClass* { return nullptr; });
     // test_isinstance
-    m.def("check_instances", [](py::list l) {
+    m.def("check_instances", [](const py::list &l) {
         return py::make_tuple(
@@ -144,22 +169,17 @@ TEST_SUBMODULE(class_, m) {
         //     return py::type::of<int>();
         if (category == 1)
             return py::type::of<DerivedClass1>();
-        else
-            return py::type::of<Invalid>();
+        return py::type::of<Invalid>();
-    m.def("get_type_of", [](py::object ob) {
-        return py::type::of(ob);
-    });
+    m.def("get_type_of", [](py::object ob) { return py::type::of(std::move(ob)); });
-    m.def("as_type", [](py::object ob) {
-        auto tp = py::type(ob);
-        if (py::isinstance<py::type>(ob))
-            return tp;
-        else
-            throw std::runtime_error("Invalid type");
+    m.def("get_type_classic", [](py::handle h) {
+        return h.get_type();
+    m.def("as_type", [](const py::object &ob) { return py::type(ob); });
     // test_mismatched_holder
     struct MismatchBase1 { };
     struct MismatchDerived1 : MismatchBase1 { };
@@ -168,12 +188,12 @@ TEST_SUBMODULE(class_, m) {
     struct MismatchDerived2 : MismatchBase2 { };
     m.def("mismatched_holder_1", []() {
-        auto mod = py::module::import("__main__");
+        auto mod = py::module_::import("__main__");
         py::class_<MismatchBase1, std::shared_ptr<MismatchBase1>>(mod, "MismatchBase1");
         py::class_<MismatchDerived1, MismatchBase1>(mod, "MismatchDerived1");
     m.def("mismatched_holder_2", []() {
-        auto mod = py::module::import("__main__");
+        auto mod = py::module_::import("__main__");
         py::class_<MismatchBase2>(mod, "MismatchBase2");
         py::class_<MismatchDerived2, std::shared_ptr<MismatchDerived2>,
                    MismatchBase2>(mod, "MismatchDerived2");
@@ -204,7 +224,7 @@ TEST_SUBMODULE(class_, m) {
     struct ConvertibleFromUserType {
         int i;
-        ConvertibleFromUserType(UserType u) : i(u.value()) { }
+        explicit ConvertibleFromUserType(UserType u) : i(u.value()) {}
     py::class_<ConvertibleFromUserType>(m, "AcceptsUserType")
@@ -212,7 +232,7 @@ TEST_SUBMODULE(class_, m) {
     py::implicitly_convertible<UserType, ConvertibleFromUserType>();
     m.def("implicitly_convert_argument", [](const ConvertibleFromUserType &r) { return r.i; });
-    m.def("implicitly_convert_variable", [](py::object o) {
+    m.def("implicitly_convert_variable", [](const py::object &o) {
         // `o` is `UserType` and `r` is a reference to a temporary created by implicit
         // conversion. This is valid when called inside a bound function because the temp
         // object is attached to the same life support system as the arguments.
@@ -231,7 +251,8 @@ TEST_SUBMODULE(class_, m) {
         auto def = new PyMethodDef{"f", f, METH_VARARGS, nullptr};
-        return py::reinterpret_steal<py::object>(PyCFunction_NewEx(def, nullptr, m.ptr()));
+        py::capsule def_capsule(def, [](void *ptr) { delete reinterpret_cast<PyMethodDef *>(ptr); });
+        return py::reinterpret_steal<py::object>(PyCFunction_NewEx(def, def_capsule.ptr(), m.ptr()));
     // test_operator_new_delete
@@ -258,7 +279,7 @@ TEST_SUBMODULE(class_, m) {
     struct PyAliasedHasOpNewDelSize : AliasedHasOpNewDelSize {
         PyAliasedHasOpNewDelSize() = default;
-        PyAliasedHasOpNewDelSize(int) { }
+        explicit PyAliasedHasOpNewDelSize(int) {}
         std::uint64_t j;
     struct HasOpNewDelBoth {
@@ -322,6 +343,10 @@ TEST_SUBMODULE(class_, m) {
     class PublicistB : public ProtectedB {
+        // [workaround(intel)] = default does not work here
+        // Removing or defaulting this destructor results in linking errors with the Intel compiler
+        // (in Debug builds only, tested with icpc (ICC) 2021.1 Beta 20200827)
+        ~PublicistB() override {};  // NOLINT(modernize-use-equals-default)
         using ProtectedB::foo;
@@ -385,7 +410,7 @@ TEST_SUBMODULE(class_, m) {
     struct StringWrapper { std::string str; };
     m.def("test_error_after_conversions", [](int) {});
-          [](StringWrapper) -> NotRegistered { return {}; });
+          [](const StringWrapper &) -> NotRegistered { return {}; });
     py::class_<StringWrapper>(m, "StringWrapper").def(py::init<std::string>());
     py::implicitly_convertible<std::string, StringWrapper>();
@@ -406,6 +431,7 @@ TEST_SUBMODULE(class_, m) {
     struct IsNonFinalFinal {};
     py::class_<IsNonFinalFinal>(m, "IsNonFinalFinal", py::is_final());
+    // test_exception_rvalue_abort
     struct PyPrintDestructor {
         PyPrintDestructor() = default;
         ~PyPrintDestructor() {
@@ -416,6 +442,55 @@ TEST_SUBMODULE(class_, m) {
     py::class_<PyPrintDestructor>(m, "PyPrintDestructor")
         .def("throw_something", &PyPrintDestructor::throw_something);
+    // test_multiple_instances_with_same_pointer
+    struct SamePointer {};
+    static SamePointer samePointer;
+    py::class_<SamePointer, std::unique_ptr<SamePointer, py::nodelete>>(m, "SamePointer")
+        .def(py::init([]() { return &samePointer; }));
+    struct Empty {};
+    py::class_<Empty>(m, "Empty")
+        .def(py::init<>());
+    // test_base_and_derived_nested_scope
+    struct BaseWithNested {
+        struct Nested {};
+    };
+    struct DerivedWithNested : BaseWithNested {
+        struct Nested {};
+    };
+    py::class_<BaseWithNested> baseWithNested_class(m, "BaseWithNested");
+    py::class_<DerivedWithNested, BaseWithNested> derivedWithNested_class(m, "DerivedWithNested");
+    py::class_<BaseWithNested::Nested>(baseWithNested_class, "Nested")
+        .def_static("get_name", []() { return "BaseWithNested::Nested"; });
+    py::class_<DerivedWithNested::Nested>(derivedWithNested_class, "Nested")
+        .def_static("get_name", []() { return "DerivedWithNested::Nested"; });
+    // test_register_duplicate_class
+    struct Duplicate {};
+    struct OtherDuplicate {};
+    struct DuplicateNested {};
+    struct OtherDuplicateNested {};
+    m.def("register_duplicate_class_name", [](const py::module_ &m) {
+        py::class_<Duplicate>(m, "Duplicate");
+        py::class_<OtherDuplicate>(m, "Duplicate");
+    });
+    m.def("register_duplicate_class_type", [](const py::module_ &m) {
+        py::class_<OtherDuplicate>(m, "OtherDuplicate");
+        py::class_<OtherDuplicate>(m, "YetAnotherDuplicate");
+    });
+    m.def("register_duplicate_nested_class_name", [](const py::object &gt) {
+        py::class_<DuplicateNested>(gt, "DuplicateNested");
+        py::class_<OtherDuplicateNested>(gt, "DuplicateNested");
+    });
+    m.def("register_duplicate_nested_class_type", [](const py::object &gt) {
+        py::class_<OtherDuplicateNested>(gt, "OtherDuplicateNested");
+        py::class_<OtherDuplicateNested>(gt, "YetAnotherDuplicateNested");
+    });
 template <int N> class BreaksBase { public:
@@ -433,15 +508,15 @@ using DoesntBreak5 = py::class_<BreaksBase<5>>;
 using DoesntBreak6 = py::class_<BreaksBase<6>, std::shared_ptr<BreaksBase<6>>, BreaksTramp<6>>;
 using DoesntBreak7 = py::class_<BreaksBase<7>, BreaksTramp<7>, std::shared_ptr<BreaksBase<7>>>;
 using DoesntBreak8 = py::class_<BreaksBase<8>, std::shared_ptr<BreaksBase<8>>>;
-#define CHECK_BASE(N) static_assert(std::is_same<typename DoesntBreak##N::type, BreaksBase<N>>::value, \
+#define CHECK_BASE(N) static_assert(std::is_same<typename DoesntBreak##N::type, BreaksBase<(N)>>::value, \
         "DoesntBreak" #N " has wrong type!")
-#define CHECK_ALIAS(N) static_assert(DoesntBreak##N::has_alias && std::is_same<typename DoesntBreak##N::type_alias, BreaksTramp<N>>::value, \
+#define CHECK_ALIAS(N) static_assert(DoesntBreak##N::has_alias && std::is_same<typename DoesntBreak##N::type_alias, BreaksTramp<(N)>>::value, \
         "DoesntBreak" #N " has wrong type_alias!")
 #define CHECK_NOALIAS(N) static_assert(!DoesntBreak##N::has_alias && std::is_void<typename DoesntBreak##N::type_alias>::value, \
         "DoesntBreak" #N " has type alias, but shouldn't!")
-#define CHECK_HOLDER(N, TYPE) static_assert(std::is_same<typename DoesntBreak##N::holder_type, std::TYPE##_ptr<BreaksBase<N>>>::value, \
+#define CHECK_HOLDER(N, TYPE) static_assert(std::is_same<typename DoesntBreak##N::holder_type, std::TYPE##_ptr<BreaksBase<(N)>>>::value, \
         "DoesntBreak" #N " has wrong holder_type!")
 CHECK_HOLDER(1, unique); CHECK_HOLDER(2, unique); CHECK_HOLDER(3, unique); CHECK_HOLDER(4, unique); CHECK_HOLDER(5, unique);
 CHECK_HOLDER(6, shared); CHECK_HOLDER(7, shared); CHECK_HOLDER(8, shared);
@@ -451,7 +526,7 @@ CHECK_HOLDER(6, shared); CHECK_HOLDER(7, shared); CHECK_HOLDER(8, shared);
 // failures occurs).
 // We have to actually look into the type: the typedef alone isn't enough to instantiate the type:
-#define CHECK_BROKEN(N) static_assert(std::is_same<typename Breaks##N::type, BreaksBase<-N>>::value, \
+#define CHECK_BROKEN(N) static_assert(std::is_same<typename Breaks##N::type, BreaksBase<-(N)>>::value, \
         "Breaks1 has wrong type!");
 //// Two holder classes:
diff --git a/wrap/pybind11/tests/test_class.py b/wrap/pybind11/tests/test_class.py
index be21f3709f..caafe2068d 100644
--- a/wrap/pybind11/tests/test_class.py
+++ b/wrap/pybind11/tests/test_class.py
@@ -2,9 +2,8 @@
 import pytest
 import env  # noqa: F401
+from pybind11_tests import ConstructorStats, UserType
 from pybind11_tests import class_ as m
-from pybind11_tests import UserType, ConstructorStats
 def test_repr():
@@ -26,13 +25,23 @@ def test_instance(msg):
     assert cstats.alive() == 0
+def test_instance_new(msg):
+    instance = m.NoConstructorNew()  # .__new__(m.NoConstructor.__class__)
+    cstats = ConstructorStats.get(m.NoConstructorNew)
+    assert cstats.alive() == 1
+    del instance
+    assert cstats.alive() == 0
 def test_type():
     assert m.check_type(1) == m.DerivedClass1
     with pytest.raises(RuntimeError) as execinfo:
-    assert 'pybind11::detail::get_type_info: unable to find type info' in str(execinfo.value)
-    assert 'Invalid' in str(execinfo.value)
+    assert "pybind11::detail::get_type_info: unable to find type info" in str(
+        execinfo.value
+    )
+    assert "Invalid" in str(execinfo.value)
     # Currently not supported
     # See https://github.com/pybind/pybind11/issues/2486
@@ -45,6 +54,12 @@ def test_type_of_py():
     assert m.get_type_of(int) == type
+def test_type_of_classic():
+    assert m.get_type_classic(1) == int
+    assert m.get_type_classic(m.DerivedClass1()) == m.DerivedClass1
+    assert m.get_type_classic(int) == type
 def test_type_of_py_nodelete():
     # If the above test deleted the class, this will segfault
     assert m.get_type_of(m.DerivedClass1()) == m.DerivedClass1
@@ -53,10 +68,10 @@ def test_type_of_py_nodelete():
 def test_as_type_py():
     assert m.as_type(int) == int
-    with pytest.raises(RuntimeError):
+    with pytest.raises(TypeError):
         assert m.as_type(1) == int
-    with pytest.raises(RuntimeError):
+    with pytest.raises(TypeError):
         assert m.as_type(m.DerivedClass1()) == m.DerivedClass1
@@ -67,18 +82,24 @@ def test_docstrings(doc):
     assert UserType.get_value.__name__ == "get_value"
     assert UserType.get_value.__module__ == "pybind11_tests"
-    assert doc(UserType.get_value) == """
+    assert (
+        doc(UserType.get_value)
+        == """
         get_value(self: m.UserType) -> int
         Get value using a method
+    )
     assert doc(UserType.value) == "Get/set value using a property"
-    assert doc(m.NoConstructor.new_instance) == """
+    assert (
+        doc(m.NoConstructor.new_instance)
+        == """
         new_instance() -> m.class_.NoConstructor
         Return an instance
+    )
 def test_qualname(doc):
@@ -87,51 +108,69 @@ def test_qualname(doc):
     assert m.NestBase.__qualname__ == "NestBase"
     assert m.NestBase.Nested.__qualname__ == "NestBase.Nested"
-    assert doc(m.NestBase.__init__) == """
+    assert (
+        doc(m.NestBase.__init__)
+        == """
         __init__(self: m.class_.NestBase) -> None
-    assert doc(m.NestBase.g) == """
+    )
+    assert (
+        doc(m.NestBase.g)
+        == """
         g(self: m.class_.NestBase, arg0: m.class_.NestBase.Nested) -> None
-    assert doc(m.NestBase.Nested.__init__) == """
+    )
+    assert (
+        doc(m.NestBase.Nested.__init__)
+        == """
         __init__(self: m.class_.NestBase.Nested) -> None
-    assert doc(m.NestBase.Nested.fn) == """
+    )
+    assert (
+        doc(m.NestBase.Nested.fn)
+        == """
         fn(self: m.class_.NestBase.Nested, arg0: int, arg1: m.class_.NestBase, arg2: m.class_.NestBase.Nested) -> None
     """  # noqa: E501 line too long
-    assert doc(m.NestBase.Nested.fa) == """
+    )
+    assert (
+        doc(m.NestBase.Nested.fa)
+        == """
         fa(self: m.class_.NestBase.Nested, a: int, b: m.class_.NestBase, c: m.class_.NestBase.Nested) -> None
     """  # noqa: E501 line too long
+    )
     assert m.NestBase.__module__ == "pybind11_tests.class_"
     assert m.NestBase.Nested.__module__ == "pybind11_tests.class_"
 def test_inheritance(msg):
-    roger = m.Rabbit('Rabbit')
+    roger = m.Rabbit("Rabbit")
     assert roger.name() + " is a " + roger.species() == "Rabbit is a parrot"
     assert m.pet_name_species(roger) == "Rabbit is a parrot"
-    polly = m.Pet('Polly', 'parrot')
+    polly = m.Pet("Polly", "parrot")
     assert polly.name() + " is a " + polly.species() == "Polly is a parrot"
     assert m.pet_name_species(polly) == "Polly is a parrot"
-    molly = m.Dog('Molly')
+    molly = m.Dog("Molly")
     assert molly.name() + " is a " + molly.species() == "Molly is a dog"
     assert m.pet_name_species(molly) == "Molly is a dog"
-    fred = m.Hamster('Fred')
+    fred = m.Hamster("Fred")
     assert fred.name() + " is a " + fred.species() == "Fred is a rodent"
     assert m.dog_bark(molly) == "Woof!"
     with pytest.raises(TypeError) as excinfo:
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         dog_bark(): incompatible function arguments. The following argument types are supported:
             1. (arg0: m.class_.Dog) -> str
         Invoked with: <m.class_.Pet object at 0>
+    )
     with pytest.raises(TypeError) as excinfo:
         m.Chimera("lion", "goat")
@@ -144,12 +183,11 @@ def test_inheritance_init(msg):
     class Python(m.Pet):
         def __init__(self):
     with pytest.raises(TypeError) as exc_info:
-    expected = ["m.class_.Pet.__init__() must be called when overriding __init__",
-                "Pet.__init__() must be called when overriding __init__"]  # PyPy?
-    # TODO: fix PyPy error message wrt. tp_name/__qualname__?
-    assert msg(exc_info.value) in expected
+    expected = "m.class_.Pet.__init__() must be called when overriding __init__"
+    assert msg(exc_info.value) == expected
     # Multiple bases
     class RabbitHamster(m.Rabbit, m.Hamster):
@@ -158,9 +196,8 @@ def __init__(self):
     with pytest.raises(TypeError) as exc_info:
-    expected = ["m.class_.Hamster.__init__() must be called when overriding __init__",
-                "Hamster.__init__() must be called when overriding __init__"]  # PyPy
-    assert msg(exc_info.value) in expected
+    expected = "m.class_.Hamster.__init__() must be called when overriding __init__"
+    assert msg(exc_info.value) == expected
 def test_automatic_upcasting():
@@ -188,13 +225,19 @@ def test_mismatched_holder():
     with pytest.raises(RuntimeError) as excinfo:
-    assert re.match('generic_type: type ".*MismatchDerived1" does not have a non-default '
-                    'holder type while its base ".*MismatchBase1" does', str(excinfo.value))
+    assert re.match(
+        'generic_type: type ".*MismatchDerived1" does not have a non-default '
+        'holder type while its base ".*MismatchBase1" does',
+        str(excinfo.value),
+    )
     with pytest.raises(RuntimeError) as excinfo:
-    assert re.match('generic_type: type ".*MismatchDerived2" has a non-default holder type '
-                    'while its base ".*MismatchBase2" does not', str(excinfo.value))
+    assert re.match(
+        'generic_type: type ".*MismatchDerived2" has a non-default holder type '
+        'while its base ".*MismatchBase2" does not',
+        str(excinfo.value),
+    )
 def test_override_static():
@@ -226,20 +269,20 @@ class SubAliased(m.AliasedHasOpNewDelSize):
         a = m.HasOpNewDel()
         b = m.HasOpNewDelSize()
         d = m.HasOpNewDelBoth()
-    assert capture == """
+    assert (
+        capture
+        == """
         A new 8
         B new 4
         D new 32
+    )
     sz_alias = str(m.AliasedHasOpNewDelSize.size_alias)
     sz_noalias = str(m.AliasedHasOpNewDelSize.size_noalias)
     with capture:
         c = m.AliasedHasOpNewDelSize()
         c2 = SubAliased()
-    assert capture == (
-        "C new " + sz_noalias + "\n" +
-        "C new " + sz_alias + "\n"
-    )
+    assert capture == ("C new " + sz_noalias + "\n" + "C new " + sz_alias + "\n")
     with capture:
         del a
@@ -248,21 +291,21 @@ class SubAliased(m.AliasedHasOpNewDelSize):
         del d
-    assert capture == """
+    assert (
+        capture
+        == """
         A delete
         B delete 4
         D delete
+    )
     with capture:
         del c
         del c2
-    assert capture == (
-        "C delete " + sz_noalias + "\n" +
-        "C delete " + sz_alias + "\n"
-    )
+    assert capture == ("C delete " + sz_noalias + "\n" + "C delete " + sz_alias + "\n")
 def test_bind_protected_functions():
@@ -285,7 +328,7 @@ def foo(self):
 def test_brace_initialization():
-    """ Tests that simple POD classes can be constructed using C++11 brace initialization """
+    """Tests that simple POD classes can be constructed using C++11 brace initialization"""
     a = m.BraceInitialization(123, "test")
     assert a.field1 == 123
     assert a.field2 == "test"
@@ -322,19 +365,23 @@ def test_reentrant_implicit_conversion_failure(msg):
     # ensure that there is no runaway reentrant implicit conversion (#1035)
     with pytest.raises(TypeError) as excinfo:
-    assert msg(excinfo.value) == '''
+    assert (
+        msg(excinfo.value)
+        == """
         __init__(): incompatible constructor arguments. The following argument types are supported:
             1. m.class_.BogusImplicitConversion(arg0: m.class_.BogusImplicitConversion)
         Invoked with: 0
-    '''
+    """
+    )
 def test_error_after_conversions():
     with pytest.raises(TypeError) as exc_info:
     assert str(exc_info.value).startswith(
-        "Unable to convert function return value to a Python type!")
+        "Unable to convert function return value to a Python type!"
+    )
 def test_aligned():
@@ -347,8 +394,10 @@ def test_aligned():
 def test_final():
     with pytest.raises(TypeError) as exc_info:
         class PyFinalChild(m.IsFinal):
     assert str(exc_info.value).endswith("is not an acceptable base type")
@@ -356,8 +405,10 @@ class PyFinalChild(m.IsFinal):
 def test_non_final_final():
     with pytest.raises(TypeError) as exc_info:
         class PyNonFinalFinalChild(m.IsNonFinalFinal):
     assert str(exc_info.value).endswith("is not an acceptable base type")
@@ -365,3 +416,58 @@ class PyNonFinalFinalChild(m.IsNonFinalFinal):
 def test_exception_rvalue_abort():
     with pytest.raises(RuntimeError):
+# https://github.com/pybind/pybind11/issues/1568
+def test_multiple_instances_with_same_pointer(capture):
+    n = 100
+    instances = [m.SamePointer() for _ in range(n)]
+    for i in range(n):
+        # We need to reuse the same allocated memory for with a different type,
+        # to ensure the bug in `deregister_instance_impl` is detected. Otherwise
+        # `Py_TYPE(self) == Py_TYPE(it->second)` will still succeed, even though
+        # the `instance` is already deleted.
+        instances[i] = m.Empty()
+    # No assert: if this does not trigger the error
+    #   pybind11_fail("pybind11_object_dealloc(): Tried to deallocate unregistered instance!");
+    # and just completes without crashing, we're good.
+# https://github.com/pybind/pybind11/issues/1624
+def test_base_and_derived_nested_scope():
+    assert issubclass(m.DerivedWithNested, m.BaseWithNested)
+    assert m.BaseWithNested.Nested != m.DerivedWithNested.Nested
+    assert m.BaseWithNested.Nested.get_name() == "BaseWithNested::Nested"
+    assert m.DerivedWithNested.Nested.get_name() == "DerivedWithNested::Nested"
+def test_register_duplicate_class():
+    import types
+    module_scope = types.ModuleType("module_scope")
+    with pytest.raises(RuntimeError) as exc_info:
+        m.register_duplicate_class_name(module_scope)
+    expected = (
+        'generic_type: cannot initialize type "Duplicate": '
+        "an object with that name is already defined"
+    )
+    assert str(exc_info.value) == expected
+    with pytest.raises(RuntimeError) as exc_info:
+        m.register_duplicate_class_type(module_scope)
+    expected = 'generic_type: type "YetAnotherDuplicate" is already registered!'
+    assert str(exc_info.value) == expected
+    class ClassScope:
+        pass
+    with pytest.raises(RuntimeError) as exc_info:
+        m.register_duplicate_nested_class_name(ClassScope)
+    expected = (
+        'generic_type: cannot initialize type "DuplicateNested": '
+        "an object with that name is already defined"
+    )
+    assert str(exc_info.value) == expected
+    with pytest.raises(RuntimeError) as exc_info:
+        m.register_duplicate_nested_class_type(ClassScope)
+    expected = 'generic_type: type "YetAnotherDuplicateNested" is already registered!'
+    assert str(exc_info.value) == expected
diff --git a/wrap/pybind11/tests/test_cmake_build/CMakeLists.txt b/wrap/pybind11/tests/test_cmake_build/CMakeLists.txt
index 0c0578ad3d..8bfaa386ae 100644
--- a/wrap/pybind11/tests/test_cmake_build/CMakeLists.txt
+++ b/wrap/pybind11/tests/test_cmake_build/CMakeLists.txt
@@ -25,7 +25,7 @@ function(pybind11_add_build_test name)
-    list(APPEND build_options "-DPYBIND11_PROJECT_DIR=${pybind11_SOURCE_DIR}")
+    list(APPEND build_options "-Dpybind11_SOURCE_DIR=${pybind11_SOURCE_DIR}")
     list(APPEND build_options "-DCMAKE_PREFIX_PATH=${pybind11_BINARY_DIR}/mock_install")
@@ -55,6 +55,8 @@ function(pybind11_add_build_test name)
   add_dependencies(test_cmake_build test_build_${name})
+possibly_uninitialized(PYTHON_MODULE_EXTENSION Python_INTERPRETER_ID)
@@ -77,3 +79,6 @@ if(PYBIND11_INSTALL)
 add_dependencies(check test_cmake_build)
+add_subdirectory(subdirectory_target EXCLUDE_FROM_ALL)
+add_subdirectory(subdirectory_embed EXCLUDE_FROM_ALL)
diff --git a/wrap/pybind11/tests/test_cmake_build/embed.cpp b/wrap/pybind11/tests/test_cmake_build/embed.cpp
index b9581d2fdb..a3abc8a84d 100644
--- a/wrap/pybind11/tests/test_cmake_build/embed.cpp
+++ b/wrap/pybind11/tests/test_cmake_build/embed.cpp
@@ -12,10 +12,10 @@ int main(int argc, char *argv[]) {
     py::scoped_interpreter guard{};
-    auto m = py::module::import("test_cmake_build");
+    auto m = py::module_::import("test_cmake_build");
     if (m.attr("add")(1, 2).cast<int>() != 3)
         throw std::runtime_error("embed.cpp failed");
-    py::module::import("sys").attr("argv") = py::make_tuple("test.py", "embed.cpp");
+    py::module_::import("sys").attr("argv") = py::make_tuple("test.py", "embed.cpp");
     py::eval_file(test_py_file, py::globals());
diff --git a/wrap/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt b/wrap/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt
index 64ae5c4bff..f7d6939982 100644
--- a/wrap/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt
+++ b/wrap/pybind11/tests/test_cmake_build/installed_embed/CMakeLists.txt
@@ -22,5 +22,7 @@ set_target_properties(test_installed_embed PROPERTIES OUTPUT_NAME test_cmake_bui
 # This may be needed to resolve header conflicts, e.g. between Python release and debug headers.
 set_target_properties(test_installed_embed PROPERTIES NO_SYSTEM_FROM_IMPORTED ON)
-add_custom_target(check_installed_embed $<TARGET_FILE:test_installed_embed>
-                                        ${PROJECT_SOURCE_DIR}/../test.py)
+  check_installed_embed
+  $<TARGET_FILE:test_installed_embed> ${PROJECT_SOURCE_DIR}/../test.py
+  DEPENDS test_installed_embed)
diff --git a/wrap/pybind11/tests/test_cmake_build/installed_function/CMakeLists.txt b/wrap/pybind11/tests/test_cmake_build/installed_function/CMakeLists.txt
index 1a502863c0..d7ca4db55d 100644
--- a/wrap/pybind11/tests/test_cmake_build/installed_function/CMakeLists.txt
+++ b/wrap/pybind11/tests/test_cmake_build/installed_function/CMakeLists.txt
@@ -35,4 +35,5 @@ add_custom_target(
+  DEPENDS test_installed_function)
diff --git a/wrap/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt b/wrap/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt
index b38eb77470..bc5e101f1d 100644
--- a/wrap/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt
+++ b/wrap/pybind11/tests/test_cmake_build/installed_target/CMakeLists.txt
@@ -42,4 +42,5 @@ add_custom_target(
+  DEPENDS test_installed_target)
diff --git a/wrap/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt b/wrap/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
index c7df0cf77c..58cdd7cfd1 100644
--- a/wrap/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
+++ b/wrap/pybind11/tests/test_cmake_build/subdirectory_embed/CMakeLists.txt
@@ -16,15 +16,17 @@ set(PYBIND11_INSTALL
     CACHE BOOL "")
 set(PYBIND11_EXPORT_NAME test_export)
-add_subdirectory(${PYBIND11_PROJECT_DIR} pybind11)
+add_subdirectory("${pybind11_SOURCE_DIR}" pybind11)
 # Test basic target functionality
 add_executable(test_subdirectory_embed ../embed.cpp)
 target_link_libraries(test_subdirectory_embed PRIVATE pybind11::embed)
 set_target_properties(test_subdirectory_embed PROPERTIES OUTPUT_NAME test_cmake_build)
-add_custom_target(check_subdirectory_embed $<TARGET_FILE:test_subdirectory_embed>
-                                           ${PROJECT_SOURCE_DIR}/../test.py)
+  check_subdirectory_embed
+  $<TARGET_FILE:test_subdirectory_embed> "${PROJECT_SOURCE_DIR}/../test.py"
+  DEPENDS test_subdirectory_embed)
 # Test custom export group -- PYBIND11_EXPORT_NAME
 add_library(test_embed_lib ../embed.cpp)
diff --git a/wrap/pybind11/tests/test_cmake_build/subdirectory_function/CMakeLists.txt b/wrap/pybind11/tests/test_cmake_build/subdirectory_function/CMakeLists.txt
index 624c600f85..01557c439a 100644
--- a/wrap/pybind11/tests/test_cmake_build/subdirectory_function/CMakeLists.txt
+++ b/wrap/pybind11/tests/test_cmake_build/subdirectory_function/CMakeLists.txt
@@ -11,7 +11,7 @@ endif()
 project(test_subdirectory_function CXX)
-add_subdirectory("${PYBIND11_PROJECT_DIR}" pybind11)
+add_subdirectory("${pybind11_SOURCE_DIR}" pybind11)
 pybind11_add_module(test_subdirectory_function ../main.cpp)
 set_target_properties(test_subdirectory_function PROPERTIES OUTPUT_NAME test_cmake_build)
@@ -31,4 +31,5 @@ add_custom_target(
+  DEPENDS test_subdirectory_function)
diff --git a/wrap/pybind11/tests/test_cmake_build/subdirectory_target/CMakeLists.txt b/wrap/pybind11/tests/test_cmake_build/subdirectory_target/CMakeLists.txt
index 2471941fb6..ba82fdee2e 100644
--- a/wrap/pybind11/tests/test_cmake_build/subdirectory_target/CMakeLists.txt
+++ b/wrap/pybind11/tests/test_cmake_build/subdirectory_target/CMakeLists.txt
@@ -11,7 +11,7 @@ endif()
 project(test_subdirectory_target CXX)
-add_subdirectory(${PYBIND11_PROJECT_DIR} pybind11)
+add_subdirectory("${pybind11_SOURCE_DIR}" pybind11)
 add_library(test_subdirectory_target MODULE ../main.cpp)
 set_target_properties(test_subdirectory_target PROPERTIES OUTPUT_NAME test_cmake_build)
@@ -37,4 +37,5 @@ add_custom_target(
+  DEPENDS test_subdirectory_target)
diff --git a/wrap/pybind11/tests/test_cmake_build/test.py b/wrap/pybind11/tests/test_cmake_build/test.py
index 87ed5135ff..972a27bea4 100644
--- a/wrap/pybind11/tests/test_cmake_build/test.py
+++ b/wrap/pybind11/tests/test_cmake_build/test.py
@@ -1,6 +1,10 @@
 # -*- coding: utf-8 -*-
 import sys
 import test_cmake_build
+if str is not bytes:  # If not Python2
+    assert isinstance(__file__, str)  # Test this is properly set
 assert test_cmake_build.add(1, 2) == 3
 print("{} imports, runs, and adds: 1 + 2 = 3".format(sys.argv[1]))
diff --git a/wrap/pybind11/tests/test_const_name.cpp b/wrap/pybind11/tests/test_const_name.cpp
new file mode 100644
index 0000000000..5cb3d16c14
--- /dev/null
+++ b/wrap/pybind11/tests/test_const_name.cpp
@@ -0,0 +1,70 @@
+// Copyright (c) 2021 The Pybind Development Team.
+// All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+#include "pybind11_tests.h"
+#if defined(_MSC_VER) && _MSC_VER < 1910
+// MSVC 2015 fails in bizarre ways.
+#else // Only test with MSVC 2017 or newer.
+// IUT = Implementation Under Test
+#    define CONST_NAME_TESTS(TEST_FUNC, IUT)                                                      \
+        std::string TEST_FUNC(int selector) {                                                     \
+            switch (selector) {                                                                   \
+                case 0:                                                                           \
+                    return IUT("").text;                                                          \
+                case 1:                                                                           \
+                    return IUT("A").text;                                                         \
+                case 2:                                                                           \
+                    return IUT("Bd").text;                                                        \
+                case 3:                                                                           \
+                    return IUT("Cef").text;                                                       \
+                case 4:                                                                           \
+                    return IUT<int>().text; /*NOLINT(bugprone-macro-parentheses)*/                \
+                case 5:                                                                           \
+                    return IUT<std::string>().text; /*NOLINT(bugprone-macro-parentheses)*/        \
+                case 6:                                                                           \
+                    return IUT<true>("T1", "T2").text; /*NOLINT(bugprone-macro-parentheses)*/     \
+                case 7:                                                                           \
+                    return IUT<false>("U1", "U2").text; /*NOLINT(bugprone-macro-parentheses)*/    \
+                case 8:                                                                           \
+                    /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/                                \
+                    return IUT<true>(IUT("D1"), IUT("D2")).text;                                  \
+                case 9:                                                                           \
+                    /*NOLINTNEXTLINE(bugprone-macro-parentheses)*/                                \
+                    return IUT<false>(IUT("E1"), IUT("E2")).text;                                 \
+                case 10:                                                                          \
+                    return IUT("KeepAtEnd").text;                                                 \
+                default:                                                                          \
+                    break;                                                                        \
+            }                                                                                     \
+            throw std::runtime_error("Invalid selector value.");                                  \
+        }
+CONST_NAME_TESTS(const_name_tests, py::detail::const_name)
+CONST_NAME_TESTS(underscore_tests, py::detail::_)
+#    endif
+#endif // MSVC >= 2017
+TEST_SUBMODULE(const_name, m) {
+    m.attr("const_name_tests") = "PYBIND11_SKIP_TEST_CONST_NAME";
+    m.def("const_name_tests", const_name_tests);
+    m.attr("underscore_tests") = "PYBIND11_SKIP_TEST_CONST_NAME";
+    m.def("underscore_tests", underscore_tests);
+    m.attr("underscore_tests") = "PYBIND11_DETAIL_UNDERSCORE_BACKWARD_COMPATIBILITY not defined.";
diff --git a/wrap/pybind11/tests/test_const_name.py b/wrap/pybind11/tests/test_const_name.py
new file mode 100644
index 0000000000..d4e45e5e98
--- /dev/null
+++ b/wrap/pybind11/tests/test_const_name.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+import pytest
+import env
+from pybind11_tests import const_name as m
+@pytest.mark.parametrize("func", (m.const_name_tests, m.underscore_tests))
+    "selector, expected",
+    enumerate(
+        (
+            "",
+            "A",
+            "Bd",
+            "Cef",
+            "%",
+            "%",
+            "T1",
+            "U2",
+            "D1",
+            "E2",
+            "KeepAtEnd",
+        )
+    ),
+def test_const_name(func, selector, expected):
+    if isinstance(func, type(u"") if env.PY2 else str):
+        pytest.skip(func)
+    text = func(selector)
+    assert text == expected
diff --git a/wrap/pybind11/tests/test_constants_and_functions.cpp b/wrap/pybind11/tests/test_constants_and_functions.cpp
index f607795593..c0554503fa 100644
--- a/wrap/pybind11/tests/test_constants_and_functions.cpp
+++ b/wrap/pybind11/tests/test_constants_and_functions.cpp
@@ -1,5 +1,6 @@
-    tests/test_constants_and_functions.cpp -- global constants and functions, enumerations, raw byte strings
+    tests/test_constants_and_functions.cpp -- global constants and functions, enumerations, raw
+    byte strings
     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
@@ -33,7 +34,7 @@ py::bytes return_bytes() {
     return std::string(data, 4);
-std::string print_bytes(py::bytes bytes) {
+std::string print_bytes(const py::bytes &bytes) {
     std::string ret = "bytes[";
     const auto value = static_cast<std::string>(bytes);
     for (size_t i = 0; i < value.length(); ++i) {
@@ -46,15 +47,23 @@ std::string print_bytes(py::bytes bytes) {
 // Test that we properly handle C++17 exception specifiers (which are part of the function signature
 // in C++17).  These should all still work before C++17, but don't affect the function signature.
 namespace test_exc_sp {
+// [workaround(intel)] Unable to use noexcept instead of noexcept(true)
+// Make the f1 test basically the same as the f2 test in C++17 mode for the Intel compiler as
+// it fails to compile with a plain noexcept (tested with icc (ICC) 2021.1 Beta 20200827).
+#if defined(__INTEL_COMPILER) && defined(PYBIND11_CPP17)
+int f1(int x) noexcept(true) { return x+1; }
 int f1(int x) noexcept { return x+1; }
 int f2(int x) noexcept(true) { return x+2; }
 int f3(int x) noexcept(false) { return x+3; }
-#if defined(__GNUG__)
+#if defined(__GNUG__) && !defined(__INTEL_COMPILER)
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wdeprecated"
+// NOLINTNEXTLINE(modernize-use-noexcept)
 int f4(int x) throw() { return x+4; } // Deprecated equivalent to noexcept(true)
-#if defined(__GNUG__)
+#if defined(__GNUG__) && !defined(__INTEL_COMPILER)
 #  pragma GCC diagnostic pop
 struct C {
@@ -64,13 +73,15 @@ struct C {
     int m4(int x) const noexcept(true) { return x-4; }
     int m5(int x) noexcept(false) { return x-5; }
     int m6(int x) const noexcept(false) { return x-6; }
-#if defined(__GNUG__)
+#if defined(__GNUG__) && !defined(__INTEL_COMPILER)
 #  pragma GCC diagnostic push
 #  pragma GCC diagnostic ignored "-Wdeprecated"
-    int m7(int x) throw() { return x-7; }
-    int m8(int x) const throw() { return x-8; }
-#if defined(__GNUG__)
+    // NOLINTNEXTLINE(modernize-use-noexcept)
+    int m7(int x) throw() { return x - 7; }
+    // NOLINTNEXTLINE(modernize-use-noexcept)
+    int m8(int x) const throw() { return x - 8; }
+#if defined(__GNUG__) && !defined(__INTEL_COMPILER)
 #  pragma GCC diagnostic pop
@@ -122,6 +133,33 @@ TEST_SUBMODULE(constants_and_functions, m) {
     m.def("f1", f1);
     m.def("f2", f2);
+#if defined(__INTEL_COMPILER)
+#    pragma warning push
+#    pragma warning disable 878 // incompatible exception specifications
     m.def("f3", f3);
+#if defined(__INTEL_COMPILER)
+#    pragma warning pop
     m.def("f4", f4);
+    // test_function_record_leaks
+    struct LargeCapture {
+        // This should always be enough to trigger the alternative branch
+        // where `sizeof(capture) > sizeof(rec->data)`
+        uint64_t zeros[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    };
+    m.def("register_large_capture_with_invalid_arguments", [](py::module_ m) {
+        LargeCapture capture;  // VS 2015's MSVC is acting up if we create the array here
+        m.def("should_raise", [capture](int) { return capture.zeros[9] + 33; }, py::kw_only(), py::arg());
+    });
+    m.def("register_with_raising_repr", [](py::module_ m, const py::object &default_value) {
+        m.def(
+            "should_raise",
+            [](int, int, const py::object &) { return 42; },
+            "some docstring",
+            py::arg_v("x", 42),
+            py::arg_v("y", 42, "<the answer>"),
+            py::arg_v("z", default_value));
+    });
diff --git a/wrap/pybind11/tests/test_constants_and_functions.py b/wrap/pybind11/tests/test_constants_and_functions.py
index b980ccf1cc..ff13bd0f26 100644
--- a/wrap/pybind11/tests/test_constants_and_functions.py
+++ b/wrap/pybind11/tests/test_constants_and_functions.py
@@ -40,3 +40,14 @@ def test_exception_specifiers():
     assert m.f2(53) == 55
     assert m.f3(86) == 89
     assert m.f4(140) == 144
+def test_function_record_leaks():
+    class RaisingRepr:
+        def __repr__(self):
+            raise RuntimeError("Surprise!")
+    with pytest.raises(RuntimeError):
+        m.register_large_capture_with_invalid_arguments(m)
+    with pytest.raises(RuntimeError):
+        m.register_with_raising_repr(m, RaisingRepr())
diff --git a/wrap/pybind11/tests/test_copy_move.cpp b/wrap/pybind11/tests/test_copy_move.cpp
index 05d5c47677..4711a94822 100644
--- a/wrap/pybind11/tests/test_copy_move.cpp
+++ b/wrap/pybind11/tests/test_copy_move.cpp
@@ -37,9 +37,16 @@ template <> lacking_move_ctor empty<lacking_move_ctor>::instance_ = {};
 class MoveOnlyInt {
     MoveOnlyInt() { print_default_created(this); }
-    MoveOnlyInt(int v) : value{std::move(v)} { print_created(this, value); }
-    MoveOnlyInt(MoveOnlyInt &&m) { print_move_created(this, m.value); std::swap(value, m.value); }
-    MoveOnlyInt &operator=(MoveOnlyInt &&m) { print_move_assigned(this, m.value); std::swap(value, m.value); return *this; }
+    explicit MoveOnlyInt(int v) : value{v} { print_created(this, value); }
+    MoveOnlyInt(MoveOnlyInt &&m) noexcept {
+        print_move_created(this, m.value);
+        std::swap(value, m.value);
+    }
+    MoveOnlyInt &operator=(MoveOnlyInt &&m) noexcept {
+        print_move_assigned(this, m.value);
+        std::swap(value, m.value);
+        return *this;
+    }
     MoveOnlyInt(const MoveOnlyInt &) = delete;
     MoveOnlyInt &operator=(const MoveOnlyInt &) = delete;
     ~MoveOnlyInt() { print_destroyed(this); }
@@ -49,9 +56,16 @@ class MoveOnlyInt {
 class MoveOrCopyInt {
     MoveOrCopyInt() { print_default_created(this); }
-    MoveOrCopyInt(int v) : value{std::move(v)} { print_created(this, value); }
-    MoveOrCopyInt(MoveOrCopyInt &&m) { print_move_created(this, m.value); std::swap(value, m.value); }
-    MoveOrCopyInt &operator=(MoveOrCopyInt &&m) { print_move_assigned(this, m.value); std::swap(value, m.value); return *this; }
+    explicit MoveOrCopyInt(int v) : value{v} { print_created(this, value); }
+    MoveOrCopyInt(MoveOrCopyInt &&m) noexcept {
+        print_move_created(this, m.value);
+        std::swap(value, m.value);
+    }
+    MoveOrCopyInt &operator=(MoveOrCopyInt &&m) noexcept {
+        print_move_assigned(this, m.value);
+        std::swap(value, m.value);
+        return *this;
+    }
     MoveOrCopyInt(const MoveOrCopyInt &c) { print_copy_created(this, c.value); value = c.value; }
     MoveOrCopyInt &operator=(const MoveOrCopyInt &c) { print_copy_assigned(this, c.value); value = c.value; return *this; }
     ~MoveOrCopyInt() { print_destroyed(this); }
@@ -61,7 +75,7 @@ class MoveOrCopyInt {
 class CopyOnlyInt {
     CopyOnlyInt() { print_default_created(this); }
-    CopyOnlyInt(int v) : value{std::move(v)} { print_created(this, value); }
+    explicit CopyOnlyInt(int v) : value{v} { print_created(this, value); }
     CopyOnlyInt(const CopyOnlyInt &c) { print_copy_created(this, c.value); value = c.value; }
     CopyOnlyInt &operator=(const CopyOnlyInt &c) { print_copy_assigned(this, c.value); value = c.value; return *this; }
     ~CopyOnlyInt() { print_destroyed(this); }
@@ -71,13 +85,13 @@ class CopyOnlyInt {
 template <> struct type_caster<MoveOnlyInt> {
-    PYBIND11_TYPE_CASTER(MoveOnlyInt, _("MoveOnlyInt"));
+    PYBIND11_TYPE_CASTER(MoveOnlyInt, const_name("MoveOnlyInt"));
     bool load(handle src, bool) { value = MoveOnlyInt(src.cast<int>()); return true; }
     static handle cast(const MoveOnlyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
 template <> struct type_caster<MoveOrCopyInt> {
-    PYBIND11_TYPE_CASTER(MoveOrCopyInt, _("MoveOrCopyInt"));
+    PYBIND11_TYPE_CASTER(MoveOrCopyInt, const_name("MoveOrCopyInt"));
     bool load(handle src, bool) { value = MoveOrCopyInt(src.cast<int>()); return true; }
     static handle cast(const MoveOrCopyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
@@ -86,15 +100,15 @@ template <> struct type_caster<CopyOnlyInt> {
     CopyOnlyInt value;
-    static constexpr auto name = _("CopyOnlyInt");
+    static constexpr auto name = const_name("CopyOnlyInt");
     bool load(handle src, bool) { value = CopyOnlyInt(src.cast<int>()); return true; }
     static handle cast(const CopyOnlyInt &m, return_value_policy r, handle p) { return pybind11::cast(m.value, r, p); }
     static handle cast(const CopyOnlyInt *src, return_value_policy policy, handle parent) {
         if (!src) return none().release();
         return cast(*src, policy, parent);
-    operator CopyOnlyInt*() { return &value; }
-    operator CopyOnlyInt&() { return value; }
+    explicit operator CopyOnlyInt *() { return &value; }
+    explicit operator CopyOnlyInt &() { return value; }
     template <typename T> using cast_op_type = pybind11::detail::cast_op_type<T>;
@@ -111,14 +125,15 @@ TEST_SUBMODULE(copy_move_policies, m) {
     // test_move_and_copy_casts
-    m.def("move_and_copy_casts", [](py::object o) {
+    // NOLINTNEXTLINE(performance-unnecessary-value-param)
+    m.def("move_and_copy_casts", [](const py::object &o) {
         int r = 0;
         r += py::cast<MoveOrCopyInt>(o).value; /* moves */
         r += py::cast<MoveOnlyInt>(o).value; /* moves */
         r += py::cast<CopyOnlyInt>(o).value; /* copies */
-        MoveOrCopyInt m1(py::cast<MoveOrCopyInt>(o)); /* moves */
-        MoveOnlyInt m2(py::cast<MoveOnlyInt>(o)); /* moves */
-        CopyOnlyInt m3(py::cast<CopyOnlyInt>(o)); /* copies */
+        auto m1(py::cast<MoveOrCopyInt>(o)); /* moves */
+        auto m2(py::cast<MoveOnlyInt>(o)); /* moves */
+        auto m3(py::cast<CopyOnlyInt>(o)); /* copies */
         r += m1.value + m2.value + m3.value;
         return r;
@@ -126,7 +141,11 @@ TEST_SUBMODULE(copy_move_policies, m) {
     // test_move_and_copy_loads
     m.def("move_only", [](MoveOnlyInt m) { return m.value; });
+    // Changing this breaks the existing test: needs careful review.
+    // NOLINTNEXTLINE(performance-unnecessary-value-param)
     m.def("move_or_copy", [](MoveOrCopyInt m) { return m.value; });
+    // Changing this breaks the existing test: needs careful review.
+    // NOLINTNEXTLINE(performance-unnecessary-value-param)
     m.def("copy_only", [](CopyOnlyInt m) { return m.value; });
     m.def("move_pair", [](std::pair<MoveOnlyInt, MoveOrCopyInt> p) {
         return p.first.value + p.second.value;
@@ -186,8 +205,7 @@ TEST_SUBMODULE(copy_move_policies, m) {
             void *ptr = std::malloc(bytes);
             if (ptr)
                 return ptr;
-            else
-                throw std::bad_alloc{};
+            throw std::bad_alloc{};
     py::class_<PrivateOpNew>(m, "PrivateOpNew").def_readonly("value", &PrivateOpNew::value);
@@ -201,7 +219,7 @@ TEST_SUBMODULE(copy_move_policies, m) {
     // #389: rvp::move should fall-through to copy on non-movable objects
     struct MoveIssue1 {
         int v;
-        MoveIssue1(int v) : v{v} {}
+        explicit MoveIssue1(int v) : v{v} {}
         MoveIssue1(const MoveIssue1 &c) = default;
         MoveIssue1(MoveIssue1 &&) = delete;
@@ -209,11 +227,12 @@ TEST_SUBMODULE(copy_move_policies, m) {
     struct MoveIssue2 {
         int v;
-        MoveIssue2(int v) : v{v} {}
+        explicit MoveIssue2(int v) : v{v} {}
         MoveIssue2(MoveIssue2 &&) = default;
     py::class_<MoveIssue2>(m, "MoveIssue2").def(py::init<int>()).def_readwrite("value", &MoveIssue2::v);
-    m.def("get_moveissue1", [](int i) { return new MoveIssue1(i); }, py::return_value_policy::move);
+    // #2742: Don't expect ownership of raw pointer to `new`ed object to be transferred with `py::return_value_policy::move`
+    m.def("get_moveissue1", [](int i) { return std::unique_ptr<MoveIssue1>(new MoveIssue1(i)); }, py::return_value_policy::move);
     m.def("get_moveissue2", [](int i) { return MoveIssue2(i); }, py::return_value_policy::move);
diff --git a/wrap/pybind11/tests/test_copy_move.py b/wrap/pybind11/tests/test_copy_move.py
index 6b53993a91..eb1efddd50 100644
--- a/wrap/pybind11/tests/test_copy_move.py
+++ b/wrap/pybind11/tests/test_copy_move.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import pytest
 from pybind11_tests import copy_move_policies as m
@@ -19,7 +20,11 @@ def test_move_and_copy_casts():
     """Cast some values in C++ via custom type casters and count the number of moves/copies."""
     cstats = m.move_and_copy_cstats()
-    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+    c_m, c_mc, c_c = (
+        cstats["MoveOnlyInt"],
+        cstats["MoveOrCopyInt"],
+        cstats["CopyOnlyInt"],
+    )
     # The type move constructions/assignments below each get incremented: the move assignment comes
     # from the type_caster load; the move construction happens when extracting that via a cast or
@@ -43,7 +48,11 @@ def test_move_and_copy_loads():
     cstats = m.move_and_copy_cstats()
-    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+    c_m, c_mc, c_c = (
+        cstats["MoveOnlyInt"],
+        cstats["MoveOrCopyInt"],
+        cstats["CopyOnlyInt"],
+    )
     assert m.move_only(10) == 10  # 1 move, c_m
     assert m.move_or_copy(11) == 11  # 1 move, c_mc
@@ -66,12 +75,16 @@ def test_move_and_copy_loads():
     assert c_m.alive() + c_mc.alive() + c_c.alive() == 0
-@pytest.mark.skipif(not m.has_optional, reason='no <optional>')
+@pytest.mark.skipif(not m.has_optional, reason="no <optional>")
 def test_move_and_copy_load_optional():
     """Tests move/copy loads of std::optional arguments"""
     cstats = m.move_and_copy_cstats()
-    c_m, c_mc, c_c = cstats["MoveOnlyInt"], cstats["MoveOrCopyInt"], cstats["CopyOnlyInt"]
+    c_m, c_mc, c_c = (
+        cstats["MoveOnlyInt"],
+        cstats["MoveOrCopyInt"],
+        cstats["CopyOnlyInt"],
+    )
     # The extra move/copy constructions below come from the std::optional move (which has to move
     # its arguments):
@@ -107,7 +120,7 @@ def test_private_op_new():
 def test_move_fallback():
     """#389: rvp::move should fall-through to copy on non-movable objects"""
-    m2 = m.get_moveissue2(2)
-    assert m2.value == 2
     m1 = m.get_moveissue1(1)
     assert m1.value == 1
+    m2 = m.get_moveissue2(2)
+    assert m2.value == 2
diff --git a/wrap/pybind11/tests/test_custom_type_casters.cpp b/wrap/pybind11/tests/test_custom_type_casters.cpp
index d565add264..48613ee5a1 100644
--- a/wrap/pybind11/tests/test_custom_type_casters.cpp
+++ b/wrap/pybind11/tests/test_custom_type_casters.cpp
@@ -18,7 +18,12 @@ class ArgAlwaysConverts { };
 namespace pybind11 { namespace detail {
 template <> struct type_caster<ArgInspector1> {
+    // Classic
     PYBIND11_TYPE_CASTER(ArgInspector1, _("ArgInspector1"));
+    PYBIND11_TYPE_CASTER(ArgInspector1, const_name("ArgInspector1"));
     bool load(handle src, bool convert) {
         value.arg = "loading ArgInspector1 argument " +
@@ -33,7 +38,7 @@ template <> struct type_caster<ArgInspector1> {
 template <> struct type_caster<ArgInspector2> {
-    PYBIND11_TYPE_CASTER(ArgInspector2, _("ArgInspector2"));
+    PYBIND11_TYPE_CASTER(ArgInspector2, const_name("ArgInspector2"));
     bool load(handle src, bool convert) {
         value.arg = "loading ArgInspector2 argument " +
@@ -48,7 +53,7 @@ template <> struct type_caster<ArgInspector2> {
 template <> struct type_caster<ArgAlwaysConverts> {
-    PYBIND11_TYPE_CASTER(ArgAlwaysConverts, _("ArgAlwaysConverts"));
+    PYBIND11_TYPE_CASTER(ArgAlwaysConverts, const_name("ArgAlwaysConverts"));
     bool load(handle, bool convert) {
         return convert;
@@ -67,13 +72,16 @@ class DestructionTester {
     DestructionTester() { print_default_created(this); }
     ~DestructionTester() { print_destroyed(this); }
     DestructionTester(const DestructionTester &) { print_copy_created(this); }
-    DestructionTester(DestructionTester &&) { print_move_created(this); }
+    DestructionTester(DestructionTester &&) noexcept { print_move_created(this); }
     DestructionTester &operator=(const DestructionTester &) { print_copy_assigned(this); return *this; }
-    DestructionTester &operator=(DestructionTester &&) { print_move_assigned(this); return *this; }
+    DestructionTester &operator=(DestructionTester &&) noexcept {
+        print_move_assigned(this);
+        return *this;
+    }
 namespace pybind11 { namespace detail {
 template <> struct type_caster<DestructionTester> {
-    PYBIND11_TYPE_CASTER(DestructionTester, _("DestructionTester"));
+    PYBIND11_TYPE_CASTER(DestructionTester, const_name("DestructionTester"));
     bool load(handle, bool) { return true; }
     static handle cast(const DestructionTester &, return_value_policy, handle) {
@@ -94,24 +102,35 @@ TEST_SUBMODULE(custom_type_casters, m) {
     class ArgInspector {
         ArgInspector1 f(ArgInspector1 a, ArgAlwaysConverts) { return a; }
-        std::string g(ArgInspector1 a, const ArgInspector1 &b, int c, ArgInspector2 *d, ArgAlwaysConverts) {
+        std::string g(const ArgInspector1 &a,
+                      const ArgInspector1 &b,
+                      int c,
+                      ArgInspector2 *d,
+                      ArgAlwaysConverts) {
             return a.arg + "\n" + b.arg + "\n" + std::to_string(c) + "\n" + d->arg;
         static ArgInspector2 h(ArgInspector2 a, ArgAlwaysConverts) { return a; }
+    // [workaround(intel)] ICC 20/21 breaks with py::arg().stuff, using py::arg{}.stuff works.
     py::class_<ArgInspector>(m, "ArgInspector")
         .def("f", &ArgInspector::f, py::arg(), py::arg() = ArgAlwaysConverts())
         .def("g", &ArgInspector::g, "a"_a.noconvert(), "b"_a, "c"_a.noconvert()=13, "d"_a=ArgInspector2(), py::arg() = ArgAlwaysConverts())
-        .def_static("h", &ArgInspector::h, py::arg().noconvert(), py::arg() = ArgAlwaysConverts())
+        .def_static("h", &ArgInspector::h, py::arg{}.noconvert(), py::arg() = ArgAlwaysConverts())
-    m.def("arg_inspect_func", [](ArgInspector2 a, ArgInspector1 b, ArgAlwaysConverts) { return a.arg + "\n" + b.arg; },
-            py::arg().noconvert(false), py::arg_v(nullptr, ArgInspector1()).noconvert(true), py::arg() = ArgAlwaysConverts());
-    m.def("floats_preferred", [](double f) { return 0.5 * f; }, py::arg("f"));
-    m.def("floats_only", [](double f) { return 0.5 * f; }, py::arg("f").noconvert());
-    m.def("ints_preferred", [](int i) { return i / 2; }, py::arg("i"));
-    m.def("ints_only", [](int i) { return i / 2; }, py::arg("i").noconvert());
+    m.def(
+        "arg_inspect_func",
+        [](const ArgInspector2 &a, const ArgInspector1 &b, ArgAlwaysConverts) {
+            return a.arg + "\n" + b.arg;
+        },
+        py::arg{}.noconvert(false),
+        py::arg_v(nullptr, ArgInspector1()).noconvert(true),
+        py::arg() = ArgAlwaysConverts());
+    m.def("floats_preferred", [](double f) { return 0.5 * f; }, "f"_a);
+    m.def("floats_only", [](double f) { return 0.5 * f; }, "f"_a.noconvert());
+    m.def("ints_preferred", [](int i) { return i / 2; }, "i"_a);
+    m.def("ints_only", [](int i) { return i / 2; }, "i"_a.noconvert());
     // test_custom_caster_destruction
     // Test that `take_ownership` works on types with a custom type caster when given a pointer
diff --git a/wrap/pybind11/tests/test_custom_type_casters.py b/wrap/pybind11/tests/test_custom_type_casters.py
index 9475c45168..a10646ff46 100644
--- a/wrap/pybind11/tests/test_custom_type_casters.py
+++ b/wrap/pybind11/tests/test_custom_type_casters.py
@@ -1,69 +1,96 @@
 # -*- coding: utf-8 -*-
 import pytest
 from pybind11_tests import custom_type_casters as m
 def test_noconvert_args(msg):
     a = m.ArgInspector()
-    assert msg(a.f("hi")) == """
+    assert (
+        msg(a.f("hi"))
+        == """
         loading ArgInspector1 argument WITH conversion allowed.  Argument value = hi
-    assert msg(a.g("this is a", "this is b")) == """
+    )
+    assert (
+        msg(a.g("this is a", "this is b"))
+        == """
         loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
         loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
         loading ArgInspector2 argument WITH conversion allowed.  Argument value = (default arg inspector 2)
     """  # noqa: E501 line too long
-    assert msg(a.g("this is a", "this is b", 42)) == """
+    )
+    assert (
+        msg(a.g("this is a", "this is b", 42))
+        == """
         loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
         loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
         loading ArgInspector2 argument WITH conversion allowed.  Argument value = (default arg inspector 2)
     """  # noqa: E501 line too long
-    assert msg(a.g("this is a", "this is b", 42, "this is d")) == """
+    )
+    assert (
+        msg(a.g("this is a", "this is b", 42, "this is d"))
+        == """
         loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = this is a
         loading ArgInspector1 argument WITH conversion allowed.  Argument value = this is b
         loading ArgInspector2 argument WITH conversion allowed.  Argument value = this is d
-    assert (a.h("arg 1") ==
-            "loading ArgInspector2 argument WITHOUT conversion allowed.  Argument value = arg 1")
-    assert msg(m.arg_inspect_func("A1", "A2")) == """
+    )
+    assert (
+        a.h("arg 1")
+        == "loading ArgInspector2 argument WITHOUT conversion allowed.  Argument value = arg 1"
+    )
+    assert (
+        msg(m.arg_inspect_func("A1", "A2"))
+        == """
         loading ArgInspector2 argument WITH conversion allowed.  Argument value = A1
         loading ArgInspector1 argument WITHOUT conversion allowed.  Argument value = A2
+    )
     assert m.floats_preferred(4) == 2.0
     assert m.floats_only(4.0) == 2.0
     with pytest.raises(TypeError) as excinfo:
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         floats_only(): incompatible function arguments. The following argument types are supported:
             1. (f: float) -> float
         Invoked with: 4
+    )
     assert m.ints_preferred(4) == 2
     assert m.ints_preferred(True) == 0
     with pytest.raises(TypeError) as excinfo:
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         ints_preferred(): incompatible function arguments. The following argument types are supported:
             1. (i: int) -> int
         Invoked with: 4.0
     """  # noqa: E501 line too long
+    )
     assert m.ints_only(4) == 2
     with pytest.raises(TypeError) as excinfo:
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         ints_only(): incompatible function arguments. The following argument types are supported:
             1. (i: int) -> int
         Invoked with: 4.0
+    )
 def test_custom_caster_destruction():
diff --git a/wrap/pybind11/tests/test_custom_type_setup.cpp b/wrap/pybind11/tests/test_custom_type_setup.cpp
new file mode 100644
index 0000000000..42fae05d5d
--- /dev/null
+++ b/wrap/pybind11/tests/test_custom_type_setup.cpp
@@ -0,0 +1,41 @@
+    tests/test_custom_type_setup.cpp -- Tests `pybind11::custom_type_setup`
+    Copyright (c) Google LLC
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+#include <pybind11/pybind11.h>
+#include "pybind11_tests.h"
+namespace py = pybind11;
+namespace {
+struct OwnsPythonObjects {
+    py::object value = py::none();
+} // namespace
+TEST_SUBMODULE(custom_type_setup, m) {
+    py::class_<OwnsPythonObjects> cls(
+        m, "OwnsPythonObjects", py::custom_type_setup([](PyHeapTypeObject *heap_type) {
+            auto *type = &heap_type->ht_type;
+            type->tp_flags |= Py_TPFLAGS_HAVE_GC;
+            type->tp_traverse = [](PyObject *self_base, visitproc visit, void *arg) {
+                auto &self = py::cast<OwnsPythonObjects &>(py::handle(self_base));
+                Py_VISIT(self.value.ptr());
+                return 0;
+            };
+            type->tp_clear = [](PyObject *self_base) {
+                auto &self = py::cast<OwnsPythonObjects &>(py::handle(self_base));
+                self.value = py::none();
+                return 0;
+            };
+        }));
+    cls.def(py::init<>());
+    cls.def_readwrite("value", &OwnsPythonObjects::value);
diff --git a/wrap/pybind11/tests/test_custom_type_setup.py b/wrap/pybind11/tests/test_custom_type_setup.py
new file mode 100644
index 0000000000..ef96f08141
--- /dev/null
+++ b/wrap/pybind11/tests/test_custom_type_setup.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+import gc
+import weakref
+import pytest
+import env  # noqa: F401
+from pybind11_tests import custom_type_setup as m
+def gc_tester():
+    """Tests that an object is garbage collected.
+    Assumes that any unreferenced objects are fully collected after calling
+    `gc.collect()`.  That is true on CPython, but does not appear to reliably
+    hold on PyPy.
+    """
+    weak_refs = []
+    def add_ref(obj):
+        # PyPy does not support `gc.is_tracked`.
+        if hasattr(gc, "is_tracked"):
+            assert gc.is_tracked(obj)
+        weak_refs.append(weakref.ref(obj))
+    yield add_ref
+    gc.collect()
+    for ref in weak_refs:
+        assert ref() is None
+# PyPy does not seem to reliably garbage collect.
+def test_self_cycle(gc_tester):
+    obj = m.OwnsPythonObjects()
+    obj.value = obj
+    gc_tester(obj)
+# PyPy does not seem to reliably garbage collect.
+def test_indirect_cycle(gc_tester):
+    obj = m.OwnsPythonObjects()
+    obj_list = [obj]
+    obj.value = obj_list
+    gc_tester(obj)
diff --git a/wrap/pybind11/tests/test_docstring_options.cpp b/wrap/pybind11/tests/test_docstring_options.cpp
index 8c8f79fd5f..8a97af55fc 100644
--- a/wrap/pybind11/tests/test_docstring_options.cpp
+++ b/wrap/pybind11/tests/test_docstring_options.cpp
@@ -45,6 +45,14 @@ TEST_SUBMODULE(docstring_options, m) {
     m.def("test_function7", [](int, int) {}, py::arg("a"), py::arg("b"), "A custom docstring");
+    {
+        py::options options;
+        options.disable_user_defined_docstrings();
+        options.disable_function_signatures();
+        m.def("test_function8", []() {});
+    }
         py::options options;
diff --git a/wrap/pybind11/tests/test_docstring_options.py b/wrap/pybind11/tests/test_docstring_options.py
index 80ade0f158..8ee6613884 100644
--- a/wrap/pybind11/tests/test_docstring_options.py
+++ b/wrap/pybind11/tests/test_docstring_options.py
@@ -18,10 +18,10 @@ def test_docstring_options():
     assert m.test_overloaded3.__doc__ == "Overload docstr"
     # options.enable_function_signatures()
-    assert m.test_function3.__doc__ .startswith("test_function3(a: int, b: int) -> None")
+    assert m.test_function3.__doc__.startswith("test_function3(a: int, b: int) -> None")
-    assert m.test_function4.__doc__ .startswith("test_function4(a: int, b: int) -> None")
-    assert m.test_function4.__doc__ .endswith("A custom docstring\n")
+    assert m.test_function4.__doc__.startswith("test_function4(a: int, b: int) -> None")
+    assert m.test_function4.__doc__.endswith("A custom docstring\n")
     # options.disable_function_signatures()
     # options.disable_user_defined_docstrings()
@@ -31,8 +31,11 @@ def test_docstring_options():
     assert m.test_function6.__doc__ == "A custom docstring"
     # RAII destructor
-    assert m.test_function7.__doc__ .startswith("test_function7(a: int, b: int) -> None")
-    assert m.test_function7.__doc__ .endswith("A custom docstring\n")
+    assert m.test_function7.__doc__.startswith("test_function7(a: int, b: int) -> None")
+    assert m.test_function7.__doc__.endswith("A custom docstring\n")
+    # when all options are disabled, no docstring (instead of an empty one) should be generated
+    assert m.test_function8.__doc__ is None
     # Suppression of user-defined docstrings for non-function objects
     assert not m.DocstringTestFoo.__doc__
diff --git a/wrap/pybind11/tests/test_eigen.cpp b/wrap/pybind11/tests/test_eigen.cpp
index 56aa1a4a6f..d22a94a1a1 100644
--- a/wrap/pybind11/tests/test_eigen.cpp
+++ b/wrap/pybind11/tests/test_eigen.cpp
@@ -13,6 +13,9 @@
 #include <pybind11/stl.h>
 #if defined(_MSC_VER)
+#if _MSC_VER < 1910  // VS 2015's MSVC
+#  pragma warning(disable: 4127) // C4127: conditional expression is constant
 #  pragma warning(disable: 4996) // C4996: std::unary_negation is deprecated
@@ -54,15 +57,15 @@ void reset_refs() {
 // Returns element 2,1 from a matrix (used to test copy/nocopy)
-double get_elem(Eigen::Ref<const Eigen::MatrixXd> m) { return m(2, 1); };
+double get_elem(const Eigen::Ref<const Eigen::MatrixXd> &m) { return m(2, 1); };
 // Returns a matrix with 10*r + 100*c added to each matrix element (to help test that the matrix
 // reference is referencing rows/columns correctly).
 template <typename MatrixArgType> Eigen::MatrixXd adjust_matrix(MatrixArgType m) {
     Eigen::MatrixXd ret(m);
-    for (int c = 0; c < m.cols(); c++) for (int r = 0; r < m.rows(); r++)
-        ret(r, c) += 10*r + 100*c;
+    for (int c = 0; c < m.cols(); c++)
+        for (int r = 0; r < m.rows(); r++)
+            ret(r, c) += 10*r + 100*c;  // NOLINT(clang-analyzer-core.uninitialized.Assign)
     return ret;
@@ -93,15 +96,18 @@ TEST_SUBMODULE(eigen, m) {
     m.def("double_complex", [](const Eigen::VectorXcf &x) -> Eigen::VectorXcf { return 2.0f * x; });
     m.def("double_threec", [](py::EigenDRef<Eigen::Vector3f> x) { x *= 2; });
     m.def("double_threer", [](py::EigenDRef<Eigen::RowVector3f> x) { x *= 2; });
-    m.def("double_mat_cm", [](Eigen::MatrixXf x) -> Eigen::MatrixXf { return 2.0f * x; });
-    m.def("double_mat_rm", [](DenseMatrixR x) -> DenseMatrixR { return 2.0f * x; });
+    m.def("double_mat_cm", [](const Eigen::MatrixXf &x) -> Eigen::MatrixXf { return 2.0f * x; });
+    m.def("double_mat_rm", [](const DenseMatrixR &x) -> DenseMatrixR { return 2.0f * x; });
     // test_eigen_ref_to_python
     // Different ways of passing via Eigen::Ref; the first and second are the Eigen-recommended
-    m.def("cholesky1", [](Eigen::Ref<MatrixXdR> x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+    m.def("cholesky1",
+          [](const Eigen::Ref<MatrixXdR> &x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
     m.def("cholesky2", [](const Eigen::Ref<const MatrixXdR> &x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
     m.def("cholesky3", [](const Eigen::Ref<MatrixXdR> &x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
-    m.def("cholesky4", [](Eigen::Ref<const MatrixXdR> x) -> Eigen::MatrixXd { return x.llt().matrixL(); });
+    m.def("cholesky4", [](const Eigen::Ref<const MatrixXdR> &x) -> Eigen::MatrixXd {
+        return x.llt().matrixL();
+    });
     // test_eigen_ref_mutators
     // Mutators: these add some value to the given element using Eigen, but Eigen should be mapping into
@@ -175,6 +181,7 @@ TEST_SUBMODULE(eigen, m) {
         ReturnTester() { print_created(this); }
         ~ReturnTester() { print_destroyed(this); }
         static Eigen::MatrixXd create() { return Eigen::MatrixXd::Ones(10, 10); }
+        // NOLINTNEXTLINE(readability-const-return-type)
         static const Eigen::MatrixXd createConst() { return Eigen::MatrixXd::Ones(10, 10); }
         Eigen::MatrixXd &get() { return mat; }
         Eigen::MatrixXd *getPtr() { return &mat; }
@@ -241,21 +248,27 @@ TEST_SUBMODULE(eigen, m) {
     // test_fixed, and various other tests
     m.def("fixed_r", [mat]() -> FixedMatrixR { return FixedMatrixR(mat); });
+    // Our Eigen does a hack which respects constness through the numpy writeable flag.
+    // Therefore, the const return actually affects this type despite being an rvalue.
+    // NOLINTNEXTLINE(readability-const-return-type)
     m.def("fixed_r_const", [mat]() -> const FixedMatrixR { return FixedMatrixR(mat); });
     m.def("fixed_c", [mat]() -> FixedMatrixC { return FixedMatrixC(mat); });
     m.def("fixed_copy_r", [](const FixedMatrixR &m) -> FixedMatrixR { return m; });
     m.def("fixed_copy_c", [](const FixedMatrixC &m) -> FixedMatrixC { return m; });
     // test_mutator_descriptors
-    m.def("fixed_mutator_r", [](Eigen::Ref<FixedMatrixR>) {});
-    m.def("fixed_mutator_c", [](Eigen::Ref<FixedMatrixC>) {});
-    m.def("fixed_mutator_a", [](py::EigenDRef<FixedMatrixC>) {});
+    m.def("fixed_mutator_r", [](const Eigen::Ref<FixedMatrixR> &) {});
+    m.def("fixed_mutator_c", [](const Eigen::Ref<FixedMatrixC> &) {});
+    m.def("fixed_mutator_a", [](const py::EigenDRef<FixedMatrixC> &) {});
     // test_dense
     m.def("dense_r", [mat]() -> DenseMatrixR { return DenseMatrixR(mat); });
     m.def("dense_c", [mat]() -> DenseMatrixC { return DenseMatrixC(mat); });
     m.def("dense_copy_r", [](const DenseMatrixR &m) -> DenseMatrixR { return m; });
     m.def("dense_copy_c", [](const DenseMatrixC &m) -> DenseMatrixC { return m; });
     // test_sparse, test_sparse_signature
-    m.def("sparse_r", [mat]() -> SparseMatrixR { return Eigen::SparseView<Eigen::MatrixXf>(mat); });
+    m.def("sparse_r", [mat]() -> SparseMatrixR {
+        // NOLINTNEXTLINE(clang-analyzer-core.uninitialized.UndefReturn)
+        return Eigen::SparseView<Eigen::MatrixXf>(mat);
+    });
     m.def("sparse_c", [mat]() -> SparseMatrixC { return Eigen::SparseView<Eigen::MatrixXf>(mat); });
     m.def("sparse_copy_r", [](const SparseMatrixR &m) -> SparseMatrixR { return m; });
     m.def("sparse_copy_c", [](const SparseMatrixC &m) -> SparseMatrixC { return m; });
@@ -272,39 +285,47 @@ TEST_SUBMODULE(eigen, m) {
     m.def("cpp_ref_r", [](py::handle m) { return m.cast<Eigen::Ref<MatrixXdR>>()(1, 0); });
     m.def("cpp_ref_any", [](py::handle m) { return m.cast<py::EigenDRef<Eigen::MatrixXd>>()(1, 0); });
+    // [workaround(intel)] ICC 20/21 breaks with py::arg().stuff, using py::arg{}.stuff works.
     // test_nocopy_wrapper
     // Test that we can prevent copying into an argument that would normally copy: First a version
     // that would allow copying (if types or strides don't match) for comparison:
     m.def("get_elem", &get_elem);
     // Now this alternative that calls the tells pybind to fail rather than copy:
-    m.def("get_elem_nocopy", [](Eigen::Ref<const Eigen::MatrixXd> m) -> double { return get_elem(m); },
-            py::arg().noconvert());
+    m.def(
+        "get_elem_nocopy",
+        [](const Eigen::Ref<const Eigen::MatrixXd> &m) -> double { return get_elem(m); },
+        py::arg{}.noconvert());
     // Also test a row-major-only no-copy const ref:
     m.def("get_elem_rm_nocopy", [](Eigen::Ref<const Eigen::Matrix<long, -1, -1, Eigen::RowMajor>> &m) -> long { return m(2, 1); },
-            py::arg().noconvert());
+            py::arg{}.noconvert());
     // test_issue738
     // Issue #738: 1xN or Nx1 2D matrices were neither accepted nor properly copied with an
     // incompatible stride value on the length-1 dimension--but that should be allowed (without
     // requiring a copy!) because the stride value can be safely ignored on a size-1 dimension.
-    m.def("iss738_f1", &adjust_matrix<const Eigen::Ref<const Eigen::MatrixXd> &>, py::arg().noconvert());
-    m.def("iss738_f2", &adjust_matrix<const Eigen::Ref<const Eigen::Matrix<double, -1, -1, Eigen::RowMajor>> &>, py::arg().noconvert());
+    m.def("iss738_f1", &adjust_matrix<const Eigen::Ref<const Eigen::MatrixXd> &>, py::arg{}.noconvert());
+    m.def("iss738_f2", &adjust_matrix<const Eigen::Ref<const Eigen::Matrix<double, -1, -1, Eigen::RowMajor>> &>, py::arg{}.noconvert());
     // test_issue1105
     // Issue #1105: when converting from a numpy two-dimensional (Nx1) or (1xN) value into a dense
-    // eigen Vector or RowVector, the argument would fail to load because the numpy copy would fail:
-    // numpy won't broadcast a Nx1 into a 1-dimensional vector.
-    m.def("iss1105_col", [](Eigen::VectorXd) { return true; });
-    m.def("iss1105_row", [](Eigen::RowVectorXd) { return true; });
+    // eigen Vector or RowVector, the argument would fail to load because the numpy copy would
+    // fail: numpy won't broadcast a Nx1 into a 1-dimensional vector.
+    m.def("iss1105_col", [](const Eigen::VectorXd &) { return true; });
+    m.def("iss1105_row", [](const Eigen::RowVectorXd &) { return true; });
     // test_named_arguments
     // Make sure named arguments are working properly:
-    m.def("matrix_multiply", [](const py::EigenDRef<const Eigen::MatrixXd> A, const py::EigenDRef<const Eigen::MatrixXd> B)
-            -> Eigen::MatrixXd {
-        if (A.cols() != B.rows()) throw std::domain_error("Nonconformable matrices!");
-        return A * B;
-    }, py::arg("A"), py::arg("B"));
+    m.def(
+        "matrix_multiply",
+        [](const py::EigenDRef<const Eigen::MatrixXd> &A,
+           const py::EigenDRef<const Eigen::MatrixXd> &B) -> Eigen::MatrixXd {
+            if (A.cols() != B.rows())
+                throw std::domain_error("Nonconformable matrices!");
+            return A * B;
+        },
+        py::arg("A"),
+        py::arg("B"));
     // test_custom_operator_new
     py::class_<CustomOperatorNew>(m, "CustomOperatorNew")
@@ -316,12 +337,12 @@ TEST_SUBMODULE(eigen, m) {
     // In case of a failure (the caster's temp array does not live long enough), creating
     // a new array (np.ones(10)) increases the chances that the temp array will be garbage
     // collected and/or that its memory will be overridden with different values.
-    m.def("get_elem_direct", [](Eigen::Ref<const Eigen::VectorXd> v) {
-        py::module::import("numpy").attr("ones")(10);
+    m.def("get_elem_direct", [](const Eigen::Ref<const Eigen::VectorXd> &v) {
+        py::module_::import("numpy").attr("ones")(10);
         return v(5);
     m.def("get_elem_indirect", [](std::vector<Eigen::Ref<const Eigen::VectorXd>> v) {
-        py::module::import("numpy").attr("ones")(10);
+        py::module_::import("numpy").attr("ones")(10);
         return v[0](5);
diff --git a/wrap/pybind11/tests/test_eigen.py b/wrap/pybind11/tests/test_eigen.py
index ac68471474..e53826cbbb 100644
--- a/wrap/pybind11/tests/test_eigen.py
+++ b/wrap/pybind11/tests/test_eigen.py
@@ -1,16 +1,21 @@
 # -*- coding: utf-8 -*-
 import pytest
 from pybind11_tests import ConstructorStats
 np = pytest.importorskip("numpy")
 m = pytest.importorskip("pybind11_tests.eigen")
-ref = np.array([[ 0.,  3,  0,  0,  0, 11],
-                [22,  0,  0,  0, 17, 11],
-                [ 7,  5,  0,  1,  0, 11],
-                [ 0,  0,  0,  0,  0, 11],
-                [ 0,  0, 14,  0,  8, 11]])
+ref = np.array(
+    [
+        [0.0, 3, 0, 0, 0, 11],
+        [22, 0, 0, 0, 17, 11],
+        [7, 5, 0, 1, 0, 11],
+        [0, 0, 0, 0, 0, 11],
+        [0, 0, 14, 0, 8, 11],
+    ]
 def assert_equal_ref(mat):
@@ -40,28 +45,37 @@ def test_dense():
 def test_partially_fixed():
-    ref2 = np.array([[0., 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]])
+    ref2 = np.array([[0.0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]])
     np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2), ref2)
     np.testing.assert_array_equal(m.partial_copy_four_rm_c(ref2), ref2)
     np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2[:, 1]), ref2[:, [1]])
     np.testing.assert_array_equal(m.partial_copy_four_rm_c(ref2[0, :]), ref2[[0], :])
-    np.testing.assert_array_equal(m.partial_copy_four_rm_r(ref2[:, (0, 2)]), ref2[:, (0, 2)])
-        m.partial_copy_four_rm_c(ref2[(3, 1, 2), :]), ref2[(3, 1, 2), :])
+        m.partial_copy_four_rm_r(ref2[:, (0, 2)]), ref2[:, (0, 2)]
+    )
+    np.testing.assert_array_equal(
+        m.partial_copy_four_rm_c(ref2[(3, 1, 2), :]), ref2[(3, 1, 2), :]
+    )
     np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2), ref2)
     np.testing.assert_array_equal(m.partial_copy_four_cm_c(ref2), ref2)
     np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2[:, 1]), ref2[:, [1]])
     np.testing.assert_array_equal(m.partial_copy_four_cm_c(ref2[0, :]), ref2[[0], :])
-    np.testing.assert_array_equal(m.partial_copy_four_cm_r(ref2[:, (0, 2)]), ref2[:, (0, 2)])
-        m.partial_copy_four_cm_c(ref2[(3, 1, 2), :]), ref2[(3, 1, 2), :])
+        m.partial_copy_four_cm_r(ref2[:, (0, 2)]), ref2[:, (0, 2)]
+    )
+    np.testing.assert_array_equal(
+        m.partial_copy_four_cm_c(ref2[(3, 1, 2), :]), ref2[(3, 1, 2), :]
+    )
     # TypeError should be raise for a shape mismatch
-    functions = [m.partial_copy_four_rm_r, m.partial_copy_four_rm_c,
-                 m.partial_copy_four_cm_r, m.partial_copy_four_cm_c]
-    matrix_with_wrong_shape = [[1, 2],
-                               [3, 4]]
+    functions = [
+        m.partial_copy_four_rm_r,
+        m.partial_copy_four_rm_c,
+        m.partial_copy_four_cm_r,
+        m.partial_copy_four_cm_c,
+    ]
+    matrix_with_wrong_shape = [[1, 2], [3, 4]]
     for f in functions:
         with pytest.raises(TypeError) as excinfo:
@@ -69,7 +83,7 @@ def test_partially_fixed():
 def test_mutator_descriptors():
-    zr = np.arange(30, dtype='float32').reshape(5, 6)  # row-major
+    zr = np.arange(30, dtype="float32").reshape(5, 6)  # row-major
     zc = zr.reshape(6, 5).transpose()  # column-major
@@ -78,18 +92,21 @@ def test_mutator_descriptors():
     with pytest.raises(TypeError) as excinfo:
-    assert ('(arg0: numpy.ndarray[numpy.float32[5, 6],'
-            ' flags.writeable, flags.c_contiguous]) -> None'
-            in str(excinfo.value))
+    assert (
+        "(arg0: numpy.ndarray[numpy.float32[5, 6],"
+        " flags.writeable, flags.c_contiguous]) -> None" in str(excinfo.value)
+    )
     with pytest.raises(TypeError) as excinfo:
-    assert ('(arg0: numpy.ndarray[numpy.float32[5, 6],'
-            ' flags.writeable, flags.f_contiguous]) -> None'
-            in str(excinfo.value))
+    assert (
+        "(arg0: numpy.ndarray[numpy.float32[5, 6],"
+        " flags.writeable, flags.f_contiguous]) -> None" in str(excinfo.value)
+    )
     with pytest.raises(TypeError) as excinfo:
-        m.fixed_mutator_a(np.array([[1, 2], [3, 4]], dtype='float32'))
-    assert ('(arg0: numpy.ndarray[numpy.float32[5, 6], flags.writeable]) -> None'
-            in str(excinfo.value))
+        m.fixed_mutator_a(np.array([[1, 2], [3, 4]], dtype="float32"))
+    assert "(arg0: numpy.ndarray[numpy.float32[5, 6], flags.writeable]) -> None" in str(
+        excinfo.value
+    )
     zr.flags.writeable = False
     with pytest.raises(TypeError):
@@ -98,26 +115,26 @@ def test_mutator_descriptors():
 def test_cpp_casting():
-    assert m.cpp_copy(m.fixed_r()) == 22.
-    assert m.cpp_copy(m.fixed_c()) == 22.
-    z = np.array([[5., 6], [7, 8]])
-    assert m.cpp_copy(z) == 7.
-    assert m.cpp_copy(m.get_cm_ref()) == 21.
-    assert m.cpp_copy(m.get_rm_ref()) == 21.
-    assert m.cpp_ref_c(m.get_cm_ref()) == 21.
-    assert m.cpp_ref_r(m.get_rm_ref()) == 21.
+    assert m.cpp_copy(m.fixed_r()) == 22.0
+    assert m.cpp_copy(m.fixed_c()) == 22.0
+    z = np.array([[5.0, 6], [7, 8]])
+    assert m.cpp_copy(z) == 7.0
+    assert m.cpp_copy(m.get_cm_ref()) == 21.0
+    assert m.cpp_copy(m.get_rm_ref()) == 21.0
+    assert m.cpp_ref_c(m.get_cm_ref()) == 21.0
+    assert m.cpp_ref_r(m.get_rm_ref()) == 21.0
     with pytest.raises(RuntimeError) as excinfo:
         # Can't reference m.fixed_c: it contains floats, m.cpp_ref_any wants doubles
-    assert 'Unable to cast Python instance' in str(excinfo.value)
+    assert "Unable to cast Python instance" in str(excinfo.value)
     with pytest.raises(RuntimeError) as excinfo:
         # Can't reference m.fixed_r: it contains floats, m.cpp_ref_any wants doubles
-    assert 'Unable to cast Python instance' in str(excinfo.value)
-    assert m.cpp_ref_any(m.ReturnTester.create()) == 1.
+    assert "Unable to cast Python instance" in str(excinfo.value)
+    assert m.cpp_ref_any(m.ReturnTester.create()) == 1.0
-    assert m.cpp_ref_any(m.get_cm_ref()) == 21.
-    assert m.cpp_ref_any(m.get_cm_ref()) == 21.
+    assert m.cpp_ref_any(m.get_cm_ref()) == 21.0
+    assert m.cpp_ref_any(m.get_cm_ref()) == 21.0
 def test_pass_readonly_array():
@@ -149,7 +166,7 @@ def test_nonunit_stride_from_python():
     # Mutator:
-    np.testing.assert_array_equal(counting_mat, [[0., 2, 2], [6, 16, 10], [6, 14, 8]])
+    np.testing.assert_array_equal(counting_mat, [[0.0, 2, 2], [6, 16, 10], [6, 14, 8]])
 def test_negative_stride_from_python(msg):
@@ -178,26 +195,36 @@ def test_negative_stride_from_python(msg):
     # Mutator:
     with pytest.raises(TypeError) as excinfo:
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         double_threer(): incompatible function arguments. The following argument types are supported:
             1. (arg0: numpy.ndarray[numpy.float32[1, 3], flags.writeable]) -> None
-        Invoked with: """ + repr(np.array([ 5.,  4.,  3.], dtype='float32'))  # noqa: E501 line too long
+        Invoked with: """  # noqa: E501 line too long
+        + repr(np.array([5.0, 4.0, 3.0], dtype="float32"))
+    )
     with pytest.raises(TypeError) as excinfo:
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         double_threec(): incompatible function arguments. The following argument types are supported:
             1. (arg0: numpy.ndarray[numpy.float32[3, 1], flags.writeable]) -> None
-        Invoked with: """ + repr(np.array([ 7.,  4.,  1.], dtype='float32'))  # noqa: E501 line too long
+        Invoked with: """  # noqa: E501 line too long
+        + repr(np.array([7.0, 4.0, 1.0], dtype="float32"))
+    )
 def test_nonunit_stride_to_python():
     assert np.all(m.diagonal(ref) == ref.diagonal())
     assert np.all(m.diagonal_1(ref) == ref.diagonal(1))
     for i in range(-5, 7):
-        assert np.all(m.diagonal_n(ref, i) == ref.diagonal(i)), "m.diagonal_n({})".format(i)
+        assert np.all(
+            m.diagonal_n(ref, i) == ref.diagonal(i)
+        ), "m.diagonal_n({})".format(i)
     assert np.all(m.block(ref, 2, 1, 3, 3) == ref[2:5, 1:4])
     assert np.all(m.block(ref, 1, 4, 4, 2) == ref[1:, 4:])
@@ -207,8 +234,10 @@ def test_nonunit_stride_to_python():
 def test_eigen_ref_to_python():
     chols = [m.cholesky1, m.cholesky2, m.cholesky3, m.cholesky4]
     for i, chol in enumerate(chols, start=1):
-        mymat = chol(np.array([[1., 2, 4], [2, 13, 23], [4, 23, 77]]))
-        assert np.all(mymat == np.array([[1, 0, 0], [2, 3, 0], [4, 5, 6]])), "cholesky{}".format(i)
+        mymat = chol(np.array([[1.0, 2, 4], [2, 13, 23], [4, 23, 77]]))
+        assert np.all(
+            mymat == np.array([[1, 0, 0], [2, 3, 0], [4, 5, 6]])
+        ), "cholesky{}".format(i)
 def assign_both(a1, a2, r, c, v):
@@ -325,8 +354,12 @@ def test_eigen_return_references():
     np.testing.assert_array_equal(a_block1, master[3:5, 3:5])
     np.testing.assert_array_equal(a_block2, master[2:5, 2:4])
     np.testing.assert_array_equal(a_block3, master[6:10, 7:10])
-    np.testing.assert_array_equal(a_corn1, master[0::master.shape[0] - 1, 0::master.shape[1] - 1])
-    np.testing.assert_array_equal(a_corn2, master[0::master.shape[0] - 1, 0::master.shape[1] - 1])
+    np.testing.assert_array_equal(
+        a_corn1, master[0 :: master.shape[0] - 1, 0 :: master.shape[1] - 1]
+    )
+    np.testing.assert_array_equal(
+        a_corn2, master[0 :: master.shape[0] - 1, 0 :: master.shape[1] - 1]
+    )
     np.testing.assert_array_equal(a_copy1, c1want)
     np.testing.assert_array_equal(a_copy2, c2want)
@@ -355,16 +388,28 @@ def test_eigen_keepalive():
     cstats = ConstructorStats.get(m.ReturnTester)
     assert cstats.alive() == 1
     unsafe = [a.ref(), a.ref_const(), a.block(1, 2, 3, 4)]
-    copies = [a.copy_get(), a.copy_view(), a.copy_ref(), a.copy_ref_const(),
-              a.copy_block(4, 3, 2, 1)]
+    copies = [
+        a.copy_get(),
+        a.copy_view(),
+        a.copy_ref(),
+        a.copy_ref_const(),
+        a.copy_block(4, 3, 2, 1),
+    ]
     del a
     assert cstats.alive() == 0
     del unsafe
     del copies
-    for meth in [m.ReturnTester.get, m.ReturnTester.get_ptr, m.ReturnTester.view,
-                 m.ReturnTester.view_ptr, m.ReturnTester.ref_safe, m.ReturnTester.ref_const_safe,
-                 m.ReturnTester.corners, m.ReturnTester.corners_const]:
+    for meth in [
+        m.ReturnTester.get,
+        m.ReturnTester.get_ptr,
+        m.ReturnTester.view,
+        m.ReturnTester.view_ptr,
+        m.ReturnTester.ref_safe,
+        m.ReturnTester.ref_const_safe,
+        m.ReturnTester.corners,
+        m.ReturnTester.corners_const,
+    ]:
         assert_keeps_alive(m.ReturnTester, meth)
     for meth in [m.ReturnTester.block_safe, m.ReturnTester.block_const]:
@@ -374,18 +419,18 @@ def test_eigen_keepalive():
 def test_eigen_ref_mutators():
     """Tests Eigen's ability to mutate numpy values"""
-    orig = np.array([[1., 2, 3], [4, 5, 6], [7, 8, 9]])
+    orig = np.array([[1.0, 2, 3], [4, 5, 6], [7, 8, 9]])
     zr = np.array(orig)
-    zc = np.array(orig, order='F')
+    zc = np.array(orig, order="F")
     m.add_rm(zr, 1, 0, 100)
-    assert np.all(zr == np.array([[1., 2, 3], [104, 5, 6], [7, 8, 9]]))
+    assert np.all(zr == np.array([[1.0, 2, 3], [104, 5, 6], [7, 8, 9]]))
     m.add_cm(zc, 1, 0, 200)
-    assert np.all(zc == np.array([[1., 2, 3], [204, 5, 6], [7, 8, 9]]))
+    assert np.all(zc == np.array([[1.0, 2, 3], [204, 5, 6], [7, 8, 9]]))
     m.add_any(zr, 1, 0, 20)
-    assert np.all(zr == np.array([[1., 2, 3], [124, 5, 6], [7, 8, 9]]))
+    assert np.all(zr == np.array([[1.0, 2, 3], [124, 5, 6], [7, 8, 9]]))
     m.add_any(zc, 1, 0, 10)
-    assert np.all(zc == np.array([[1., 2, 3], [214, 5, 6], [7, 8, 9]]))
+    assert np.all(zc == np.array([[1.0, 2, 3], [214, 5, 6], [7, 8, 9]]))
     # Can't reference a col-major array with a row-major Ref, and vice versa:
     with pytest.raises(TypeError):
@@ -406,8 +451,8 @@ def test_eigen_ref_mutators():
     cornersr = zr[0::2, 0::2]
     cornersc = zc[0::2, 0::2]
-    assert np.all(cornersr == np.array([[1., 3], [7, 9]]))
-    assert np.all(cornersc == np.array([[1., 3], [7, 9]]))
+    assert np.all(cornersr == np.array([[1.0, 3], [7, 9]]))
+    assert np.all(cornersc == np.array([[1.0, 3], [7, 9]]))
     with pytest.raises(TypeError):
         m.add_rm(cornersr, 0, 1, 25)
@@ -419,8 +464,8 @@ def test_eigen_ref_mutators():
         m.add_cm(cornersc, 0, 1, 25)
     m.add_any(cornersr, 0, 1, 25)
     m.add_any(cornersc, 0, 1, 44)
-    assert np.all(zr == np.array([[1., 2, 28], [4, 5, 6], [7, 8, 9]]))
-    assert np.all(zc == np.array([[1., 2, 47], [4, 5, 6], [7, 8, 9]]))
+    assert np.all(zr == np.array([[1.0, 2, 28], [4, 5, 6], [7, 8, 9]]))
+    assert np.all(zc == np.array([[1.0, 2, 47], [4, 5, 6], [7, 8, 9]]))
     # You shouldn't be allowed to pass a non-writeable array to a mutating Eigen method:
     zro = zr[0:4, 0:4]
@@ -458,7 +503,7 @@ def test_numpy_ref_mutators():
     assert not zrro.flags.owndata and not zrro.flags.writeable
     zc[1, 2] = 99
-    expect = np.array([[11., 12, 13], [21, 22, 99], [31, 32, 33]])
+    expect = np.array([[11.0, 12, 13], [21, 22, 99], [31, 32, 33]])
     # We should have just changed zc, of course, but also zcro and the original eigen matrix
     assert np.all(zc == expect)
     assert np.all(zcro == expect)
@@ -506,18 +551,20 @@ def test_both_ref_mutators():
     assert np.all(z == z3)
     assert np.all(z == z4)
     assert np.all(z == z5)
-    expect = np.array([[0., 22, 20], [31, 37, 33], [41, 42, 38]])
+    expect = np.array([[0.0, 22, 20], [31, 37, 33], [41, 42, 38]])
     assert np.all(z == expect)
-    y = np.array(range(100), dtype='float64').reshape(10, 10)
+    y = np.array(range(100), dtype="float64").reshape(10, 10)
     y2 = m.incr_matrix_any(y, 10)  # np -> eigen -> np
-    y3 = m.incr_matrix_any(y2[0::2, 0::2], -33)  # np -> eigen -> np slice -> np -> eigen -> np
+    y3 = m.incr_matrix_any(
+        y2[0::2, 0::2], -33
+    )  # np -> eigen -> np slice -> np -> eigen -> np
     y4 = m.even_rows(y3)  # numpy -> eigen slice -> (... y3)
     y5 = m.even_cols(y4)  # numpy -> eigen slice -> (... y4)
     y6 = m.incr_matrix_any(y5, 1000)  # numpy -> eigen -> (... y5)
     # Apply same mutations using just numpy:
-    yexpect = np.array(range(100), dtype='float64').reshape(10, 10)
+    yexpect = np.array(range(100), dtype="float64").reshape(10, 10)
     yexpect += 10
     yexpect[0::2, 0::2] -= 33
     yexpect[0::4, 0::4] += 1000
@@ -532,10 +579,14 @@ def test_both_ref_mutators():
 def test_nocopy_wrapper():
     # get_elem requires a column-contiguous matrix reference, but should be
     # callable with other types of matrix (via copying):
-    int_matrix_colmajor = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], order='F')
-    dbl_matrix_colmajor = np.array(int_matrix_colmajor, dtype='double', order='F', copy=True)
-    int_matrix_rowmajor = np.array(int_matrix_colmajor, order='C', copy=True)
-    dbl_matrix_rowmajor = np.array(int_matrix_rowmajor, dtype='double', order='C', copy=True)
+    int_matrix_colmajor = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], order="F")
+    dbl_matrix_colmajor = np.array(
+        int_matrix_colmajor, dtype="double", order="F", copy=True
+    )
+    int_matrix_rowmajor = np.array(int_matrix_colmajor, order="C", copy=True)
+    dbl_matrix_rowmajor = np.array(
+        int_matrix_rowmajor, dtype="double", order="C", copy=True
+    )
     # All should be callable via get_elem:
     assert m.get_elem(int_matrix_colmajor) == 8
@@ -546,32 +597,38 @@ def test_nocopy_wrapper():
     # All but the second should fail with m.get_elem_nocopy:
     with pytest.raises(TypeError) as excinfo:
-    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
-            ', flags.f_contiguous' in str(excinfo.value))
+    assert "get_elem_nocopy(): incompatible function arguments." in str(
+        excinfo.value
+    ) and ", flags.f_contiguous" in str(excinfo.value)
     assert m.get_elem_nocopy(dbl_matrix_colmajor) == 8
     with pytest.raises(TypeError) as excinfo:
-    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
-            ', flags.f_contiguous' in str(excinfo.value))
+    assert "get_elem_nocopy(): incompatible function arguments." in str(
+        excinfo.value
+    ) and ", flags.f_contiguous" in str(excinfo.value)
     with pytest.raises(TypeError) as excinfo:
-    assert ('get_elem_nocopy(): incompatible function arguments.' in str(excinfo.value) and
-            ', flags.f_contiguous' in str(excinfo.value))
+    assert "get_elem_nocopy(): incompatible function arguments." in str(
+        excinfo.value
+    ) and ", flags.f_contiguous" in str(excinfo.value)
     # For the row-major test, we take a long matrix in row-major, so only the third is allowed:
     with pytest.raises(TypeError) as excinfo:
-    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
-            ', flags.c_contiguous' in str(excinfo.value))
+    assert "get_elem_rm_nocopy(): incompatible function arguments." in str(
+        excinfo.value
+    ) and ", flags.c_contiguous" in str(excinfo.value)
     with pytest.raises(TypeError) as excinfo:
-    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
-            ', flags.c_contiguous' in str(excinfo.value))
+    assert "get_elem_rm_nocopy(): incompatible function arguments." in str(
+        excinfo.value
+    ) and ", flags.c_contiguous" in str(excinfo.value)
     assert m.get_elem_rm_nocopy(int_matrix_rowmajor) == 8
     with pytest.raises(TypeError) as excinfo:
-    assert ('get_elem_rm_nocopy(): incompatible function arguments.' in str(excinfo.value) and
-            ', flags.c_contiguous' in str(excinfo.value))
+    assert "get_elem_rm_nocopy(): incompatible function arguments." in str(
+        excinfo.value
+    ) and ", flags.c_contiguous" in str(excinfo.value)
 def test_eigen_ref_life_support():
@@ -589,12 +646,9 @@ def test_eigen_ref_life_support():
 def test_special_matrix_objects():
-    assert np.all(m.incr_diag(7) == np.diag([1., 2, 3, 4, 5, 6, 7]))
+    assert np.all(m.incr_diag(7) == np.diag([1.0, 2, 3, 4, 5, 6, 7]))
-    asymm = np.array([[ 1.,  2,  3,  4],
-                      [ 5,  6,  7,  8],
-                      [ 9, 10, 11, 12],
-                      [13, 14, 15, 16]])
+    asymm = np.array([[1.0, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
     symm_lower = np.array(asymm)
     symm_upper = np.array(asymm)
     for i in range(4):
@@ -607,41 +661,51 @@ def test_special_matrix_objects():
 def test_dense_signature(doc):
-    assert doc(m.double_col) == """
+    assert (
+        doc(m.double_col)
+        == """
         double_col(arg0: numpy.ndarray[numpy.float32[m, 1]]) -> numpy.ndarray[numpy.float32[m, 1]]
-    assert doc(m.double_row) == """
+    )
+    assert (
+        doc(m.double_row)
+        == """
         double_row(arg0: numpy.ndarray[numpy.float32[1, n]]) -> numpy.ndarray[numpy.float32[1, n]]
-    assert doc(m.double_complex) == ("""
+    )
+    assert doc(m.double_complex) == (
+        """
         double_complex(arg0: numpy.ndarray[numpy.complex64[m, 1]])"""
-                                     """ -> numpy.ndarray[numpy.complex64[m, 1]]
-    """)
-    assert doc(m.double_mat_rm) == ("""
+        """ -> numpy.ndarray[numpy.complex64[m, 1]]
+    """
+    )
+    assert doc(m.double_mat_rm) == (
+        """
         double_mat_rm(arg0: numpy.ndarray[numpy.float32[m, n]])"""
-                                    """ -> numpy.ndarray[numpy.float32[m, n]]
-    """)
+        """ -> numpy.ndarray[numpy.float32[m, n]]
+    """
+    )
 def test_named_arguments():
     a = np.array([[1.0, 2], [3, 4], [5, 6]])
     b = np.ones((2, 1))
-    assert np.all(m.matrix_multiply(a, b) == np.array([[3.], [7], [11]]))
-    assert np.all(m.matrix_multiply(A=a, B=b) == np.array([[3.], [7], [11]]))
-    assert np.all(m.matrix_multiply(B=b, A=a) == np.array([[3.], [7], [11]]))
+    assert np.all(m.matrix_multiply(a, b) == np.array([[3.0], [7], [11]]))
+    assert np.all(m.matrix_multiply(A=a, B=b) == np.array([[3.0], [7], [11]]))
+    assert np.all(m.matrix_multiply(B=b, A=a) == np.array([[3.0], [7], [11]]))
     with pytest.raises(ValueError) as excinfo:
         m.matrix_multiply(b, a)
-    assert str(excinfo.value) == 'Nonconformable matrices!'
+    assert str(excinfo.value) == "Nonconformable matrices!"
     with pytest.raises(ValueError) as excinfo:
         m.matrix_multiply(A=b, B=a)
-    assert str(excinfo.value) == 'Nonconformable matrices!'
+    assert str(excinfo.value) == "Nonconformable matrices!"
     with pytest.raises(ValueError) as excinfo:
         m.matrix_multiply(B=a, A=b)
-    assert str(excinfo.value) == 'Nonconformable matrices!'
+    assert str(excinfo.value) == "Nonconformable matrices!"
 def test_sparse():
@@ -656,21 +720,31 @@ def test_sparse():
 def test_sparse_signature(doc):
-    assert doc(m.sparse_copy_r) == """
+    assert (
+        doc(m.sparse_copy_r)
+        == """
         sparse_copy_r(arg0: scipy.sparse.csr_matrix[numpy.float32]) -> scipy.sparse.csr_matrix[numpy.float32]
     """  # noqa: E501 line too long
-    assert doc(m.sparse_copy_c) == """
+    )
+    assert (
+        doc(m.sparse_copy_c)
+        == """
         sparse_copy_c(arg0: scipy.sparse.csc_matrix[numpy.float32]) -> scipy.sparse.csc_matrix[numpy.float32]
     """  # noqa: E501 line too long
+    )
 def test_issue738():
     """Ignore strides on a length-1 dimension (even if they would be incompatible length > 1)"""
-    assert np.all(m.iss738_f1(np.array([[1., 2, 3]])) == np.array([[1., 102, 203]]))
-    assert np.all(m.iss738_f1(np.array([[1.], [2], [3]])) == np.array([[1.], [12], [23]]))
-    assert np.all(m.iss738_f2(np.array([[1., 2, 3]])) == np.array([[1., 102, 203]]))
-    assert np.all(m.iss738_f2(np.array([[1.], [2], [3]])) == np.array([[1.], [12], [23]]))
+    assert np.all(m.iss738_f1(np.array([[1.0, 2, 3]])) == np.array([[1.0, 102, 203]]))
+    assert np.all(
+        m.iss738_f1(np.array([[1.0], [2], [3]])) == np.array([[1.0], [12], [23]])
+    )
+    assert np.all(m.iss738_f2(np.array([[1.0, 2, 3]])) == np.array([[1.0, 102, 203]]))
+    assert np.all(
+        m.iss738_f2(np.array([[1.0], [2], [3]])) == np.array([[1.0], [12], [23]])
+    )
 def test_issue1105():
diff --git a/wrap/pybind11/tests/test_embed/CMakeLists.txt b/wrap/pybind11/tests/test_embed/CMakeLists.txt
index 2e298fa7e4..edb8961a7d 100644
--- a/wrap/pybind11/tests/test_embed/CMakeLists.txt
+++ b/wrap/pybind11/tests/test_embed/CMakeLists.txt
@@ -1,10 +1,13 @@
+possibly_uninitialized(PYTHON_MODULE_EXTENSION Python_INTERPRETER_ID)
+  message(STATUS "Skipping embed test on PyPy")
   add_custom_target(cpptest) # Dummy target on PyPy. Embedding is not supported.
   set(_suppress_unused_variable_warning "${DOWNLOAD_CATCH}")
-find_package(Catch 2.13.0)
+find_package(Catch 2.13.2)
   message(STATUS "Building interpreter tests using Catch v${CATCH_VERSION}")
@@ -22,12 +25,13 @@ pybind11_enable_warnings(test_embed)
 target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
-  file(COPY test_interpreter.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
+  file(COPY test_interpreter.py test_trampoline.py DESTINATION "${CMAKE_CURRENT_BINARY_DIR}")
   COMMAND "$<TARGET_FILE:test_embed>"
+  DEPENDS test_embed
 pybind11_add_module(external_module THIN_LTO external_module.cpp)
diff --git a/wrap/pybind11/tests/test_embed/external_module.cpp b/wrap/pybind11/tests/test_embed/external_module.cpp
index e9a6058b17..4909522993 100644
--- a/wrap/pybind11/tests/test_embed/external_module.cpp
+++ b/wrap/pybind11/tests/test_embed/external_module.cpp
@@ -9,7 +9,7 @@ namespace py = pybind11;
 PYBIND11_MODULE(external_module, m) {
     class A {
-        A(int value) : v{value} {};
+        explicit A(int value) : v{value} {};
         int v;
diff --git a/wrap/pybind11/tests/test_embed/test_interpreter.cpp b/wrap/pybind11/tests/test_embed/test_interpreter.cpp
index 753ce54dcd..508975eb3c 100644
--- a/wrap/pybind11/tests/test_embed/test_interpreter.cpp
+++ b/wrap/pybind11/tests/test_embed/test_interpreter.cpp
@@ -8,20 +8,23 @@
 #include <catch.hpp>
-#include <thread>
+#include <cstdlib>
 #include <fstream>
 #include <functional>
+#include <thread>
+#include <utility>
 namespace py = pybind11;
 using namespace py::literals;
 class Widget {
-    Widget(std::string message) : message(message) { }
+    explicit Widget(std::string message) : message(std::move(message)) {}
     virtual ~Widget() = default;
     std::string the_message() const { return message; }
     virtual int the_answer() const = 0;
+    virtual std::string argv0() const = 0;
     std::string message;
@@ -31,6 +34,23 @@ class PyWidget final : public Widget {
     using Widget::Widget;
     int the_answer() const override { PYBIND11_OVERRIDE_PURE(int, Widget, the_answer); }
+    std::string argv0() const override { PYBIND11_OVERRIDE_PURE(std::string, Widget, argv0); }
+class test_override_cache_helper {
+    virtual int func() { return 0; }
+    test_override_cache_helper() = default;
+    virtual ~test_override_cache_helper() = default;
+    // Non-copyable
+    test_override_cache_helper &operator=(test_override_cache_helper const &Right) = delete;
+    test_override_cache_helper(test_override_cache_helper const &Copy) = delete;
+class test_override_cache_helper_trampoline : public test_override_cache_helper {
+    int func() override { PYBIND11_OVERRIDE(int, test_override_cache_helper, func); }
 PYBIND11_EMBEDDED_MODULE(widget_module, m) {
@@ -41,6 +61,12 @@ PYBIND11_EMBEDDED_MODULE(widget_module, m) {
     m.def("add", [](int i, int j) { return i + j; });
+PYBIND11_EMBEDDED_MODULE(trampoline_module, m) {
+    py::class_<test_override_cache_helper, test_override_cache_helper_trampoline, std::shared_ptr<test_override_cache_helper>>(m, "test_override_cache_helper")
+        .def(py::init_alias<>())
+        .def("func", &test_override_cache_helper::func);
 PYBIND11_EMBEDDED_MODULE(throw_exception, ) {
     throw std::runtime_error("C++ Error");
@@ -51,17 +77,17 @@ PYBIND11_EMBEDDED_MODULE(throw_error_already_set, ) {
 TEST_CASE("Pass classes and data between modules defined in C++ and Python") {
-    auto module = py::module::import("test_interpreter");
-    REQUIRE(py::hasattr(module, "DerivedWidget"));
+    auto module_ = py::module_::import("test_interpreter");
+    REQUIRE(py::hasattr(module_, "DerivedWidget"));
-    auto locals = py::dict("hello"_a="Hello, World!", "x"_a=5, **module.attr("__dict__"));
+    auto locals = py::dict("hello"_a="Hello, World!", "x"_a=5, **module_.attr("__dict__"));
         widget = DerivedWidget("{} - {}".format(hello, x))
         message = widget.the_message
     )", py::globals(), locals);
     REQUIRE(locals["message"].cast<std::string>() == "Hello, World! - 5");
-    auto py_widget = module.attr("DerivedWidget")("The question");
+    auto py_widget = module_.attr("DerivedWidget")("The question");
     auto message = py_widget.attr("the_message");
     REQUIRE(message.cast<std::string>() == "The question");
@@ -69,12 +95,55 @@ TEST_CASE("Pass classes and data between modules defined in C++ and Python") {
     REQUIRE(cpp_widget.the_answer() == 42);
+TEST_CASE("Override cache") {
+    auto module_ = py::module_::import("test_trampoline");
+    REQUIRE(py::hasattr(module_, "func"));
+    REQUIRE(py::hasattr(module_, "func2"));
+    auto locals = py::dict(**module_.attr("__dict__"));
+    int i = 0;
+    for (; i < 1500; ++i) {
+        std::shared_ptr<test_override_cache_helper> p_obj;
+        std::shared_ptr<test_override_cache_helper> p_obj2;
+        py::object loc_inst = locals["func"]();
+        p_obj = py::cast<std::shared_ptr<test_override_cache_helper>>(loc_inst);
+        int ret = p_obj->func();
+        REQUIRE(ret == 42);
+        loc_inst = locals["func2"]();
+        p_obj2 = py::cast<std::shared_ptr<test_override_cache_helper>>(loc_inst);
+        p_obj2->func();
+    }
 TEST_CASE("Import error handling") {
-    REQUIRE_NOTHROW(py::module::import("widget_module"));
-    REQUIRE_THROWS_WITH(py::module::import("throw_exception"),
+    REQUIRE_NOTHROW(py::module_::import("widget_module"));
+    REQUIRE_THROWS_WITH(py::module_::import("throw_exception"),
                         "ImportError: C++ Error");
-    REQUIRE_THROWS_WITH(py::module::import("throw_error_already_set"),
+#if PY_VERSION_HEX >= 0x03030000
+    REQUIRE_THROWS_WITH(py::module_::import("throw_error_already_set"),
+                        Catch::Contains("ImportError: initialization failed"));
+    auto locals = py::dict("is_keyerror"_a=false, "message"_a="not set");
+    py::exec(R"(
+        try:
+            import throw_error_already_set
+        except ImportError as e:
+            is_keyerror = type(e.__cause__) == KeyError
+            message = str(e.__cause__)
+    )", py::globals(), locals);
+    REQUIRE(locals["is_keyerror"].cast<bool>() == true);
+    REQUIRE(locals["message"].cast<std::string>() == "'missing'");
+    REQUIRE_THROWS_WITH(py::module_::import("throw_error_already_set"),
                         Catch::Contains("ImportError: KeyError"));
 TEST_CASE("There can be only one interpreter") {
@@ -102,19 +171,19 @@ bool has_pybind11_internals_builtin() {
 bool has_pybind11_internals_static() {
     auto **&ipp = py::detail::get_internals_pp();
-    return ipp && *ipp;
+    return (ipp != nullptr) && (*ipp != nullptr);
 TEST_CASE("Restart the interpreter") {
     // Verify pre-restart state.
-    REQUIRE(py::module::import("widget_module").attr("add")(1, 2).cast<int>() == 3);
+    REQUIRE(py::module_::import("widget_module").attr("add")(1, 2).cast<int>() == 3);
-    REQUIRE(py::module::import("external_module").attr("A")(123).attr("value").cast<int>() == 123);
+    REQUIRE(py::module_::import("external_module").attr("A")(123).attr("value").cast<int>() == 123);
     // local and foreign module internals should point to the same internals:
     REQUIRE(reinterpret_cast<uintptr_t>(*py::detail::get_internals_pp()) ==
-            py::module::import("external_module").attr("internals_at")().cast<uintptr_t>());
+            py::module_::import("external_module").attr("internals_at")().cast<uintptr_t>());
     // Restart the interpreter.
@@ -130,14 +199,14 @@ TEST_CASE("Restart the interpreter") {
     REQUIRE(reinterpret_cast<uintptr_t>(*py::detail::get_internals_pp()) ==
-            py::module::import("external_module").attr("internals_at")().cast<uintptr_t>());
+            py::module_::import("external_module").attr("internals_at")().cast<uintptr_t>());
     // Make sure that an interpreter with no get_internals() created until finalize still gets the
     // internals destroyed
     bool ran = false;
-    py::module::import("__main__").attr("internals_destroy_test") =
+    py::module_::import("__main__").attr("internals_destroy_test") =
         py::capsule(&ran, [](void *ran) { py::detail::get_internals(); *static_cast<bool *>(ran) = true; });
@@ -149,20 +218,20 @@ TEST_CASE("Restart the interpreter") {
     // C++ modules can be reloaded.
-    auto cpp_module = py::module::import("widget_module");
+    auto cpp_module = py::module_::import("widget_module");
     REQUIRE(cpp_module.attr("add")(1, 2).cast<int>() == 3);
     // C++ type information is reloaded and can be used in python modules.
-    auto py_module = py::module::import("test_interpreter");
+    auto py_module = py::module_::import("test_interpreter");
     auto py_widget = py_module.attr("DerivedWidget")("Hello after restart");
     REQUIRE(py_widget.attr("the_message").cast<std::string>() == "Hello after restart");
 TEST_CASE("Subinterpreter") {
     // Add tags to the modules in the main interpreter and test the basics.
-    py::module::import("__main__").attr("main_tag") = "main interpreter";
+    py::module_::import("__main__").attr("main_tag") = "main interpreter";
-        auto m = py::module::import("widget_module");
+        auto m = py::module_::import("widget_module");
         m.attr("extension_module_tag") = "added to module in main interpreter";
         REQUIRE(m.attr("add")(1, 2).cast<int>() == 3);
@@ -181,9 +250,9 @@ TEST_CASE("Subinterpreter") {
     // Modules tags should be gone.
-    REQUIRE_FALSE(py::hasattr(py::module::import("__main__"), "tag"));
+    REQUIRE_FALSE(py::hasattr(py::module_::import("__main__"), "tag"));
-        auto m = py::module::import("widget_module");
+        auto m = py::module_::import("widget_module");
         REQUIRE_FALSE(py::hasattr(m, "extension_module_tag"));
         // Function bindings should still work.
@@ -194,8 +263,8 @@ TEST_CASE("Subinterpreter") {
-    REQUIRE(py::hasattr(py::module::import("__main__"), "main_tag"));
-    REQUIRE(py::hasattr(py::module::import("widget_module"), "extension_module_tag"));
+    REQUIRE(py::hasattr(py::module_::import("__main__"), "main_tag"));
+    REQUIRE(py::hasattr(py::module_::import("widget_module"), "extension_module_tag"));
 TEST_CASE("Execution frame") {
@@ -245,7 +314,7 @@ TEST_CASE("Reload module from file") {
     // Disable generation of cached bytecode (.pyc files) for this test, otherwise
     // Python might pick up an old version from the cache instead of the new versions
     // of the .py files generated below
-    auto sys = py::module::import("sys");
+    auto sys = py::module_::import("sys");
     bool dont_write_bytecode = sys.attr("dont_write_bytecode").cast<bool>();
     sys.attr("dont_write_bytecode") = true;
     // Reset the value at scope exit
@@ -267,8 +336,8 @@ TEST_CASE("Reload module from file") {
     // Import the module from file
-    auto module = py::module::import(module_name.c_str());
-    int result = module.attr("test")().cast<int>();
+    auto module_ = py::module_::import(module_name.c_str());
+    int result = module_.attr("test")().cast<int>();
     REQUIRE(result == 1);
     // Update the module .py file with a small change
@@ -278,7 +347,29 @@ TEST_CASE("Reload module from file") {
     // Reload the module
-    module.reload();
-    result = module.attr("test")().cast<int>();
+    module_.reload();
+    result = module_.attr("test")().cast<int>();
     REQUIRE(result == 2);
+TEST_CASE("sys.argv gets initialized properly") {
+    py::finalize_interpreter();
+    {
+        py::scoped_interpreter default_scope;
+        auto module = py::module::import("test_interpreter");
+        auto py_widget = module.attr("DerivedWidget")("The question");
+        const auto &cpp_widget = py_widget.cast<const Widget &>();
+        REQUIRE(cpp_widget.argv0().empty());
+    }
+    {
+        char *argv[] = {strdup("a.out")};
+        py::scoped_interpreter argv_scope(true, 1, argv);
+        std::free(argv[0]);
+        auto module = py::module::import("test_interpreter");
+        auto py_widget = module.attr("DerivedWidget")("The question");
+        const auto &cpp_widget = py_widget.cast<const Widget &>();
+        REQUIRE(cpp_widget.argv0() == "a.out");
+    }
+    py::initialize_interpreter();
diff --git a/wrap/pybind11/tests/test_embed/test_interpreter.py b/wrap/pybind11/tests/test_embed/test_interpreter.py
index 6174ede446..5ab55a4b37 100644
--- a/wrap/pybind11/tests/test_embed/test_interpreter.py
+++ b/wrap/pybind11/tests/test_embed/test_interpreter.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import sys
 from widget_module import Widget
@@ -8,3 +10,6 @@ def __init__(self, message):
     def the_answer(self):
         return 42
+    def argv0(self):
+        return sys.argv[0]
diff --git a/wrap/pybind11/tests/test_embed/test_trampoline.py b/wrap/pybind11/tests/test_embed/test_trampoline.py
new file mode 100644
index 0000000000..87c8fa44c3
--- /dev/null
+++ b/wrap/pybind11/tests/test_embed/test_trampoline.py
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+import trampoline_module
+def func():
+    class Test(trampoline_module.test_override_cache_helper):
+        def func(self):
+            return 42
+    return Test()
+def func2():
+    class Test(trampoline_module.test_override_cache_helper):
+        pass
+    return Test()
diff --git a/wrap/pybind11/tests/test_enum.cpp b/wrap/pybind11/tests/test_enum.cpp
index 3153089208..40c48d412a 100644
--- a/wrap/pybind11/tests/test_enum.cpp
+++ b/wrap/pybind11/tests/test_enum.cpp
@@ -84,4 +84,65 @@ TEST_SUBMODULE(enums, m) {
             .value("ONE", SimpleEnum::THREE)
+    // test_enum_scalar
+    enum UnscopedUCharEnum : unsigned char {};
+    enum class ScopedShortEnum : short {};
+    enum class ScopedLongEnum : long {};
+    enum UnscopedUInt64Enum : std::uint64_t {};
+    static_assert(py::detail::all_of<
+        std::is_same<py::enum_<UnscopedUCharEnum>::Scalar, unsigned char>,
+        std::is_same<py::enum_<ScopedShortEnum>::Scalar, short>,
+        std::is_same<py::enum_<ScopedLongEnum>::Scalar, long>,
+        std::is_same<py::enum_<UnscopedUInt64Enum>::Scalar, std::uint64_t>
+    >::value, "Error during the deduction of enum's scalar type with normal integer underlying");
+    // test_enum_scalar_with_char_underlying
+    enum class ScopedCharEnum   : char     { Zero, Positive };
+    enum class ScopedWCharEnum  : wchar_t  { Zero, Positive };
+    enum class ScopedChar32Enum : char32_t { Zero, Positive };
+    enum class ScopedChar16Enum : char16_t { Zero, Positive };
+    // test the scalar of char type enums according to chapter 'Character types'
+    // from https://en.cppreference.com/w/cpp/language/types
+    static_assert(py::detail::any_of<
+        std::is_same<py::enum_<ScopedCharEnum>::Scalar, signed char>, // e.g. gcc on x86
+        std::is_same<py::enum_<ScopedCharEnum>::Scalar, unsigned char>  // e.g. arm linux
+    >::value, "char should be cast to either signed char or unsigned char");
+    static_assert(
+        sizeof(py::enum_<ScopedWCharEnum>::Scalar) == 2 ||
+        sizeof(py::enum_<ScopedWCharEnum>::Scalar) == 4
+    , "wchar_t should be either 16 bits (Windows) or 32 (everywhere else)");
+    static_assert(py::detail::all_of<
+        std::is_same<py::enum_<ScopedChar32Enum>::Scalar, std::uint_least32_t>,
+        std::is_same<py::enum_<ScopedChar16Enum>::Scalar, std::uint_least16_t>
+    >::value, "char32_t, char16_t (and char8_t)'s size, signedness, and alignment is determined");
+#if defined(PYBIND11_HAS_U8STRING)
+    enum class ScopedChar8Enum : char8_t { Zero, Positive };
+    static_assert(std::is_same<py::enum_<ScopedChar8Enum>::Scalar, unsigned char>::value);
+    // test_char_underlying_enum
+    py::enum_<ScopedCharEnum>(m, "ScopedCharEnum")
+        .value("Zero", ScopedCharEnum::Zero)
+        .value("Positive", ScopedCharEnum::Positive);
+    py::enum_<ScopedWCharEnum>(m, "ScopedWCharEnum")
+        .value("Zero", ScopedWCharEnum::Zero)
+        .value("Positive", ScopedWCharEnum::Positive);
+    py::enum_<ScopedChar32Enum>(m, "ScopedChar32Enum")
+        .value("Zero", ScopedChar32Enum::Zero)
+        .value("Positive", ScopedChar32Enum::Positive);
+    py::enum_<ScopedChar16Enum>(m, "ScopedChar16Enum")
+        .value("Zero", ScopedChar16Enum::Zero)
+        .value("Positive", ScopedChar16Enum::Positive);
+    // test_bool_underlying_enum
+    enum class ScopedBoolEnum : bool { FALSE, TRUE };
+    // bool is unsigned (std::is_signed returns false) and 1-byte long, so represented with u8
+    static_assert(std::is_same<py::enum_<ScopedBoolEnum>::Scalar, std::uint8_t>::value, "");
+    py::enum_<ScopedBoolEnum>(m, "ScopedBoolEnum")
+        .value("FALSE", ScopedBoolEnum::FALSE)
+        .value("TRUE", ScopedBoolEnum::TRUE);
diff --git a/wrap/pybind11/tests/test_enum.py b/wrap/pybind11/tests/test_enum.py
index bfaa193e9b..14c754e726 100644
--- a/wrap/pybind11/tests/test_enum.py
+++ b/wrap/pybind11/tests/test_enum.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 import pytest
+import env
 from pybind11_tests import enums as m
@@ -7,32 +9,50 @@ def test_unscoped_enum():
     assert str(m.UnscopedEnum.EOne) == "UnscopedEnum.EOne"
     assert str(m.UnscopedEnum.ETwo) == "UnscopedEnum.ETwo"
     assert str(m.EOne) == "UnscopedEnum.EOne"
+    assert repr(m.UnscopedEnum.EOne) == "<UnscopedEnum.EOne: 1>"
+    assert repr(m.UnscopedEnum.ETwo) == "<UnscopedEnum.ETwo: 2>"
+    assert repr(m.EOne) == "<UnscopedEnum.EOne: 1>"
     # name property
     assert m.UnscopedEnum.EOne.name == "EOne"
+    assert m.UnscopedEnum.EOne.value == 1
     assert m.UnscopedEnum.ETwo.name == "ETwo"
-    assert m.EOne.name == "EOne"
-    # name readonly
+    assert m.UnscopedEnum.ETwo.value == 2
+    assert m.EOne is m.UnscopedEnum.EOne
+    # name, value readonly
     with pytest.raises(AttributeError):
         m.UnscopedEnum.EOne.name = ""
-    # name returns a copy
-    foo = m.UnscopedEnum.EOne.name
-    foo = "bar"
+    with pytest.raises(AttributeError):
+        m.UnscopedEnum.EOne.value = 10
+    # name, value returns a copy
+    # TODO: Neither the name nor value tests actually check against aliasing.
+    # Use a mutable type that has reference semantics.
+    nonaliased_name = m.UnscopedEnum.EOne.name
+    nonaliased_name = "bar"  # noqa: F841
     assert m.UnscopedEnum.EOne.name == "EOne"
+    nonaliased_value = m.UnscopedEnum.EOne.value
+    nonaliased_value = 10  # noqa: F841
+    assert m.UnscopedEnum.EOne.value == 1
     # __members__ property
-    assert m.UnscopedEnum.__members__ == \
-        {"EOne": m.UnscopedEnum.EOne, "ETwo": m.UnscopedEnum.ETwo, "EThree": m.UnscopedEnum.EThree}
+    assert m.UnscopedEnum.__members__ == {
+        "EOne": m.UnscopedEnum.EOne,
+        "ETwo": m.UnscopedEnum.ETwo,
+        "EThree": m.UnscopedEnum.EThree,
+    }
     # __members__ readonly
     with pytest.raises(AttributeError):
         m.UnscopedEnum.__members__ = {}
     # __members__ returns a copy
-    foo = m.UnscopedEnum.__members__
-    foo["bar"] = "baz"
-    assert m.UnscopedEnum.__members__ == \
-        {"EOne": m.UnscopedEnum.EOne, "ETwo": m.UnscopedEnum.ETwo, "EThree": m.UnscopedEnum.EThree}
+    nonaliased_members = m.UnscopedEnum.__members__
+    nonaliased_members["bar"] = "baz"
+    assert m.UnscopedEnum.__members__ == {
+        "EOne": m.UnscopedEnum.EOne,
+        "ETwo": m.UnscopedEnum.ETwo,
+        "EThree": m.UnscopedEnum.EThree,
+    }
-    for docstring_line in '''An unscoped enumeration
+    for docstring_line in """An unscoped enumeration
@@ -40,7 +60,9 @@ def test_unscoped_enum():
   ETwo : Docstring for ETwo
-  EThree : Docstring for EThree'''.split('\n'):
+  EThree : Docstring for EThree""".split(
+        "\n"
+    ):
         assert docstring_line in m.UnscopedEnum.__doc__
     # Unscoped enums will accept ==/!= int comparisons
@@ -50,10 +72,10 @@ def test_unscoped_enum():
     assert y != 3
     assert 3 != y
     # Compare with None
-    assert (y != None)  # noqa: E711
+    assert y != None  # noqa: E711
     assert not (y == None)  # noqa: E711
     # Compare with an object
-    assert (y != object())
+    assert y != object()
     assert not (y == object())
     # Compare with string
     assert y != "2"
@@ -62,16 +84,16 @@ def test_unscoped_enum():
     assert not (y == "2")
     with pytest.raises(TypeError):
-        y < object()
+        y < object()  # noqa: B015
     with pytest.raises(TypeError):
-        y <= object()
+        y <= object()  # noqa: B015
     with pytest.raises(TypeError):
-        y > object()
+        y > object()  # noqa: B015
     with pytest.raises(TypeError):
-        y >= object()
+        y >= object()  # noqa: B015
     with pytest.raises(TypeError):
         y | object()
@@ -116,20 +138,20 @@ def test_scoped_enum():
     assert z != 3
     assert 3 != z
     # Compare with None
-    assert (z != None)  # noqa: E711
+    assert z != None  # noqa: E711
     assert not (z == None)  # noqa: E711
     # Compare with an object
-    assert (z != object())
+    assert z != object()
     assert not (z == object())
     # Scoped enums will *NOT* accept >, <, >= and <= int comparisons (Will throw exceptions)
     with pytest.raises(TypeError):
-        z > 3
+        z > 3  # noqa: B015
     with pytest.raises(TypeError):
-        z < 3
+        z < 3  # noqa: B015
     with pytest.raises(TypeError):
-        z >= 3
+        z >= 3  # noqa: B015
     with pytest.raises(TypeError):
-        z <= 3
+        z <= 3  # noqa: B015
     # order
     assert m.ScopedEnum.Two < m.ScopedEnum.Three
@@ -143,6 +165,8 @@ def test_scoped_enum():
 def test_implicit_conversion():
     assert str(m.ClassWithUnscopedEnum.EMode.EFirstMode) == "EMode.EFirstMode"
     assert str(m.ClassWithUnscopedEnum.EFirstMode) == "EMode.EFirstMode"
+    assert repr(m.ClassWithUnscopedEnum.EMode.EFirstMode) == "<EMode.EFirstMode: 1>"
+    assert repr(m.ClassWithUnscopedEnum.EFirstMode) == "<EMode.EFirstMode: 1>"
     f = m.ClassWithUnscopedEnum.test_function
     first = m.ClassWithUnscopedEnum.EFirstMode
@@ -167,7 +191,7 @@ def test_implicit_conversion():
     x[f(first)] = 3
     x[f(second)] = 4
     # Hashing test
-    assert str(x) == "{EMode.EFirstMode: 3, EMode.ESecondMode: 4}"
+    assert repr(x) == "{<EMode.EFirstMode: 1>: 3, <EMode.ESecondMode: 2>: 4}"
 def test_binary_operators():
@@ -195,13 +219,54 @@ def test_binary_operators():
 def test_enum_to_int():
+    m.test_enum_to_int(m.ScopedCharEnum.Positive)
+    m.test_enum_to_int(m.ScopedBoolEnum.TRUE)
+    m.test_enum_to_uint(m.ScopedCharEnum.Positive)
+    m.test_enum_to_uint(m.ScopedBoolEnum.TRUE)
+    m.test_enum_to_long_long(m.ScopedCharEnum.Positive)
+    m.test_enum_to_long_long(m.ScopedBoolEnum.TRUE)
 def test_duplicate_enum_name():
     with pytest.raises(ValueError) as excinfo:
     assert str(excinfo.value) == 'SimpleEnum: element "ONE" already exists!'
+def test_char_underlying_enum():  # Issue #1331/PR #1334:
+    assert type(m.ScopedCharEnum.Positive.__int__()) is int
+    assert int(m.ScopedChar16Enum.Zero) == 0
+    assert hash(m.ScopedChar32Enum.Positive) == 1
+    if env.PY2:
+        assert m.ScopedCharEnum.Positive.__getstate__() == 1  # long
+    else:
+        assert type(m.ScopedCharEnum.Positive.__getstate__()) is int
+    assert m.ScopedWCharEnum(1) == m.ScopedWCharEnum.Positive
+    with pytest.raises(TypeError):
+        # Even if the underlying type is char, only an int can be used to construct the enum:
+        m.ScopedCharEnum("0")
+def test_bool_underlying_enum():
+    assert type(m.ScopedBoolEnum.TRUE.__int__()) is int
+    assert int(m.ScopedBoolEnum.FALSE) == 0
+    assert hash(m.ScopedBoolEnum.TRUE) == 1
+    if env.PY2:
+        assert m.ScopedBoolEnum.TRUE.__getstate__() == 1  # long
+    else:
+        assert type(m.ScopedBoolEnum.TRUE.__getstate__()) is int
+    assert m.ScopedBoolEnum(1) == m.ScopedBoolEnum.TRUE
+    # Enum could construct with a bool
+    # (bool is a strict subclass of int, and False will be converted to 0)
+    assert m.ScopedBoolEnum(False) == m.ScopedBoolEnum.FALSE
+def test_docstring_signatures():
+    for enum_type in [m.ScopedEnum, m.UnscopedEnum]:
+        for attr in enum_type.__dict__.values():
+            # Issue #2623/PR #2637: Add argument names to enum_ methods
+            assert "arg0" not in (attr.__doc__ or "")
diff --git a/wrap/pybind11/tests/test_eval.cpp b/wrap/pybind11/tests/test_eval.cpp
index e094821911..29366f6798 100644
--- a/wrap/pybind11/tests/test_eval.cpp
+++ b/wrap/pybind11/tests/test_eval.cpp
@@ -9,12 +9,14 @@
 #include <pybind11/eval.h>
 #include "pybind11_tests.h"
+#include <utility>
 TEST_SUBMODULE(eval_, m) {
     // test_evals
-    auto global = py::dict(py::module::import("__main__").attr("__dict__"));
+    auto global = py::dict(py::module_::import("__main__").attr("__dict__"));
     m.def("test_eval_statements", [global]() {
         auto local = py::dict();
@@ -64,10 +66,10 @@ TEST_SUBMODULE(eval_, m) {
         auto local = py::dict();
         local["y"] = py::int_(43);
-        int val_out;
+        int val_out = 0;
         local["call_test2"] = py::cpp_function([&](int value) { val_out = value; });
-        auto result = py::eval_file(filename, global, local);
+        auto result = py::eval_file(std::move(filename), global, local);
         return val_out == 43 && result.is_none();
@@ -88,4 +90,30 @@ TEST_SUBMODULE(eval_, m) {
         return false;
+    // test_eval_empty_globals
+    m.def("eval_empty_globals", [](py::object global) {
+        if (global.is_none())
+            global = py::dict();
+        auto int_class = py::eval("isinstance(42, int)", global);
+        return global;
+    });
+    // test_eval_closure
+    m.def("test_eval_closure", []() {
+        py::dict global;
+        global["closure_value"] = 42;
+        py::dict local;
+        local["closure_value"] = 0;
+        py::exec(R"(
+            local_value = closure_value
+            def func_global():
+                return closure_value
+            def func_local():
+                return local_value
+            )", global, local);
+        return std::make_pair(global, local);
+    });
diff --git a/wrap/pybind11/tests/test_eval.py b/wrap/pybind11/tests/test_eval.py
index b6f9d1881d..1bbd991bc0 100644
--- a/wrap/pybind11/tests/test_eval.py
+++ b/wrap/pybind11/tests/test_eval.py
@@ -4,7 +4,6 @@
 import pytest
 import env  # noqa: F401
 from pybind11_tests import eval_ as m
@@ -25,3 +24,28 @@ def test_eval_file():
     assert m.test_eval_file(filename)
     assert m.test_eval_file_failure()
+def test_eval_empty_globals():
+    assert "__builtins__" in m.eval_empty_globals(None)
+    g = {}
+    assert "__builtins__" in m.eval_empty_globals(g)
+    assert "__builtins__" in g
+def test_eval_closure():
+    global_, local = m.test_eval_closure()
+    assert global_["closure_value"] == 42
+    assert local["closure_value"] == 0
+    assert "local_value" not in global_
+    assert local["local_value"] == 0
+    assert "func_global" not in global_
+    assert local["func_global"]() == 42
+    assert "func_local" not in global_
+    with pytest.raises(NameError):
+        local["func_local"]()
diff --git a/wrap/pybind11/tests/test_eval_call.py b/wrap/pybind11/tests/test_eval_call.py
index d42a0a6d30..373b67bac8 100644
--- a/wrap/pybind11/tests/test_eval_call.py
+++ b/wrap/pybind11/tests/test_eval_call.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
 # This file is called from 'test_eval.py'
-if 'call_test2' in locals():
+if "call_test2" in locals():
     call_test2(y)  # noqa: F821 undefined name
diff --git a/wrap/pybind11/tests/test_exceptions.cpp b/wrap/pybind11/tests/test_exceptions.cpp
index 6187f2efba..3aa9673828 100644
--- a/wrap/pybind11/tests/test_exceptions.cpp
+++ b/wrap/pybind11/tests/test_exceptions.cpp
@@ -6,8 +6,14 @@
     All rights reserved. Use of this source code is governed by a
     BSD-style license that can be found in the LICENSE file.
+#include "test_exceptions.h"
+#include "local_bindings.h"
 #include "pybind11_tests.h"
+#include <exception>
+#include <stdexcept>
+#include <utility>
 // A type that should be raised as an exception in Python
 class MyException : public std::exception {
@@ -32,6 +38,13 @@ class MyException3 {
     explicit MyException3(const char * m) : message{m} {}
     virtual const char * what() const noexcept {return message.c_str();}
+    // Rule of 5 BEGIN: to preempt compiler warnings.
+    MyException3(const MyException3&) = default;
+    MyException3(MyException3&&) = default;
+    MyException3& operator=(const MyException3&) = default;
+    MyException3& operator=(MyException3&&) = default;
+    virtual ~MyException3() = default;
+    // Rule of 5 END.
     std::string message = "";
@@ -58,8 +71,19 @@ class MyException5_1 : public MyException5 {
     using MyException5::MyException5;
+// Exception that will be caught via the module local translator.
+class MyException6 : public std::exception {
+    explicit MyException6(const char * m) : message{m} {}
+    const char * what() const noexcept override {return message.c_str();}
+    std::string message = "";
 struct PythonCallInDestructor {
-    PythonCallInDestructor(const py::dict &d) : d(d) {}
+    explicit PythonCallInDestructor(const py::dict &d) : d(d) {}
     ~PythonCallInDestructor() { d["good"] = true; }
     py::dict d;
@@ -68,7 +92,7 @@ struct PythonCallInDestructor {
 struct PythonAlreadySetInDestructor {
-    PythonAlreadySetInDestructor(const py::str &s) : s(s) {}
+    explicit PythonAlreadySetInDestructor(const py::str &s) : s(s) {}
     ~PythonAlreadySetInDestructor() {
         py::dict foo;
         try {
@@ -83,7 +107,6 @@ struct PythonAlreadySetInDestructor {
     py::str s;
 TEST_SUBMODULE(exceptions, m) {
     m.def("throw_std_exception", []() {
         throw std::runtime_error("This exception was intentionally thrown.");
@@ -128,14 +151,29 @@ TEST_SUBMODULE(exceptions, m) {
     // A slightly more complicated one that declares MyException5_1 as a subclass of MyException5
     py::register_exception<MyException5_1>(m, "MyException5_1", ex5.ptr());
+    //py::register_local_exception<LocalSimpleException>(m, "LocalSimpleException")
+    py::register_local_exception_translator([](std::exception_ptr p) {
+      try {
+          if (p) {
+            std::rethrow_exception(p);
+          }
+      } catch (const MyException6 &e) {
+        PyErr_SetString(PyExc_RuntimeError, e.what());
+      }
+    });
     m.def("throws1", []() { throw MyException("this error should go to a custom type"); });
     m.def("throws2", []() { throw MyException2("this error should go to a standard Python exception"); });
     m.def("throws3", []() { throw MyException3("this error cannot be translated"); });
     m.def("throws4", []() { throw MyException4("this error is rethrown"); });
     m.def("throws5", []() { throw MyException5("this is a helper-defined translated exception"); });
     m.def("throws5_1", []() { throw MyException5_1("MyException5 subclass"); });
+    m.def("throws6", []() { throw MyException6("MyException6 only handled in this module"); });
     m.def("throws_logic_error", []() { throw std::logic_error("this error should fall through to the standard handler"); });
-    m.def("throws_overflow_error", []() {throw std::overflow_error(""); });
+    m.def("throws_overflow_error", []() { throw std::overflow_error(""); });
+    m.def("throws_local_error", []() { throw LocalException("never caught"); });
+    m.def("throws_local_simple_error", []() { throw LocalSimpleException("this mod"); });
     m.def("exception_matches", []() {
         py::dict foo;
         try {
@@ -163,7 +201,7 @@ TEST_SUBMODULE(exceptions, m) {
     m.def("modulenotfound_exception_matches_base", []() {
         try {
             // On Python >= 3.6, this raises a ModuleNotFoundError, a subclass of ImportError
-            py::module::import("nonexistent");
+            py::module_::import("nonexistent");
         catch (py::error_already_set &ex) {
             if (!ex.matches(PyExc_ImportError)) throw;
@@ -191,34 +229,65 @@ TEST_SUBMODULE(exceptions, m) {
         throw py::error_already_set();
-    m.def("python_call_in_destructor", [](py::dict d) {
+    m.def("python_call_in_destructor", [](const py::dict &d) {
+        bool retval = false;
         try {
             PythonCallInDestructor set_dict_in_destructor(d);
             PyErr_SetString(PyExc_ValueError, "foo");
             throw py::error_already_set();
         } catch (const py::error_already_set&) {
-            return true;
+            retval = true;
-        return false;
+        return retval;
-    m.def("python_alreadyset_in_destructor", [](py::str s) {
+    m.def("python_alreadyset_in_destructor", [](const py::str &s) {
         PythonAlreadySetInDestructor alreadyset_in_destructor(s);
         return true;
     // test_nested_throws
-    m.def("try_catch", [m](py::object exc_type, py::function f, py::args args) {
-        try { f(*args); }
-        catch (py::error_already_set &ex) {
-            if (ex.matches(exc_type))
-                py::print(ex.what());
-            else
-                throw;
-        }
-    });
+    m.def("try_catch",
+          [m](const py::object &exc_type, const py::function &f, const py::args &args) {
+              try {
+                  f(*args);
+              } catch (py::error_already_set &ex) {
+                  if (ex.matches(exc_type))
+                      py::print(ex.what());
+                  else
+                      throw;
+              }
+          });
     // Test repr that cannot be displayed
     m.def("simple_bool_passthrough", [](bool x) {return x;});
+    m.def("throw_should_be_translated_to_key_error", []() { throw shared_exception(); });
+#if PY_VERSION_HEX >= 0x03030000
+    m.def("raise_from", []() {
+        PyErr_SetString(PyExc_ValueError, "inner");
+        py::raise_from(PyExc_ValueError, "outer");
+        throw py::error_already_set();
+    });
+    m.def("raise_from_already_set", []() {
+        try {
+            PyErr_SetString(PyExc_ValueError, "inner");
+            throw py::error_already_set();
+        } catch (py::error_already_set& e) {
+            py::raise_from(e, PyExc_ValueError, "outer");
+            throw py::error_already_set();
+        }
+    });
+    m.def("throw_nested_exception", []() {
+        try {
+            throw std::runtime_error("Inner Exception");
+        } catch (const std::runtime_error &) {
+            std::throw_with_nested(std::runtime_error("Outer Exception"));
+        }
+    });
diff --git a/wrap/pybind11/tests/test_exceptions.h b/wrap/pybind11/tests/test_exceptions.h
new file mode 100644
index 0000000000..9d428312eb
--- /dev/null
+++ b/wrap/pybind11/tests/test_exceptions.h
@@ -0,0 +1,12 @@
+#pragma once
+#include "pybind11_tests.h"
+#include <stdexcept>
+// shared exceptions for cross_module_tests
+class PYBIND11_EXPORT_EXCEPTION shared_exception : public pybind11::builtin_exception {
+    using builtin_exception::builtin_exception;
+    explicit shared_exception() : shared_exception("") {}
+    void set_error() const override { PyErr_SetString(PyExc_RuntimeError, what()); }
diff --git a/wrap/pybind11/tests/test_exceptions.py b/wrap/pybind11/tests/test_exceptions.py
index 7d7088d00b..d698b1312e 100644
--- a/wrap/pybind11/tests/test_exceptions.py
+++ b/wrap/pybind11/tests/test_exceptions.py
@@ -3,8 +3,9 @@
 import pytest
-from pybind11_tests import exceptions as m
+import env
 import pybind11_cross_module_tests as cm
+from pybind11_tests import exceptions as m
 def test_std_exception(msg):
@@ -23,7 +24,23 @@ def test_error_already_set(msg):
     assert msg(excinfo.value) == "foo"
-def test_cross_module_exceptions():
+def test_raise_from(msg):
+    with pytest.raises(ValueError) as excinfo:
+        m.raise_from()
+    assert msg(excinfo.value) == "outer"
+    assert msg(excinfo.value.__cause__) == "inner"
+def test_raise_from_already_set(msg):
+    with pytest.raises(ValueError) as excinfo:
+        m.raise_from_already_set()
+    assert msg(excinfo.value) == "outer"
+    assert msg(excinfo.value.__cause__) == "inner"
+def test_cross_module_exceptions(msg):
     with pytest.raises(RuntimeError) as excinfo:
     assert str(excinfo.value) == "My runtime error"
@@ -43,6 +60,27 @@ def test_cross_module_exceptions():
     with pytest.raises(StopIteration) as excinfo:
+    with pytest.raises(cm.LocalSimpleException) as excinfo:
+        cm.throw_local_simple_error()
+    assert msg(excinfo.value) == "external mod"
+    with pytest.raises(KeyError) as excinfo:
+        cm.throw_local_error()
+    # KeyError is a repr of the key, so it has an extra set of quotes
+    assert str(excinfo.value) == "'just local'"
+    "env.PYPY and env.MACOS",
+    raises=RuntimeError,
+    reason="Expected failure with PyPy and libc++ (Issue #2847 & PR #2999)",
+def test_cross_module_exception_translator():
+    with pytest.raises(KeyError):
+        # translator registered in cross_module_tests
+        m.throw_should_be_translated_to_key_error()
 def test_python_call_in_catch():
     d = {}
@@ -50,31 +88,44 @@ def test_python_call_in_catch():
     assert d["good"] is True
+def ignore_pytest_unraisable_warning(f):
+    unraisable = "PytestUnraisableExceptionWarning"
+    if hasattr(pytest, unraisable):  # Python >= 3.8 and pytest >= 6
+        dec = pytest.mark.filterwarnings("ignore::pytest.{}".format(unraisable))
+        return dec(f)
+    else:
+        return f
+# TODO: find out why this fails on PyPy, https://foss.heptapod.net/pypy/pypy/-/issues/3583
+@pytest.mark.xfail(env.PYPY, reason="Failure on PyPy 3.8 (7.3.7)", strict=False)
 def test_python_alreadyset_in_destructor(monkeypatch, capsys):
     hooked = False
     triggered = [False]  # mutable, so Python 2.7 closure can modify it
-    if hasattr(sys, 'unraisablehook'):  # Python 3.8+
+    if hasattr(sys, "unraisablehook"):  # Python 3.8+
         hooked = True
-        default_hook = sys.unraisablehook
+        # Don't take `sys.unraisablehook`, as that's overwritten by pytest
+        default_hook = sys.__unraisablehook__
         def hook(unraisable_hook_args):
             exc_type, exc_value, exc_tb, err_msg, obj = unraisable_hook_args
-            if obj == 'already_set demo':
+            if obj == "already_set demo":
                 triggered[0] = True
         # Use monkeypatch so pytest can apply and remove the patch as appropriate
-        monkeypatch.setattr(sys, 'unraisablehook', hook)
+        monkeypatch.setattr(sys, "unraisablehook", hook)
-    assert m.python_alreadyset_in_destructor('already_set demo') is True
+    assert m.python_alreadyset_in_destructor("already_set demo") is True
     if hooked:
         assert triggered[0] is True
     _, captured_stderr = capsys.readouterr()
     # Error message is different in Python 2 and 3, check for words that appear in both
-    assert 'ignored' in captured_stderr and 'already_set demo' in captured_stderr
+    assert "ignored" in captured_stderr and "already_set demo" in captured_stderr
 def test_exception_matches():
@@ -107,7 +158,9 @@ def test_custom(msg):
     # Can we fall-through to the default handler?
     with pytest.raises(RuntimeError) as excinfo:
-    assert msg(excinfo.value) == "this error should fall through to the standard handler"
+    assert (
+        msg(excinfo.value) == "this error should fall through to the standard handler"
+    )
     # OverFlow error translation.
     with pytest.raises(OverflowError) as excinfo:
@@ -166,7 +219,13 @@ def pycatch(exctype, f, *args):
     # C++ -> Python -> C++ -> Python
     with capture:
-            m.MyException5, pycatch, m.MyException, m.try_catch, m.MyException, throw_myex5)
+            m.MyException5,
+            pycatch,
+            m.MyException,
+            m.try_catch,
+            m.MyException,
+            throw_myex5,
+        )
     assert str(capture).startswith("MyException5: nested error 5")
     # C++ -> Python -> C++
@@ -180,12 +239,37 @@ def pycatch(exctype, f, *args):
     assert str(excinfo.value) == "this is a helper-defined translated exception"
+def test_throw_nested_exception():
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throw_nested_exception()
+    assert str(excinfo.value) == "Outer Exception"
+    assert str(excinfo.value.__cause__) == "Inner Exception"
 # This can often happen if you wrap a pybind11 class in a Python wrapper
 def test_invalid_repr():
     class MyRepr(object):
         def __repr__(self):
             raise AttributeError("Example error")
     with pytest.raises(TypeError):
+def test_local_translator(msg):
+    """Tests that a local translator works and that the local translator from
+    the cross module is not applied"""
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throws6()
+    assert msg(excinfo.value) == "MyException6 only handled in this module"
+    with pytest.raises(RuntimeError) as excinfo:
+        m.throws_local_error()
+    assert not isinstance(excinfo.value, KeyError)
+    assert msg(excinfo.value) == "never caught"
+    with pytest.raises(Exception) as excinfo:
+        m.throws_local_simple_error()
+    assert not isinstance(excinfo.value, cm.LocalSimpleException)
+    assert msg(excinfo.value) == "this mod"
diff --git a/wrap/pybind11/tests/test_factory_constructors.cpp b/wrap/pybind11/tests/test_factory_constructors.cpp
index 2368dabb8d..660e2896af 100644
--- a/wrap/pybind11/tests/test_factory_constructors.cpp
+++ b/wrap/pybind11/tests/test_factory_constructors.cpp
@@ -8,35 +8,45 @@
     BSD-style license that can be found in the LICENSE file.
-#include "pybind11_tests.h"
 #include "constructor_stats.h"
+#include "pybind11_tests.h"
 #include <cmath>
 #include <new>
+#include <utility>
 // Classes for testing python construction via C++ factory function:
 // Not publicly constructible, copyable, or movable:
 class TestFactory1 {
     friend class TestFactoryHelper;
     TestFactory1() : value("(empty)") { print_default_created(this); }
-    TestFactory1(int v) : value(std::to_string(v)) { print_created(this, value); }
-    TestFactory1(std::string v) : value(std::move(v)) { print_created(this, value); }
+    explicit TestFactory1(int v) : value(std::to_string(v)) { print_created(this, value); }
+    explicit TestFactory1(std::string v) : value(std::move(v)) { print_created(this, value); }
+    std::string value;
     TestFactory1(TestFactory1 &&) = delete;
     TestFactory1(const TestFactory1 &) = delete;
     TestFactory1 &operator=(TestFactory1 &&) = delete;
     TestFactory1 &operator=(const TestFactory1 &) = delete;
-    std::string value;
     ~TestFactory1() { print_destroyed(this); }
 // Non-public construction, but moveable:
 class TestFactory2 {
     friend class TestFactoryHelper;
     TestFactory2() : value("(empty2)") { print_default_created(this); }
-    TestFactory2(int v) : value(std::to_string(v)) { print_created(this, value); }
-    TestFactory2(std::string v) : value(std::move(v)) { print_created(this, value); }
+    explicit TestFactory2(int v) : value(std::to_string(v)) { print_created(this, value); }
+    explicit TestFactory2(std::string v) : value(std::move(v)) { print_created(this, value); }
-    TestFactory2(TestFactory2 &&m) { value = std::move(m.value); print_move_created(this); }
-    TestFactory2 &operator=(TestFactory2 &&m) { value = std::move(m.value); print_move_assigned(this); return *this; }
+    TestFactory2(TestFactory2 &&m) noexcept {
+        value = std::move(m.value);
+        print_move_created(this);
+    }
+    TestFactory2 &operator=(TestFactory2 &&m) noexcept {
+        value = std::move(m.value);
+        print_move_assigned(this);
+        return *this;
+    }
     std::string value;
     ~TestFactory2() { print_destroyed(this); }
@@ -45,11 +55,19 @@ class TestFactory3 {
     friend class TestFactoryHelper;
     TestFactory3() : value("(empty3)") { print_default_created(this); }
-    TestFactory3(int v) : value(std::to_string(v)) { print_created(this, value); }
+    explicit TestFactory3(int v) : value(std::to_string(v)) { print_created(this, value); }
-    TestFactory3(std::string v) : value(std::move(v)) { print_created(this, value); }
-    TestFactory3(TestFactory3 &&m) { value = std::move(m.value); print_move_created(this); }
-    TestFactory3 &operator=(TestFactory3 &&m) { value = std::move(m.value); print_move_assigned(this); return *this; }
+    explicit TestFactory3(std::string v) : value(std::move(v)) { print_created(this, value); }
+    TestFactory3(TestFactory3 &&m) noexcept {
+        value = std::move(m.value);
+        print_move_created(this);
+    }
+    TestFactory3 &operator=(TestFactory3 &&m) noexcept {
+        value = std::move(m.value);
+        print_move_assigned(this);
+        return *this;
+    }
     std::string value;
     virtual ~TestFactory3() { print_destroyed(this); }
@@ -57,13 +75,13 @@ class TestFactory3 {
 class TestFactory4 : public TestFactory3 {
     TestFactory4() : TestFactory3() { print_default_created(this); }
-    TestFactory4(int v) : TestFactory3(v) { print_created(this, v); }
+    explicit TestFactory4(int v) : TestFactory3(v) { print_created(this, v); }
     ~TestFactory4() override { print_destroyed(this); }
 // Another class for an invalid downcast test
 class TestFactory5 : public TestFactory3 {
-    TestFactory5(int i) : TestFactory3(i) { print_created(this, i); }
+    explicit TestFactory5(int i) : TestFactory3(i) { print_created(this, i); }
     ~TestFactory5() override { print_destroyed(this); }
@@ -72,22 +90,35 @@ class TestFactory6 {
     int value;
     bool alias = false;
-    TestFactory6(int i) : value{i} { print_created(this, i); }
-    TestFactory6(TestFactory6 &&f) { print_move_created(this); value = f.value; alias = f.alias; }
+    explicit TestFactory6(int i) : value{i} { print_created(this, i); }
+    TestFactory6(TestFactory6 &&f) noexcept {
+        print_move_created(this);
+        value = f.value;
+        alias = f.alias;
+    }
     TestFactory6(const TestFactory6 &f) { print_copy_created(this); value = f.value; alias = f.alias; }
     virtual ~TestFactory6() { print_destroyed(this); }
     virtual int get() { return value; }
-    bool has_alias() { return alias; }
+    bool has_alias() const { return alias; }
 class PyTF6 : public TestFactory6 {
     // Special constructor that allows the factory to construct a PyTF6 from a TestFactory6 only
     // when an alias is needed:
-    PyTF6(TestFactory6 &&base) : TestFactory6(std::move(base)) { alias = true; print_created(this, "move", value); }
-    PyTF6(int i) : TestFactory6(i) { alias = true; print_created(this, i); }
-    PyTF6(PyTF6 &&f) : TestFactory6(std::move(f)) { print_move_created(this); }
+    explicit PyTF6(TestFactory6 &&base) : TestFactory6(std::move(base)) {
+        alias = true;
+        print_created(this, "move", value);
+    }
+    explicit PyTF6(int i) : TestFactory6(i) {
+        alias = true;
+        print_created(this, i);
+    }
+    PyTF6(PyTF6 &&f) noexcept : TestFactory6(std::move(f)) { print_move_created(this); }
     PyTF6(const PyTF6 &f) : TestFactory6(f) { print_copy_created(this); }
-    PyTF6(std::string s) : TestFactory6((int) s.size()) { alias = true; print_created(this, s); }
+    explicit PyTF6(std::string s) : TestFactory6((int) s.size()) {
+        alias = true;
+        print_created(this, s);
+    }
     ~PyTF6() override { print_destroyed(this); }
     int get() override { PYBIND11_OVERRIDE(int, TestFactory6, get, /*no args*/); }
@@ -97,17 +128,24 @@ class TestFactory7 {
     int value;
     bool alias = false;
-    TestFactory7(int i) : value{i} { print_created(this, i); }
-    TestFactory7(TestFactory7 &&f) { print_move_created(this); value = f.value; alias = f.alias; }
+    explicit TestFactory7(int i) : value{i} { print_created(this, i); }
+    TestFactory7(TestFactory7 &&f) noexcept {
+        print_move_created(this);
+        value = f.value;
+        alias = f.alias;
+    }
     TestFactory7(const TestFactory7 &f) { print_copy_created(this); value = f.value; alias = f.alias; }
     virtual ~TestFactory7() { print_destroyed(this); }
     virtual int get() { return value; }
-    bool has_alias() { return alias; }
+    bool has_alias() const { return alias; }
 class PyTF7 : public TestFactory7 {
-    PyTF7(int i) : TestFactory7(i) { alias = true; print_created(this, i); }
-    PyTF7(PyTF7 &&f) : TestFactory7(std::move(f)) { print_move_created(this); }
+    explicit PyTF7(int i) : TestFactory7(i) {
+        alias = true;
+        print_created(this, i);
+    }
+    PyTF7(PyTF7 &&f) noexcept : TestFactory7(std::move(f)) { print_move_created(this); }
     PyTF7(const PyTF7 &f) : TestFactory7(f) { print_copy_created(this); }
     ~PyTF7() override { print_destroyed(this); }
     int get() override { PYBIND11_OVERRIDE(int, TestFactory7, get, /*no args*/); }
@@ -122,7 +160,9 @@ class TestFactoryHelper {
     // Holder:
     static std::unique_ptr<TestFactory1> construct1(int a) { return std::unique_ptr<TestFactory1>(new TestFactory1(a)); }
     // pointer again
-    static TestFactory1 *construct1_string(std::string a) { return new TestFactory1(a); }
+    static TestFactory1 *construct1_string(std::string a) {
+        return new TestFactory1(std::move(a));
+    }
     // Moveable type:
     // pointer:
@@ -130,7 +170,7 @@ class TestFactoryHelper {
     // holder:
     static std::unique_ptr<TestFactory2> construct2(int a) { return std::unique_ptr<TestFactory2>(new TestFactory2(a)); }
     // by value moving:
-    static TestFactory2 construct2(std::string a) { return TestFactory2(a); }
+    static TestFactory2 construct2(std::string a) { return TestFactory2(std::move(a)); }
     // shared_ptr holder type:
     // pointer:
@@ -142,7 +182,7 @@ class TestFactoryHelper {
 TEST_SUBMODULE(factory_constructors, m) {
     // Define various trivial types to allow simpler overload resolution:
-    py::module m_tag = m.def_submodule("tag");
+    py::module_ m_tag = m.def_submodule("tag");
 #define MAKE_TAG_TYPE(Name) \
     struct Name##_tag {}; \
     py::class_<Name##_tag>(m_tag, #Name "_tag").def(py::init<>()); \
@@ -173,21 +213,27 @@ TEST_SUBMODULE(factory_constructors, m) {
     py::class_<TestFactory2>(m, "TestFactory2")
         .def(py::init([](pointer_tag, int v) { return TestFactoryHelper::construct2(v); }))
-        .def(py::init([](unique_ptr_tag, std::string v) { return TestFactoryHelper::construct2(v); }))
+        .def(py::init([](unique_ptr_tag, std::string v) {
+            return TestFactoryHelper::construct2(std::move(v));
+        }))
         .def(py::init([](move_tag) { return TestFactoryHelper::construct2(); }))
-        .def_readwrite("value", &TestFactory2::value)
-        ;
+        .def_readwrite("value", &TestFactory2::value);
     // Stateful & reused:
     int c = 1;
     auto c4a = [c](pointer_tag, TF4_tag, int a) { (void) c; return new TestFactory4(a);};
     // test_init_factory_basic, test_init_factory_casting
-    py::class_<TestFactory3, std::shared_ptr<TestFactory3>>(m, "TestFactory3")
+    py::class_<TestFactory3, std::shared_ptr<TestFactory3>> pyTestFactory3(m, "TestFactory3");
+    pyTestFactory3
         .def(py::init([](pointer_tag, int v) { return TestFactoryHelper::construct3(v); }))
-        .def(py::init([](shared_ptr_tag) { return TestFactoryHelper::construct3(); }))
-        .def("__init__", [](TestFactory3 &self, std::string v) { new (&self) TestFactory3(v); }) // placement-new ctor
+        .def(py::init([](shared_ptr_tag) { return TestFactoryHelper::construct3(); }));
+    ignoreOldStyleInitWarnings([&pyTestFactory3]() {
+        pyTestFactory3.def("__init__", [](TestFactory3 &self, std::string v) {
+            new (&self) TestFactory3(std::move(v));
+        }); // placement-new ctor
+    });
+    pyTestFactory3
         // factories returning a derived type:
         .def(py::init(c4a)) // derived ptr
         .def(py::init([](pointer_tag, TF5_tag, int a) { return new TestFactory5(a); }))
@@ -216,58 +262,60 @@ TEST_SUBMODULE(factory_constructors, m) {
     py::class_<TestFactory6, PyTF6>(m, "TestFactory6")
         .def(py::init([](base_tag, int i) { return TestFactory6(i); }))
         .def(py::init([](alias_tag, int i) { return PyTF6(i); }))
-        .def(py::init([](alias_tag, std::string s) { return PyTF6(s); }))
+        .def(py::init([](alias_tag, std::string s) { return PyTF6(std::move(s)); }))
         .def(py::init([](alias_tag, pointer_tag, int i) { return new PyTF6(i); }))
         .def(py::init([](base_tag, pointer_tag, int i) { return new TestFactory6(i); }))
-        .def(py::init([](base_tag, alias_tag, pointer_tag, int i) { return (TestFactory6 *) new PyTF6(i); }))
+        .def(py::init(
+            [](base_tag, alias_tag, pointer_tag, int i) { return (TestFactory6 *) new PyTF6(i); }))
         .def("get", &TestFactory6::get)
         .def("has_alias", &TestFactory6::has_alias)
-        .def_static("get_cstats", &ConstructorStats::get<TestFactory6>, py::return_value_policy::reference)
-        .def_static("get_alias_cstats", &ConstructorStats::get<PyTF6>, py::return_value_policy::reference)
-        ;
+        .def_static(
+            "get_cstats", &ConstructorStats::get<TestFactory6>, py::return_value_policy::reference)
+        .def_static(
+            "get_alias_cstats", &ConstructorStats::get<PyTF6>, py::return_value_policy::reference);
     // test_init_factory_dual
     // Separate alias constructor testing
     py::class_<TestFactory7, PyTF7, std::shared_ptr<TestFactory7>>(m, "TestFactory7")
-        .def(py::init(
-            [](int i) { return TestFactory7(i); },
-            [](int i) { return PyTF7(i); }))
-        .def(py::init(
-            [](pointer_tag, int i) { return new TestFactory7(i); },
-            [](pointer_tag, int i) { return new PyTF7(i); }))
-        .def(py::init(
-            [](mixed_tag, int i) { return new TestFactory7(i); },
-            [](mixed_tag, int i) { return PyTF7(i); }))
-        .def(py::init(
-            [](mixed_tag, std::string s) { return TestFactory7((int) s.size()); },
-            [](mixed_tag, std::string s) { return new PyTF7((int) s.size()); }))
-        .def(py::init(
-            [](base_tag, pointer_tag, int i) { return new TestFactory7(i); },
-            [](base_tag, pointer_tag, int i) { return (TestFactory7 *) new PyTF7(i); }))
-        .def(py::init(
-            [](alias_tag, pointer_tag, int i) { return new PyTF7(i); },
-            [](alias_tag, pointer_tag, int i) { return new PyTF7(10*i); }))
+        .def(py::init([](int i) { return TestFactory7(i); }, [](int i) { return PyTF7(i); }))
+        .def(py::init([](pointer_tag, int i) { return new TestFactory7(i); },
+                      [](pointer_tag, int i) { return new PyTF7(i); }))
+        .def(py::init([](mixed_tag, int i) { return new TestFactory7(i); },
+                      [](mixed_tag, int i) { return PyTF7(i); }))
+        .def(py::init([](mixed_tag, const std::string &s) { return TestFactory7((int) s.size()); },
+                      [](mixed_tag, const std::string &s) { return new PyTF7((int) s.size()); }))
+        .def(py::init([](base_tag, pointer_tag, int i) { return new TestFactory7(i); },
+                      [](base_tag, pointer_tag, int i) { return (TestFactory7 *) new PyTF7(i); }))
+        .def(py::init([](alias_tag, pointer_tag, int i) { return new PyTF7(i); },
+                      [](alias_tag, pointer_tag, int i) { return new PyTF7(10 * i); }))
             [](shared_ptr_tag, base_tag, int i) { return std::make_shared<TestFactory7>(i); },
-            [](shared_ptr_tag, base_tag, int i) { auto *p = new PyTF7(i); return std::shared_ptr<TestFactory7>(p); }))
-        .def(py::init(
-            [](shared_ptr_tag, invalid_base_tag, int i) { return std::make_shared<TestFactory7>(i); },
-            [](shared_ptr_tag, invalid_base_tag, int i) { return std::make_shared<TestFactory7>(i); })) // <-- invalid alias factory
+            [](shared_ptr_tag, base_tag, int i) {
+                auto *p = new PyTF7(i);
+                return std::shared_ptr<TestFactory7>(p);
+            }))
+        .def(py::init([](shared_ptr_tag,
+                         invalid_base_tag,
+                         int i) { return std::make_shared<TestFactory7>(i); },
+                      [](shared_ptr_tag, invalid_base_tag, int i) {
+                          return std::make_shared<TestFactory7>(i);
+                      })) // <-- invalid alias factory
         .def("get", &TestFactory7::get)
         .def("has_alias", &TestFactory7::has_alias)
-        .def_static("get_cstats", &ConstructorStats::get<TestFactory7>, py::return_value_policy::reference)
-        .def_static("get_alias_cstats", &ConstructorStats::get<PyTF7>, py::return_value_policy::reference)
-        ;
+        .def_static(
+            "get_cstats", &ConstructorStats::get<TestFactory7>, py::return_value_policy::reference)
+        .def_static(
+            "get_alias_cstats", &ConstructorStats::get<PyTF7>, py::return_value_policy::reference);
     // test_placement_new_alternative
     // Class with a custom new operator but *without* a placement new operator (issue #948)
     class NoPlacementNew {
-        NoPlacementNew(int i) : i(i) { }
+        explicit NoPlacementNew(int i) : i(i) {}
         static void *operator new(std::size_t s) {
             auto *p = ::operator new(s);
             py::print("operator new called, returning", reinterpret_cast<uintptr_t>(p));
@@ -291,8 +339,8 @@ TEST_SUBMODULE(factory_constructors, m) {
     // Class that has verbose operator_new/operator_delete calls
     struct NoisyAlloc {
         NoisyAlloc(const NoisyAlloc &) = default;
-        NoisyAlloc(int i) { py::print(py::str("NoisyAlloc(int {})").format(i)); }
-        NoisyAlloc(double d) { py::print(py::str("NoisyAlloc(double {})").format(d)); }
+        explicit NoisyAlloc(int i) { py::print(py::str("NoisyAlloc(int {})").format(i)); }
+        explicit NoisyAlloc(double d) { py::print(py::str("NoisyAlloc(double {})").format(d)); }
         ~NoisyAlloc() { py::print("~NoisyAlloc()"); }
         static void *operator new(size_t s) { py::print("noisy new"); return ::operator new(s); }
@@ -304,27 +352,33 @@ TEST_SUBMODULE(factory_constructors, m) {
         static void operator delete(void *p) { py::print("noisy delete"); ::operator delete(p); }
-    py::class_<NoisyAlloc>(m, "NoisyAlloc")
+    py::class_<NoisyAlloc> pyNoisyAlloc(m, "NoisyAlloc");
         // Since these overloads have the same number of arguments, the dispatcher will try each of
         // them until the arguments convert.  Thus we can get a pre-allocation here when passing a
         // single non-integer:
-        .def("__init__", [](NoisyAlloc *a, int i) { new (a) NoisyAlloc(i); }) // Regular constructor, runs first, requires preallocation
-        .def(py::init([](double d) { return new NoisyAlloc(d); }))
-        // The two-argument version: first the factory pointer overload.
-        .def(py::init([](int i, int) { return new NoisyAlloc(i); }))
-        // Return-by-value:
-        .def(py::init([](double d, int) { return NoisyAlloc(d); }))
-        // Old-style placement new init; requires preallocation
-        .def("__init__", [](NoisyAlloc &a, double d, double) { new (&a) NoisyAlloc(d); })
-        // Requires deallocation of previous overload preallocated value:
-        .def(py::init([](int i, double) { return new NoisyAlloc(i); }))
-        // Regular again: requires yet another preallocation
-        .def("__init__", [](NoisyAlloc &a, int i, std::string) { new (&a) NoisyAlloc(i); })
-        ;
+    ignoreOldStyleInitWarnings([&pyNoisyAlloc]() {
+        pyNoisyAlloc.def("__init__", [](NoisyAlloc *a, int i) { new (a) NoisyAlloc(i); }); // Regular constructor, runs first, requires preallocation
+    });
+    pyNoisyAlloc.def(py::init([](double d) { return new NoisyAlloc(d); }));
+    // The two-argument version: first the factory pointer overload.
+    pyNoisyAlloc.def(py::init([](int i, int) { return new NoisyAlloc(i); }));
+    // Return-by-value:
+    pyNoisyAlloc.def(py::init([](double d, int) { return NoisyAlloc(d); }));
+    // Old-style placement new init; requires preallocation
+    ignoreOldStyleInitWarnings([&pyNoisyAlloc]() {
+        pyNoisyAlloc.def("__init__", [](NoisyAlloc &a, double d, double) { new (&a) NoisyAlloc(d); });
+    });
+    // Requires deallocation of previous overload preallocated value:
+    pyNoisyAlloc.def(py::init([](int i, double) { return new NoisyAlloc(i); }));
+    // Regular again: requires yet another preallocation
+    ignoreOldStyleInitWarnings([&pyNoisyAlloc]() {
+        pyNoisyAlloc.def(
+            "__init__", [](NoisyAlloc &a, int i, const std::string &) { new (&a) NoisyAlloc(i); });
+    });
     // static_assert testing (the following def's should all fail with appropriate compilation errors):
 #if 0
diff --git a/wrap/pybind11/tests/test_factory_constructors.py b/wrap/pybind11/tests/test_factory_constructors.py
index b141c13de9..8bc0269852 100644
--- a/wrap/pybind11/tests/test_factory_constructors.py
+++ b/wrap/pybind11/tests/test_factory_constructors.py
@@ -1,18 +1,21 @@
 # -*- coding: utf-8 -*-
-import pytest
 import re
-import env  # noqa: F401
+import pytest
+import env  # noqa: F401
+from pybind11_tests import ConstructorStats
 from pybind11_tests import factory_constructors as m
 from pybind11_tests.factory_constructors import tag
-from pybind11_tests import ConstructorStats
 def test_init_factory_basic():
     """Tests py::init_factory() wrapper around various ways of returning the object"""
-    cstats = [ConstructorStats.get(c) for c in [m.TestFactory1, m.TestFactory2, m.TestFactory3]]
+    cstats = [
+        ConstructorStats.get(c)
+        for c in [m.TestFactory1, m.TestFactory2, m.TestFactory3]
+    ]
     cstats[0].alive()  # force gc
     n_inst = ConstructorStats.detail_reg_inst()
@@ -41,12 +44,12 @@ def test_init_factory_basic():
     z3 = m.TestFactory3("bye")
     assert z3.value == "bye"
-    for null_ptr_kind in [tag.null_ptr,
-                          tag.null_unique_ptr,
-                          tag.null_shared_ptr]:
+    for null_ptr_kind in [tag.null_ptr, tag.null_unique_ptr, tag.null_shared_ptr]:
         with pytest.raises(TypeError) as excinfo:
-        assert str(excinfo.value) == "pybind11::init(): factory function returned nullptr"
+        assert (
+            str(excinfo.value) == "pybind11::init(): factory function returned nullptr"
+        )
     assert [i.alive() for i in cstats] == [3, 3, 3]
     assert ConstructorStats.detail_reg_inst() == n_inst + 9
@@ -61,7 +64,7 @@ def test_init_factory_basic():
     assert [i.values() for i in cstats] == [
         ["3", "hi!"],
         ["7", "hi again"],
-        ["42", "bye"]
+        ["42", "bye"],
     assert [i.default_constructions for i in cstats] == [1, 1, 1]
@@ -69,7 +72,9 @@ def test_init_factory_basic():
 def test_init_factory_signature(msg):
     with pytest.raises(TypeError) as excinfo:
         m.TestFactory1("invalid", "constructor", "arguments")
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         __init__(): incompatible constructor arguments. The following argument types are supported:
             1. m.factory_constructors.TestFactory1(arg0: m.factory_constructors.tag.unique_ptr_tag, arg1: int)
             2. m.factory_constructors.TestFactory1(arg0: str)
@@ -78,8 +83,11 @@ def test_init_factory_signature(msg):
         Invoked with: 'invalid', 'constructor', 'arguments'
     """  # noqa: E501 line too long
+    )
-    assert msg(m.TestFactory1.__init__.__doc__) == """
+    assert (
+        msg(m.TestFactory1.__init__.__doc__)
+        == """
         __init__(*args, **kwargs)
         Overloaded function.
@@ -91,12 +99,16 @@ def test_init_factory_signature(msg):
         4. __init__(self: m.factory_constructors.TestFactory1, arg0: handle, arg1: int, arg2: handle) -> None
     """  # noqa: E501 line too long
+    )
 def test_init_factory_casting():
     """Tests py::init_factory() wrapper with various upcasting and downcasting returns"""
-    cstats = [ConstructorStats.get(c) for c in [m.TestFactory3, m.TestFactory4, m.TestFactory5]]
+    cstats = [
+        ConstructorStats.get(c)
+        for c in [m.TestFactory3, m.TestFactory4, m.TestFactory5]
+    ]
     cstats[0].alive()  # force gc
     n_inst = ConstructorStats.detail_reg_inst()
@@ -134,7 +146,7 @@ def test_init_factory_casting():
     assert [i.values() for i in cstats] == [
         ["4", "5", "6", "7", "8"],
         ["4", "5", "8"],
-        ["6", "7"]
+        ["6", "7"],
@@ -204,7 +216,7 @@ def get(self):
     assert [i.values() for i in cstats] == [
         ["1", "8", "3", "4", "5", "6", "123", "10", "47"],
-        ["hi there", "3", "4", "6", "move", "123", "why hello!", "move", "47"]
+        ["hi there", "3", "4", "6", "move", "123", "why hello!", "move", "47"],
@@ -268,9 +280,11 @@ def get(self):
     assert not g1.has_alias()
     with pytest.raises(TypeError) as excinfo:
         PythFactory7(tag.shared_ptr, tag.invalid_base, 14)
-    assert (str(excinfo.value) ==
-            "pybind11::init(): construction failed: returned holder-wrapped instance is not an "
-            "alias instance")
+    assert (
+        str(excinfo.value)
+        == "pybind11::init(): construction failed: returned holder-wrapped instance is not an "
+        "alias instance"
+    )
     assert [i.alive() for i in cstats] == [13, 7]
     assert ConstructorStats.detail_reg_inst() == n_inst + 13
@@ -284,7 +298,7 @@ def get(self):
     assert [i.values() for i in cstats] == [
         ["1", "2", "3", "4", "5", "6", "7", "8", "9", "100", "11", "12", "13", "14"],
-        ["2", "4", "6", "8", "9", "100", "12"]
+        ["2", "4", "6", "8", "9", "100", "12"],
@@ -294,7 +308,7 @@ def test_no_placement_new(capture):
     with capture:
         a = m.NoPlacementNew(123)
-    found = re.search(r'^operator new called, returning (\d+)\n$', str(capture))
+    found = re.search(r"^operator new called, returning (\d+)\n$", str(capture))
     assert found
     assert a.i == 123
     with capture:
@@ -305,7 +319,7 @@ def test_no_placement_new(capture):
     with capture:
         b = m.NoPlacementNew()
-    found = re.search(r'^operator new called, returning (\d+)\n$', str(capture))
+    found = re.search(r"^operator new called, returning (\d+)\n$", str(capture))
     assert found
     assert b.i == 100
     with capture:
@@ -333,7 +347,7 @@ def create_and_destroy(*args):
 def strip_comments(s):
-    return re.sub(r'\s+#.*', '', s)
+    return re.sub(r"\s+#.*", "", s)
 def test_reallocation_a(capture, msg):
@@ -345,7 +359,9 @@ def test_reallocation_a(capture, msg):
     with capture:
-    assert msg(capture) == """
+    assert (
+        msg(capture)
+        == """
         noisy new
         noisy placement new
         NoisyAlloc(int 1)
@@ -353,12 +369,14 @@ def test_reallocation_a(capture, msg):
         noisy delete
+    )
 def test_reallocation_b(capture, msg):
     with capture:
-    assert msg(capture) == strip_comments("""
+    assert msg(capture) == strip_comments(
+        """
         noisy new               # allocation required to attempt first overload
         noisy delete            # have to dealloc before considering factory init overload
         noisy new               # pointer factory calling "new", part 1: allocation
@@ -366,51 +384,59 @@ def test_reallocation_b(capture, msg):
         ~NoisyAlloc()  # Destructor
         noisy delete   # operator delete
-    """)
+    """
+    )
 def test_reallocation_c(capture, msg):
     with capture:
         create_and_destroy(2, 3)
-    assert msg(capture) == strip_comments("""
+    assert msg(capture) == strip_comments(
+        """
         noisy new          # pointer factory calling "new", allocation
         NoisyAlloc(int 2)  # constructor
         ~NoisyAlloc()  # Destructor
         noisy delete   # operator delete
-    """)
+    """
+    )
 def test_reallocation_d(capture, msg):
     with capture:
         create_and_destroy(2.5, 3)
-    assert msg(capture) == strip_comments("""
+    assert msg(capture) == strip_comments(
+        """
         NoisyAlloc(double 2.5)  # construction (local func variable: operator_new not called)
         noisy new               # return-by-value "new" part 1: allocation
         ~NoisyAlloc()           # moved-away local func variable destruction
         ~NoisyAlloc()  # Destructor
         noisy delete   # operator delete
-    """)
+    """
+    )
 def test_reallocation_e(capture, msg):
     with capture:
         create_and_destroy(3.5, 4.5)
-    assert msg(capture) == strip_comments("""
+    assert msg(capture) == strip_comments(
+        """
         noisy new               # preallocation needed before invoking placement-new overload
         noisy placement new     # Placement new
         NoisyAlloc(double 3.5)  # construction
         ~NoisyAlloc()  # Destructor
         noisy delete   # operator delete
-    """)
+    """
+    )
 def test_reallocation_f(capture, msg):
     with capture:
         create_and_destroy(4, 0.5)
-    assert msg(capture) == strip_comments("""
+    assert msg(capture) == strip_comments(
+        """
         noisy new          # preallocation needed before invoking placement-new overload
         noisy delete       # deallocation of preallocated storage
         noisy new          # Factory pointer allocation
@@ -418,13 +444,15 @@ def test_reallocation_f(capture, msg):
         ~NoisyAlloc()  # Destructor
         noisy delete   # operator delete
-    """)
+    """
+    )
 def test_reallocation_g(capture, msg):
     with capture:
         create_and_destroy(5, "hi")
-    assert msg(capture) == strip_comments("""
+    assert msg(capture) == strip_comments(
+        """
         noisy new            # preallocation needed before invoking first placement new
         noisy delete         # delete before considering new-style constructor
         noisy new            # preallocation for second placement new
@@ -433,13 +461,15 @@ def test_reallocation_g(capture, msg):
         ~NoisyAlloc()  # Destructor
         noisy delete   # operator delete
-    """)
+    """
+    )
 def test_invalid_self():
     """Tests invocation of the pybind-registered base class with an invalid `self` argument.  You
     can only actually do this on Python 3: Python 2 raises an exception itself if you try."""
     class NotPybindDerived(object):
@@ -456,23 +486,35 @@ def __init__(self, bad):
     # Same as above, but for a class with an alias:
     class BrokenTF6(m.TestFactory6):
         def __init__(self, bad):
-            if bad == 1:
+            if bad == 0:
+                m.TestFactory6.__init__()
+            elif bad == 1:
                 a = m.TestFactory2(tag.pointer, 1)
                 m.TestFactory6.__init__(a, tag.base, 1)
             elif bad == 2:
                 a = m.TestFactory2(tag.pointer, 1)
                 m.TestFactory6.__init__(a, tag.alias, 1)
             elif bad == 3:
-                m.TestFactory6.__init__(NotPybindDerived.__new__(NotPybindDerived), tag.base, 1)
+                m.TestFactory6.__init__(
+                    NotPybindDerived.__new__(NotPybindDerived), tag.base, 1
+                )
             elif bad == 4:
-                m.TestFactory6.__init__(NotPybindDerived.__new__(NotPybindDerived), tag.alias, 1)
+                m.TestFactory6.__init__(
+                    NotPybindDerived.__new__(NotPybindDerived), tag.alias, 1
+                )
     for arg in (1, 2):
         with pytest.raises(TypeError) as excinfo:
-        assert str(excinfo.value) == "__init__(self, ...) called with invalid `self` argument"
+        assert (
+            str(excinfo.value)
+            == "__init__(self, ...) called with invalid or missing `self` argument"
+        )
-    for arg in (1, 2, 3, 4):
+    for arg in (0, 1, 2, 3, 4):
         with pytest.raises(TypeError) as excinfo:
-        assert str(excinfo.value) == "__init__(self, ...) called with invalid `self` argument"
+        assert (
+            str(excinfo.value)
+            == "__init__(self, ...) called with invalid or missing `self` argument"
+        )
diff --git a/wrap/pybind11/tests/test_gil_scoped.cpp b/wrap/pybind11/tests/test_gil_scoped.cpp
index eb6308956c..b261085c88 100644
--- a/wrap/pybind11/tests/test_gil_scoped.cpp
+++ b/wrap/pybind11/tests/test_gil_scoped.cpp
@@ -35,20 +35,15 @@ TEST_SUBMODULE(gil_scoped, m) {
       .def("virtual_func", &VirtClass::virtual_func)
       .def("pure_virtual_func", &VirtClass::pure_virtual_func);
-    m.def("test_callback_py_obj",
-          [](py::object func) { func(); });
-    m.def("test_callback_std_func",
-          [](const std::function<void()> &func) { func(); });
-    m.def("test_callback_virtual_func",
-          [](VirtClass &virt) { virt.virtual_func(); });
-    m.def("test_callback_pure_virtual_func",
-          [](VirtClass &virt) { virt.pure_virtual_func(); });
-    m.def("test_cross_module_gil",
-          []() {
-              auto cm = py::module::import("cross_module_gil_utils");
-              auto gil_acquire = reinterpret_cast<void (*)()>(
-                  PyLong_AsVoidPtr(cm.attr("gil_acquire_funcaddr").ptr()));
-              py::gil_scoped_release gil_release;
-              gil_acquire();
-          });
+  m.def("test_callback_py_obj", [](py::object &func) { func(); });
+  m.def("test_callback_std_func", [](const std::function<void()> &func) { func(); });
+  m.def("test_callback_virtual_func", [](VirtClass &virt) { virt.virtual_func(); });
+  m.def("test_callback_pure_virtual_func", [](VirtClass &virt) { virt.pure_virtual_func(); });
+  m.def("test_cross_module_gil", []() {
+      auto cm = py::module_::import("cross_module_gil_utils");
+      auto gil_acquire
+          = reinterpret_cast<void (*)()>(PyLong_AsVoidPtr(cm.attr("gil_acquire_funcaddr").ptr()));
+      py::gil_scoped_release gil_release;
+      gil_acquire();
+  });
diff --git a/wrap/pybind11/tests/test_gil_scoped.py b/wrap/pybind11/tests/test_gil_scoped.py
index 27122cca28..0a1d62747d 100644
--- a/wrap/pybind11/tests/test_gil_scoped.py
+++ b/wrap/pybind11/tests/test_gil_scoped.py
@@ -2,10 +2,6 @@
 import multiprocessing
 import threading
-import pytest
-import env  # noqa: F401
 from pybind11_tests import gil_scoped as m
@@ -25,6 +21,7 @@ def _run_in_process(target, *args, **kwargs):
 def _python_to_cpp_to_python():
     """Calls different C++ functions that come back to Python."""
     class ExtendedVirtClass(m.VirtClass):
         def virtual_func(self):
@@ -54,8 +51,7 @@ def _python_to_cpp_to_python_from_threads(num_threads, parallel=False):
-# TODO: FIXME, sometimes returns -11 instead of 0
-@pytest.mark.xfail("env.PY > (3,8) and env.MACOS", strict=False)
+# TODO: FIXME, sometimes returns -11 (segfault) instead of 0 on macOS Python 3.9
 def test_python_to_cpp_to_python_from_thread():
     """Makes sure there is no GIL deadlock when running in a thread.
@@ -64,8 +60,7 @@ def test_python_to_cpp_to_python_from_thread():
     assert _run_in_process(_python_to_cpp_to_python_from_threads, 1) == 0
-@pytest.mark.xfail("env.PY > (3,8) and env.MACOS", strict=False)
+# TODO: FIXME on macOS Python 3.9
 def test_python_to_cpp_to_python_from_thread_multiple_parallel():
     """Makes sure there is no GIL deadlock when running in a thread multiple times in parallel.
@@ -74,18 +69,18 @@ def test_python_to_cpp_to_python_from_thread_multiple_parallel():
     assert _run_in_process(_python_to_cpp_to_python_from_threads, 8, parallel=True) == 0
-@pytest.mark.xfail("env.PY > (3,8) and env.MACOS", strict=False)
+# TODO: FIXME on macOS Python 3.9
 def test_python_to_cpp_to_python_from_thread_multiple_sequential():
     """Makes sure there is no GIL deadlock when running in a thread multiple times sequentially.
     It runs in a separate process to be able to stop and assert if it deadlocks.
-    assert _run_in_process(_python_to_cpp_to_python_from_threads, 8, parallel=False) == 0
+    assert (
+        _run_in_process(_python_to_cpp_to_python_from_threads, 8, parallel=False) == 0
+    )
-@pytest.mark.xfail("env.PY > (3,8) and env.MACOS", strict=False)
+# TODO: FIXME on macOS Python 3.9
 def test_python_to_cpp_to_python_from_process():
     """Makes sure there is no GIL deadlock when using processes.
diff --git a/wrap/pybind11/tests/test_iostream.cpp b/wrap/pybind11/tests/test_iostream.cpp
index e67f88af5f..c620b59493 100644
--- a/wrap/pybind11/tests/test_iostream.cpp
+++ b/wrap/pybind11/tests/test_iostream.cpp
@@ -7,37 +7,87 @@
     BSD-style license that can be found in the LICENSE file.
+#if defined(_MSC_VER) && _MSC_VER < 1910  // VS 2015's MSVC
+#  pragma warning(disable: 4702) // unreachable code in system header (xatomic.h(382))
 #include <pybind11/iostream.h>
 #include "pybind11_tests.h"
+#include <atomic>
 #include <iostream>
+#include <mutex>
+#include <string>
+#include <thread>
-void noisy_function(std::string msg, bool flush) {
+void noisy_function(const std::string &msg, bool flush) {
     std::cout << msg;
     if (flush)
         std::cout << std::flush;
-void noisy_funct_dual(std::string msg, std::string emsg) {
+void noisy_funct_dual(const std::string &msg, const std::string &emsg) {
     std::cout << msg;
     std::cerr << emsg;
+// object to manage C++ thread
+// simply repeatedly write to std::cerr until stopped
+// redirect is called at some point to test the safety of scoped_estream_redirect
+struct TestThread {
+    TestThread() : stop_{false} {
+        auto thread_f = [this] {
+            static std::mutex cout_mutex;
+            while (!stop_) {
+                {
+                    // #HelpAppreciated: Work on iostream.h thread safety.
+                    // Without this lock, the clang ThreadSanitizer (tsan) reliably reports a
+                    // data race, and this test is predictably flakey on Windows.
+                    // For more background see the discussion under
+                    // https://github.com/pybind/pybind11/pull/2982 and
+                    // https://github.com/pybind/pybind11/pull/2995.
+                    const std::lock_guard<std::mutex> lock(cout_mutex);
+                    std::cout << "x" << std::flush;
+                }
+                std::this_thread::sleep_for(std::chrono::microseconds(50));
+            } };
+        t_ = new std::thread(std::move(thread_f));
+    }
+    ~TestThread() {
+        delete t_;
+    }
+    void stop() { stop_ = true; }
+    void join() const {
+        py::gil_scoped_release gil_lock;
+        t_->join();
+    }
+    void sleep() {
+        py::gil_scoped_release gil_lock;
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    }
+    std::thread *t_{nullptr};
+    std::atomic<bool> stop_;
 TEST_SUBMODULE(iostream, m) {
     // test_evals
-    m.def("captured_output_default", [](std::string msg) {
+    m.def("captured_output_default", [](const std::string &msg) {
         py::scoped_ostream_redirect redir;
         std::cout << msg << std::flush;
-    m.def("captured_output", [](std::string msg) {
-        py::scoped_ostream_redirect redir(std::cout, py::module::import("sys").attr("stdout"));
+    m.def("captured_output", [](const std::string &msg) {
+        py::scoped_ostream_redirect redir(std::cout, py::module_::import("sys").attr("stdout"));
         std::cout << msg << std::flush;
@@ -45,8 +95,8 @@ TEST_SUBMODULE(iostream, m) {
             py::arg("msg"), py::arg("flush")=true);
-    m.def("captured_err", [](std::string msg) {
-        py::scoped_ostream_redirect redir(std::cerr, py::module::import("sys").attr("stderr"));
+    m.def("captured_err", [](const std::string &msg) {
+        py::scoped_ostream_redirect redir(std::cerr, py::module_::import("sys").attr("stderr"));
         std::cerr << msg << std::flush;
@@ -56,18 +106,20 @@ TEST_SUBMODULE(iostream, m) {
             py::call_guard<py::scoped_ostream_redirect, py::scoped_estream_redirect>(),
             py::arg("msg"), py::arg("emsg"));
-    m.def("raw_output", [](std::string msg) {
-        std::cout << msg << std::flush;
-    });
+    m.def("raw_output", [](const std::string &msg) { std::cout << msg << std::flush; });
-    m.def("raw_err", [](std::string msg) {
-        std::cerr << msg << std::flush;
-    });
+    m.def("raw_err", [](const std::string &msg) { std::cerr << msg << std::flush; });
-    m.def("captured_dual", [](std::string msg, std::string emsg) {
-        py::scoped_ostream_redirect redirout(std::cout, py::module::import("sys").attr("stdout"));
-        py::scoped_ostream_redirect redirerr(std::cerr, py::module::import("sys").attr("stderr"));
+    m.def("captured_dual", [](const std::string &msg, const std::string &emsg) {
+        py::scoped_ostream_redirect redirout(std::cout, py::module_::import("sys").attr("stdout"));
+        py::scoped_ostream_redirect redirerr(std::cerr, py::module_::import("sys").attr("stderr"));
         std::cout << msg << std::flush;
         std::cerr << emsg << std::flush;
+    py::class_<TestThread>(m, "TestThread")
+        .def(py::init<>())
+        .def("stop", &TestThread::stop)
+        .def("join", &TestThread::join)
+        .def("sleep", &TestThread::sleep);
diff --git a/wrap/pybind11/tests/test_iostream.py b/wrap/pybind11/tests/test_iostream.py
index 7ac4fcece0..7f18ca65c6 100644
--- a/wrap/pybind11/tests/test_iostream.py
+++ b/wrap/pybind11/tests/test_iostream.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-from pybind11_tests import iostream as m
 import sys
 from contextlib import contextmanager
+from pybind11_tests import iostream as m
     # Python 3
     from io import StringIO
@@ -18,6 +18,7 @@
     # Python 3.4
     from contextlib import redirect_stdout
 except ImportError:
     def redirect_stdout(target):
         original = sys.stdout
@@ -25,10 +26,12 @@ def redirect_stdout(target):
         sys.stdout = original
     # Python 3.5
     from contextlib import redirect_stderr
 except ImportError:
     def redirect_stderr(target):
         original = sys.stderr
@@ -42,16 +45,16 @@ def test_captured(capsys):
     stdout, stderr = capsys.readouterr()
     assert stdout == msg
-    assert stderr == ''
+    assert stderr == ""
     stdout, stderr = capsys.readouterr()
     assert stdout == msg
-    assert stderr == ''
+    assert stderr == ""
     stdout, stderr = capsys.readouterr()
-    assert stdout == ''
+    assert stdout == ""
     assert stderr == msg
@@ -63,7 +66,97 @@ def test_captured_large_string(capsys):
     stdout, stderr = capsys.readouterr()
     assert stdout == msg
-    assert stderr == ''
+    assert stderr == ""
+def test_captured_utf8_2byte_offset0(capsys):
+    msg = "\u07FF"
+    msg = "" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
+def test_captured_utf8_2byte_offset1(capsys):
+    msg = "\u07FF"
+    msg = "1" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
+def test_captured_utf8_3byte_offset0(capsys):
+    msg = "\uFFFF"
+    msg = "" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
+def test_captured_utf8_3byte_offset1(capsys):
+    msg = "\uFFFF"
+    msg = "1" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
+def test_captured_utf8_3byte_offset2(capsys):
+    msg = "\uFFFF"
+    msg = "12" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
+def test_captured_utf8_4byte_offset0(capsys):
+    msg = "\U0010FFFF"
+    msg = "" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
+def test_captured_utf8_4byte_offset1(capsys):
+    msg = "\U0010FFFF"
+    msg = "1" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
+def test_captured_utf8_4byte_offset2(capsys):
+    msg = "\U0010FFFF"
+    msg = "12" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
+def test_captured_utf8_4byte_offset3(capsys):
+    msg = "\U0010FFFF"
+    msg = "123" + msg * (1024 // len(msg) + 1)
+    m.captured_output_default(msg)
+    stdout, stderr = capsys.readouterr()
+    assert stdout == msg
+    assert stderr == ""
 def test_guard_capture(capsys):
@@ -71,7 +164,7 @@ def test_guard_capture(capsys):
     stdout, stderr = capsys.readouterr()
     assert stdout == msg
-    assert stderr == ''
+    assert stderr == ""
 def test_series_captured(capture):
@@ -88,7 +181,7 @@ def test_flush(capfd):
     with m.ostream_redirect():
         m.noisy_function(msg, flush=False)
         stdout, stderr = capfd.readouterr()
-        assert stdout == ''
+        assert stdout == ""
         m.noisy_function(msg2, flush=True)
         stdout, stderr = capfd.readouterr()
@@ -107,15 +200,15 @@ def test_not_captured(capfd):
     stdout, stderr = capfd.readouterr()
     assert stdout == msg
-    assert stderr == ''
-    assert stream.getvalue() == ''
+    assert stderr == ""
+    assert stream.getvalue() == ""
     stream = StringIO()
     with redirect_stdout(stream):
     stdout, stderr = capfd.readouterr()
-    assert stdout == ''
-    assert stderr == ''
+    assert stdout == ""
+    assert stderr == ""
     assert stream.getvalue() == msg
@@ -125,16 +218,16 @@ def test_err(capfd):
     with redirect_stderr(stream):
     stdout, stderr = capfd.readouterr()
-    assert stdout == ''
+    assert stdout == ""
     assert stderr == msg
-    assert stream.getvalue() == ''
+    assert stream.getvalue() == ""
     stream = StringIO()
     with redirect_stderr(stream):
     stdout, stderr = capfd.readouterr()
-    assert stdout == ''
-    assert stderr == ''
+    assert stdout == ""
+    assert stderr == ""
     assert stream.getvalue() == msg
@@ -146,8 +239,8 @@ def test_multi_captured(capfd):
     stdout, stderr = capfd.readouterr()
-    assert stdout == 'bd'
-    assert stream.getvalue() == 'ac'
+    assert stdout == "bd"
+    assert stream.getvalue() == "ac"
 def test_dual(capsys):
@@ -164,14 +257,14 @@ def test_redirect(capfd):
     stdout, stderr = capfd.readouterr()
     assert stdout == msg
-    assert stream.getvalue() == ''
+    assert stream.getvalue() == ""
     stream = StringIO()
     with redirect_stdout(stream):
         with m.ostream_redirect():
     stdout, stderr = capfd.readouterr()
-    assert stdout == ''
+    assert stdout == ""
     assert stream.getvalue() == msg
     stream = StringIO()
@@ -179,7 +272,7 @@ def test_redirect(capfd):
     stdout, stderr = capfd.readouterr()
     assert stdout == msg
-    assert stream.getvalue() == ''
+    assert stream.getvalue() == ""
 def test_redirect_err(capfd):
@@ -193,7 +286,7 @@ def test_redirect_err(capfd):
     stdout, stderr = capfd.readouterr()
     assert stdout == msg
-    assert stderr == ''
+    assert stderr == ""
     assert stream.getvalue() == msg2
@@ -209,7 +302,30 @@ def test_redirect_both(capfd):
     stdout, stderr = capfd.readouterr()
-    assert stdout == ''
-    assert stderr == ''
+    assert stdout == ""
+    assert stderr == ""
     assert stream.getvalue() == msg
     assert stream2.getvalue() == msg2
+def test_threading():
+    with m.ostream_redirect(stdout=True, stderr=False):
+        # start some threads
+        threads = []
+        # start some threads
+        for _j in range(20):
+            threads.append(m.TestThread())
+        # give the threads some time to fail
+        threads[0].sleep()
+        # stop all the threads
+        for t in threads:
+            t.stop()
+        for t in threads:
+            t.join()
+        # if a thread segfaults, we don't get here
+        assert True
diff --git a/wrap/pybind11/tests/test_kwargs_and_defaults.cpp b/wrap/pybind11/tests/test_kwargs_and_defaults.cpp
index 641ec88c45..34ad2a8647 100644
--- a/wrap/pybind11/tests/test_kwargs_and_defaults.cpp
+++ b/wrap/pybind11/tests/test_kwargs_and_defaults.cpp
@@ -11,6 +11,8 @@
 #include "constructor_stats.h"
 #include <pybind11/stl.h>
+#include <utility>
 TEST_SUBMODULE(kwargs_and_defaults, m) {
     auto kw_func = [](int x, int y) { return "x=" + std::to_string(x) + ", y=" + std::to_string(y); };
@@ -37,18 +39,16 @@ TEST_SUBMODULE(kwargs_and_defaults, m) {
     m.def("args_function", [](py::args args) -> py::tuple {
         return std::move(args);
-    m.def("args_kwargs_function", [](py::args args, py::kwargs kwargs) {
+    m.def("args_kwargs_function", [](const py::args &args, const py::kwargs &kwargs) {
         return py::make_tuple(args, kwargs);
     // test_mixed_args_and_kwargs
-    m.def("mixed_plus_args", [](int i, double j, py::args args) {
-        return py::make_tuple(i, j, args);
-    });
-    m.def("mixed_plus_kwargs", [](int i, double j, py::kwargs kwargs) {
-        return py::make_tuple(i, j, kwargs);
-    });
-    auto mixed_plus_both = [](int i, double j, py::args args, py::kwargs kwargs) {
+    m.def("mixed_plus_args",
+          [](int i, double j, const py::args &args) { return py::make_tuple(i, j, args); });
+    m.def("mixed_plus_kwargs",
+          [](int i, double j, const py::kwargs &kwargs) { return py::make_tuple(i, j, kwargs); });
+    auto mixed_plus_both = [](int i, double j, const py::args &args, const py::kwargs &kwargs) {
         return py::make_tuple(i, j, args, kwargs);
     m.def("mixed_plus_args_kwargs", mixed_plus_both);
@@ -56,6 +56,23 @@ TEST_SUBMODULE(kwargs_and_defaults, m) {
     m.def("mixed_plus_args_kwargs_defaults", mixed_plus_both,
             py::arg("i") = 1, py::arg("j") = 3.14159);
+    m.def("args_kwonly",
+            [](int i, double j, const py::args &args, int z) { return py::make_tuple(i, j, args, z); },
+            "i"_a, "j"_a, "z"_a);
+    m.def("args_kwonly_kwargs",
+            [](int i, double j, const py::args &args, int z, const py::kwargs &kwargs) {
+                return py::make_tuple(i, j, args, z, kwargs); },
+            "i"_a, "j"_a, py::kw_only{}, "z"_a);
+    m.def("args_kwonly_kwargs_defaults",
+            [](int i, double j, const py::args &args, int z, const py::kwargs &kwargs) {
+                return py::make_tuple(i, j, args, z, kwargs); },
+            "i"_a = 1, "j"_a = 3.14159, "z"_a = 42);
+    m.def("args_kwonly_full_monty",
+            [](int h, int i, double j, const py::args &args, int z, const py::kwargs &kwargs) {
+                return py::make_tuple(h, i, j, args, z, kwargs); },
+            py::arg() = 1, py::arg() = 2, py::pos_only{}, "j"_a = 3.14159, "z"_a = 42);
     // test_args_refcount
     // PyPy needs a garbage collection to get the reference count values to match CPython's behaviour
     #ifdef PYPY_VERSION
@@ -65,22 +82,25 @@ TEST_SUBMODULE(kwargs_and_defaults, m) {
     m.def("arg_refcount_h", [](py::handle h) { GC_IF_NEEDED; return h.ref_count(); });
     m.def("arg_refcount_h", [](py::handle h, py::handle, py::handle) { GC_IF_NEEDED; return h.ref_count(); });
-    m.def("arg_refcount_o", [](py::object o) { GC_IF_NEEDED; return o.ref_count(); });
+    m.def("arg_refcount_o", [](const py::object &o) {
+        GC_IF_NEEDED;
+        return o.ref_count();
+    });
     m.def("args_refcount", [](py::args a) {
         py::tuple t(a.size());
         for (size_t i = 0; i < a.size(); i++)
             // Use raw Python API here to avoid an extra, intermediate incref on the tuple item:
-            t[i] = (int) Py_REFCNT(PyTuple_GET_ITEM(a.ptr(), static_cast<ssize_t>(i)));
+            t[i] = (int) Py_REFCNT(PyTuple_GET_ITEM(a.ptr(), static_cast<py::ssize_t>(i)));
         return t;
-    m.def("mixed_args_refcount", [](py::object o, py::args a) {
+    m.def("mixed_args_refcount", [](const py::object &o, py::args a) {
         py::tuple t(a.size() + 1);
         t[0] = o.ref_count();
         for (size_t i = 0; i < a.size(); i++)
             // Use raw Python API here to avoid an extra, intermediate incref on the tuple item:
-            t[i + 1] = (int) Py_REFCNT(PyTuple_GET_ITEM(a.ptr(), static_cast<ssize_t>(i)));
+            t[i + 1] = (int) Py_REFCNT(PyTuple_GET_ITEM(a.ptr(), static_cast<py::ssize_t>(i)));
         return t;
@@ -103,11 +123,17 @@ TEST_SUBMODULE(kwargs_and_defaults, m) {
             py::arg() = 3, "j"_a = 4, py::kw_only(), "k"_a = 5, "z"_a);
     m.def("kw_only_mixed", [](int i, int j) { return py::make_tuple(i, j); },
             "i"_a, py::kw_only(), "j"_a);
-    m.def("kw_only_plus_more", [](int i, int j, int k, py::kwargs kwargs) {
-            return py::make_tuple(i, j, k, kwargs); },
-            py::arg() /* positional */, py::arg("j") = -1 /* both */, py::kw_only(), py::arg("k") /* kw-only */);
-    m.def("register_invalid_kw_only", [](py::module m) {
+    m.def(
+        "kw_only_plus_more",
+        [](int i, int j, int k, const py::kwargs &kwargs) {
+            return py::make_tuple(i, j, k, kwargs);
+        },
+        py::arg() /* positional */,
+        py::arg("j") = -1 /* both */,
+        py::kw_only(),
+        py::arg("k") /* kw-only */);
+    m.def("register_invalid_kw_only", [](py::module_ m) {
         m.def("bad_kw_only", [](int i, int j) { return py::make_tuple(i, j); },
                 py::kw_only(), py::arg() /* invalid unnamed argument */, "j"_a);
@@ -137,6 +163,25 @@ TEST_SUBMODULE(kwargs_and_defaults, m) {
     // Make sure a class (not an instance) can be used as a default argument.
     // The return value doesn't matter, only that the module is importable.
-    m.def("class_default_argument", [](py::object a) { return py::repr(a); },
-        "a"_a = py::module::import("decimal").attr("Decimal"));
+    m.def(
+        "class_default_argument",
+        [](py::object a) { return py::repr(std::move(a)); },
+        "a"_a = py::module_::import("decimal").attr("Decimal"));
+    // Initial implementation of kw_only was broken when used on a method/constructor before any
+    // other arguments
+    // https://github.com/pybind/pybind11/pull/3402#issuecomment-963341987
+    struct first_arg_kw_only {};
+    py::class_<first_arg_kw_only>(m, "first_arg_kw_only")
+        .def(py::init([](int) { return first_arg_kw_only(); }),
+             py::kw_only(), // This being before any args was broken
+             py::arg("i") = 0)
+        .def("method", [](first_arg_kw_only&, int, int) {},
+             py::kw_only(), // and likewise here
+             py::arg("i") = 1, py::arg("j") = 2)
+        // Closely related: pos_only marker didn't show up properly when it was before any other
+        // arguments (although that is fairly useless in practice).
+        .def("pos_only", [](first_arg_kw_only&, int, int) {},
+                py::pos_only{}, py::arg("i"), py::arg("j"));
diff --git a/wrap/pybind11/tests/test_kwargs_and_defaults.py b/wrap/pybind11/tests/test_kwargs_and_defaults.py
index 2a81dbdc50..d61cf2aa58 100644
--- a/wrap/pybind11/tests/test_kwargs_and_defaults.py
+++ b/wrap/pybind11/tests/test_kwargs_and_defaults.py
@@ -2,7 +2,6 @@
 import pytest
 import env  # noqa: F401
 from pybind11_tests import kwargs_and_defaults as m
@@ -15,11 +14,17 @@ def test_function_signatures(doc):
     assert doc(m.kw_func_udl) == "kw_func_udl(x: int, y: int = 300) -> str"
     assert doc(m.kw_func_udl_z) == "kw_func_udl_z(x: int, y: int = 0) -> str"
     assert doc(m.args_function) == "args_function(*args) -> tuple"
-    assert doc(m.args_kwargs_function) == "args_kwargs_function(*args, **kwargs) -> tuple"
-    assert doc(m.KWClass.foo0) == \
-        "foo0(self: m.kwargs_and_defaults.KWClass, arg0: int, arg1: float) -> None"
-    assert doc(m.KWClass.foo1) == \
-        "foo1(self: m.kwargs_and_defaults.KWClass, x: int, y: float) -> None"
+    assert (
+        doc(m.args_kwargs_function) == "args_kwargs_function(*args, **kwargs) -> tuple"
+    )
+    assert (
+        doc(m.KWClass.foo0)
+        == "foo0(self: m.kwargs_and_defaults.KWClass, arg0: int, arg1: float) -> None"
+    )
+    assert (
+        doc(m.KWClass.foo1)
+        == "foo1(self: m.kwargs_and_defaults.KWClass, x: int, y: float) -> None"
+    )
 def test_named_arguments(msg):
@@ -40,7 +45,9 @@ def test_named_arguments(msg):
         # noinspection PyArgumentList
         m.kw_func2(x=5, y=10, z=12)
     assert excinfo.match(
-        r'(?s)^kw_func2\(\): incompatible.*Invoked with: kwargs: ((x=5|y=10|z=12)(, |$))' + '{3}$')
+        r"(?s)^kw_func2\(\): incompatible.*Invoked with: kwargs: ((x=5|y=10|z=12)(, |$))"
+        + "{3}$"
+    )
     assert m.kw_func4() == "{13 17}"
     assert m.kw_func4(myList=[1, 2, 3]) == "{1 2 3}"
@@ -50,11 +57,11 @@ def test_named_arguments(msg):
 def test_arg_and_kwargs():
-    args = 'arg1_value', 'arg2_value', 3
+    args = "arg1_value", "arg2_value", 3
     assert m.args_function(*args) == args
-    args = 'a1', 'a2'
-    kwargs = dict(arg3='a3', arg4=4)
+    args = "a1", "a2"
+    kwargs = dict(arg3="a3", arg4=4)
     assert m.args_kwargs_function(*args, **kwargs) == (args, kwargs)
@@ -68,47 +75,118 @@ def test_mixed_args_and_kwargs(msg):
     assert mpa(1, 2.5) == (1, 2.5, ())
     with pytest.raises(TypeError) as excinfo:
         assert mpa(1)
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         mixed_plus_args(): incompatible function arguments. The following argument types are supported:
             1. (arg0: int, arg1: float, *args) -> tuple
         Invoked with: 1
     """  # noqa: E501 line too long
+    )
     with pytest.raises(TypeError) as excinfo:
         assert mpa()
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         mixed_plus_args(): incompatible function arguments. The following argument types are supported:
             1. (arg0: int, arg1: float, *args) -> tuple
         Invoked with:
     """  # noqa: E501 line too long
+    )
-    assert mpk(-2, 3.5, pi=3.14159, e=2.71828) == (-2, 3.5, {'e': 2.71828, 'pi': 3.14159})
+    assert mpk(-2, 3.5, pi=3.14159, e=2.71828) == (
+        -2,
+        3.5,
+        {"e": 2.71828, "pi": 3.14159},
+    )
     assert mpak(7, 7.7, 7.77, 7.777, 7.7777, minusseven=-7) == (
-        7, 7.7, (7.77, 7.777, 7.7777), {'minusseven': -7})
+        7,
+        7.7,
+        (7.77, 7.777, 7.7777),
+        {"minusseven": -7},
+    )
     assert mpakd() == (1, 3.14159, (), {})
     assert mpakd(3) == (3, 3.14159, (), {})
     assert mpakd(j=2.71828) == (1, 2.71828, (), {})
-    assert mpakd(k=42) == (1, 3.14159, (), {'k': 42})
+    assert mpakd(k=42) == (1, 3.14159, (), {"k": 42})
     assert mpakd(1, 1, 2, 3, 5, 8, then=13, followedby=21) == (
-        1, 1, (2, 3, 5, 8), {'then': 13, 'followedby': 21})
+        1,
+        1,
+        (2, 3, 5, 8),
+        {"then": 13, "followedby": 21},
+    )
     # Arguments specified both positionally and via kwargs should fail:
     with pytest.raises(TypeError) as excinfo:
         assert mpakd(1, i=1)
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         mixed_plus_args_kwargs_defaults(): incompatible function arguments. The following argument types are supported:
             1. (i: int = 1, j: float = 3.14159, *args, **kwargs) -> tuple
         Invoked with: 1; kwargs: i=1
     """  # noqa: E501 line too long
+    )
     with pytest.raises(TypeError) as excinfo:
         assert mpakd(1, 2, j=1)
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         mixed_plus_args_kwargs_defaults(): incompatible function arguments. The following argument types are supported:
             1. (i: int = 1, j: float = 3.14159, *args, **kwargs) -> tuple
         Invoked with: 1, 2; kwargs: j=1
     """  # noqa: E501 line too long
+    )
+    # Arguments after a py::args are automatically keyword-only (pybind 2.9+)
+    assert m.args_kwonly(2, 2.5, z=22) == (2, 2.5, (), 22)
+    assert m.args_kwonly(2, 2.5, "a", "b", "c", z=22) == (2, 2.5, ("a", "b", "c"), 22)
+    assert m.args_kwonly(z=22, i=4, j=16) == (4, 16, (), 22)
+    with pytest.raises(TypeError) as excinfo:
+        assert m.args_kwonly(2, 2.5, 22)  # missing z= keyword
+    assert (
+        msg(excinfo.value)
+        == """
+        args_kwonly(): incompatible function arguments. The following argument types are supported:
+            1. (i: int, j: float, *args, z: int) -> tuple
+        Invoked with: 2, 2.5, 22
+    """
+    )
+    assert m.args_kwonly_kwargs(i=1, k=4, j=10, z=-1, y=9) == (
+        1,
+        10,
+        (),
+        -1,
+        {"k": 4, "y": 9},
+    )
+    assert m.args_kwonly_kwargs(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, z=11, y=12) == (
+        1,
+        2,
+        (3, 4, 5, 6, 7, 8, 9, 10),
+        11,
+        {"y": 12},
+    )
+    assert (
+        m.args_kwonly_kwargs.__doc__
+        == "args_kwonly_kwargs(i: int, j: float, *args, z: int, **kwargs) -> tuple\n"
+    )
+    assert (
+        m.args_kwonly_kwargs_defaults.__doc__
+        == "args_kwonly_kwargs_defaults(i: int = 1, j: float = 3.14159, *args, z: int = 42, **kwargs) -> tuple\n"  # noqa: E501 line too long
+    )
+    assert m.args_kwonly_kwargs_defaults() == (1, 3.14159, (), 42, {})
+    assert m.args_kwonly_kwargs_defaults(2) == (2, 3.14159, (), 42, {})
+    assert m.args_kwonly_kwargs_defaults(z=-99) == (1, 3.14159, (), -99, {})
+    assert m.args_kwonly_kwargs_defaults(5, 6, 7, 8) == (5, 6, (7, 8), 42, {})
+    assert m.args_kwonly_kwargs_defaults(5, 6, 7, m=8) == (5, 6, (7,), 42, {"m": 8})
+    assert m.args_kwonly_kwargs_defaults(5, 6, 7, m=8, z=9) == (5, 6, (7,), 9, {"m": 8})
 def test_keyword_only_args(msg):
@@ -134,9 +212,9 @@ def test_keyword_only_args(msg):
     assert m.kw_only_mixed(j=2, i=3) == (3, 2)
     assert m.kw_only_mixed(i=2, j=3) == (2, 3)
-    assert m.kw_only_plus_more(4, 5, k=6, extra=7) == (4, 5, 6, {'extra': 7})
-    assert m.kw_only_plus_more(3, k=5, j=4, extra=6) == (3, 4, 5, {'extra': 6})
-    assert m.kw_only_plus_more(2, k=3, extra=4) == (2, -1, 3, {'extra': 4})
+    assert m.kw_only_plus_more(4, 5, k=6, extra=7) == (4, 5, 6, {"extra": 7})
+    assert m.kw_only_plus_more(3, k=5, j=4, extra=6) == (3, 4, 5, {"extra": 6})
+    assert m.kw_only_plus_more(2, k=3, extra=4) == (2, -1, 3, {"extra": 4})
     with pytest.raises(TypeError) as excinfo:
         assert m.kw_only_mixed(i=1) == (1,)
@@ -144,9 +222,25 @@ def test_keyword_only_args(msg):
     with pytest.raises(RuntimeError) as excinfo:
-    assert msg(excinfo.value) == """
-        arg(): cannot specify an unnamed argument after an kw_only() annotation
+    assert (
+        msg(excinfo.value)
+        == """
+        arg(): cannot specify an unnamed argument after a kw_only() annotation or args() argument
+    )
+    # https://github.com/pybind/pybind11/pull/3402#issuecomment-963341987
+    x = m.first_arg_kw_only(i=1)
+    x.method()
+    x.method(i=1, j=2)
+    assert (
+        m.first_arg_kw_only.__init__.__doc__
+        == "__init__(self: pybind11_tests.kwargs_and_defaults.first_arg_kw_only, *, i: int = 0) -> None\n"  # noqa: E501 line too long
+    )
+    assert (
+        m.first_arg_kw_only.method.__doc__
+        == "method(self: pybind11_tests.kwargs_and_defaults.first_arg_kw_only, *, i: int = 1, j: int = 2) -> None\n"  # noqa: E501 line too long
+    )
 def test_positional_only_args(msg):
@@ -188,13 +282,65 @@ def test_positional_only_args(msg):
         m.pos_only_def_mix(1, j=4)
     assert "incompatible function arguments" in str(excinfo.value)
+    # Mix it with args and kwargs:
+    assert (
+        m.args_kwonly_full_monty.__doc__
+        == "args_kwonly_full_monty(arg0: int = 1, arg1: int = 2, /, j: float = 3.14159, *args, z: int = 42, **kwargs) -> tuple\n"  # noqa: E501 line too long
+    )
+    assert m.args_kwonly_full_monty() == (1, 2, 3.14159, (), 42, {})
+    assert m.args_kwonly_full_monty(8) == (8, 2, 3.14159, (), 42, {})
+    assert m.args_kwonly_full_monty(8, 9) == (8, 9, 3.14159, (), 42, {})
+    assert m.args_kwonly_full_monty(8, 9, 10) == (8, 9, 10.0, (), 42, {})
+    assert m.args_kwonly_full_monty(3, 4, 5, 6, 7, m=8, z=9) == (
+        3,
+        4,
+        5.0,
+        (
+            6,
+            7,
+        ),
+        9,
+        {"m": 8},
+    )
+    assert m.args_kwonly_full_monty(3, 4, 5, 6, 7, m=8, z=9) == (
+        3,
+        4,
+        5.0,
+        (
+            6,
+            7,
+        ),
+        9,
+        {"m": 8},
+    )
+    assert m.args_kwonly_full_monty(5, j=7, m=8, z=9) == (5, 2, 7.0, (), 9, {"m": 8})
+    assert m.args_kwonly_full_monty(i=5, j=7, m=8, z=9) == (
+        1,
+        2,
+        7.0,
+        (),
+        9,
+        {"i": 5, "m": 8},
+    )
+    # pos_only at the beginning of the argument list was "broken" in how it was displayed (though
+    # this is fairly useless in practice).  Related to:
+    # https://github.com/pybind/pybind11/pull/3402#issuecomment-963341987
+    assert (
+        m.first_arg_kw_only.pos_only.__doc__
+        == "pos_only(self: pybind11_tests.kwargs_and_defaults.first_arg_kw_only, /, i: int, j: int) -> None\n"  # noqa: E501 line too long
+    )
 def test_signatures():
     assert "kw_only_all(*, i: int, j: int) -> tuple\n" == m.kw_only_all.__doc__
     assert "kw_only_mixed(i: int, *, j: int) -> tuple\n" == m.kw_only_mixed.__doc__
     assert "pos_only_all(i: int, j: int, /) -> tuple\n" == m.pos_only_all.__doc__
     assert "pos_only_mix(i: int, /, j: int) -> tuple\n" == m.pos_only_mix.__doc__
-    assert "pos_kw_only_mix(i: int, /, j: int, *, k: int) -> tuple\n" == m.pos_kw_only_mix.__doc__
+    assert (
+        "pos_kw_only_mix(i: int, /, j: int, *, k: int) -> tuple\n"
+        == m.pos_kw_only_mix.__doc__
+    )
 @pytest.mark.xfail("env.PYPY and env.PY2", reason="PyPy2 doesn't double count")
@@ -219,11 +365,18 @@ def test_args_refcount():
     assert m.args_function(-1, myval) == (-1, myval)
     assert refcount(myval) == expected
-    assert m.mixed_plus_args_kwargs(5, 6.0, myval, a=myval) == (5, 6.0, (myval,), {"a": myval})
+    assert m.mixed_plus_args_kwargs(5, 6.0, myval, a=myval) == (
+        5,
+        6.0,
+        (myval,),
+        {"a": myval},
+    )
     assert refcount(myval) == expected
-    assert m.args_kwargs_function(7, 8, myval, a=1, b=myval) == \
-        ((7, 8, myval), {"a": 1, "b": myval})
+    assert m.args_kwargs_function(7, 8, myval, a=1, b=myval) == (
+        (7, 8, myval),
+        {"a": 1, "b": myval},
+    )
     assert refcount(myval) == expected
     exp3 = refcount(myval, myval, myval)
diff --git a/wrap/pybind11/tests/test_local_bindings.cpp b/wrap/pybind11/tests/test_local_bindings.cpp
index 97c02dbeb5..a5808e2f2a 100644
--- a/wrap/pybind11/tests/test_local_bindings.cpp
+++ b/wrap/pybind11/tests/test_local_bindings.cpp
@@ -10,9 +10,12 @@
 #include "pybind11_tests.h"
 #include "local_bindings.h"
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
 #include <numeric>
+#include <utility>
 TEST_SUBMODULE(local_bindings, m) {
     // test_load_external
@@ -41,7 +44,7 @@ TEST_SUBMODULE(local_bindings, m) {
     // should raise a runtime error from the duplicate definition attempt.  If test_class isn't
     // available it *also* throws a runtime error (with "test_class not enabled" as value).
     m.def("register_local_external", [m]() {
-        auto main = py::module::import("pybind11_tests");
+        auto main = py::module_::import("pybind11_tests");
         if (py::hasattr(main, "class_")) {
             bind_local<LocalExternal, 7>(m, "LocalExternal", py::module_local());
@@ -75,7 +78,7 @@ TEST_SUBMODULE(local_bindings, m) {
     m.def("get_mixed_lg", [](int i) { return MixedLocalGlobal(i); });
     // test_internal_locals_differ
-    m.def("local_cpp_types_addr", []() { return (uintptr_t) &py::detail::registered_local_types_cpp(); });
+    m.def("local_cpp_types_addr", []() { return (uintptr_t) &py::detail::get_local_internals().registered_types_cpp; });
     // test_stl_caster_vs_stl_bind
     m.def("load_vector_via_caster", [](std::vector<int> v) {
@@ -86,7 +89,10 @@ TEST_SUBMODULE(local_bindings, m) {
     m.def("return_self", [](LocalVec *v) { return v; });
     m.def("return_copy", [](const LocalVec &v) { return LocalVec(v); });
-    class Cat : public pets::Pet { public: Cat(std::string name) : Pet(name) {}; };
+    class Cat : public pets::Pet {
+    public:
+        explicit Cat(std::string name) : Pet(std::move(name)) {}
+    };
     py::class_<pets::Pet>(m, "Pet", py::module_local())
         .def("get_name", &pets::Pet::name);
     // Binding for local extending class:
diff --git a/wrap/pybind11/tests/test_local_bindings.py b/wrap/pybind11/tests/test_local_bindings.py
index 5460727e1d..52b1b63358 100644
--- a/wrap/pybind11/tests/test_local_bindings.py
+++ b/wrap/pybind11/tests/test_local_bindings.py
@@ -2,7 +2,6 @@
 import pytest
 import env  # noqa: F401
 from pybind11_tests import local_bindings as m
@@ -36,8 +35,8 @@ def test_local_bindings():
     assert i2.get() == 11
     assert i2.get2() == 12
-    assert not hasattr(i1, 'get2')
-    assert not hasattr(i2, 'get3')
+    assert not hasattr(i1, "get2")
+    assert not hasattr(i2, "get3")
     # Loading within the local module
     assert m.local_value(i1) == 5
@@ -55,7 +54,9 @@ def test_nonlocal_failure():
     with pytest.raises(RuntimeError) as excinfo:
-    assert str(excinfo.value) == 'generic_type: type "NonLocalType" is already registered!'
+    assert (
+        str(excinfo.value) == 'generic_type: type "NonLocalType" is already registered!'
+    )
 def test_duplicate_local():
@@ -63,9 +64,12 @@ def test_duplicate_local():
     with pytest.raises(RuntimeError) as excinfo:
     import pybind11_tests
     assert str(excinfo.value) == (
         'generic_type: type "LocalExternal" is already registered!'
-        if hasattr(pybind11_tests, 'class_') else 'test_class not enabled')
+        if hasattr(pybind11_tests, "class_")
+        else "test_class not enabled"
+    )
 def test_stl_bind_local():
@@ -98,8 +102,8 @@ def test_stl_bind_local():
     d1["b"] = v1[1]
     d2["c"] = v2[0]
     d2["d"] = v2[1]
-    assert {i: d1[i].get() for i in d1} == {'a': 0, 'b': 1}
-    assert {i: d2[i].get() for i in d2} == {'c': 2, 'd': 3}
+    assert {i: d1[i].get() for i in d1} == {"a": 0, "b": 1}
+    assert {i: d2[i].get() for i in d2} == {"c": 2, "d": 3}
 def test_stl_bind_global():
@@ -107,15 +111,21 @@ def test_stl_bind_global():
     with pytest.raises(RuntimeError) as excinfo:
-    assert str(excinfo.value) == 'generic_type: type "NonLocalMap" is already registered!'
+    assert (
+        str(excinfo.value) == 'generic_type: type "NonLocalMap" is already registered!'
+    )
     with pytest.raises(RuntimeError) as excinfo:
-    assert str(excinfo.value) == 'generic_type: type "NonLocalVec" is already registered!'
+    assert (
+        str(excinfo.value) == 'generic_type: type "NonLocalVec" is already registered!'
+    )
     with pytest.raises(RuntimeError) as excinfo:
-    assert str(excinfo.value) == 'generic_type: type "NonLocalMap2" is already registered!'
+    assert (
+        str(excinfo.value) == 'generic_type: type "NonLocalMap2" is already registered!'
+    )
 def test_mixed_local_global():
@@ -123,6 +133,7 @@ def test_mixed_local_global():
     type can be registered even if the type is already registered globally.  With the module,
     casting will go to the local type; outside the module casting goes to the global type."""
     import pybind11_cross_module_tests as cm
@@ -145,17 +156,30 @@ def test_mixed_local_global():
-    assert [x.get() for x in a] == \
-        [101, 1002, 103, 1004, 105, 1006, 207, 2008, 109, 1010, 211, 2012]
+    assert [x.get() for x in a] == [
+        101,
+        1002,
+        103,
+        1004,
+        105,
+        1006,
+        207,
+        2008,
+        109,
+        1010,
+        211,
+        2012,
+    ]
 def test_internal_locals_differ():
     """Makes sure the internal local type map differs across the two modules"""
     import pybind11_cross_module_tests as cm
     assert m.local_cpp_types_addr() != cm.local_cpp_types_addr()
+@pytest.mark.xfail("env.PYPY and sys.pypy_version_info < (7, 3, 2)")
 def test_stl_caster_vs_stl_bind(msg):
     """One module uses a generic vector caster from `<pybind11/stl.h>` while the other
     exports `std::vector<int>` via `py:bind_vector` and `py::module_local`"""
@@ -168,13 +192,16 @@ def test_stl_caster_vs_stl_bind(msg):
     v2 = [1, 2, 3]
     assert m.load_vector_via_caster(v2) == 6
     with pytest.raises(TypeError) as excinfo:
-        cm.load_vector_via_binding(v2) == 6
-    assert msg(excinfo.value) == """
+        cm.load_vector_via_binding(v2)
+    assert (
+        msg(excinfo.value)
+        == """
     load_vector_via_binding(): incompatible function arguments. The following argument types are supported:
         1. (arg0: pybind11_cross_module_tests.VectorInt) -> int
     Invoked with: [1, 2, 3]
     """  # noqa: E501 line too long
+    )
 def test_cross_module_calls():
diff --git a/wrap/pybind11/tests/test_methods_and_attributes.cpp b/wrap/pybind11/tests/test_methods_and_attributes.cpp
index 11d4e7b350..9e55452dec 100644
--- a/wrap/pybind11/tests/test_methods_and_attributes.cpp
+++ b/wrap/pybind11/tests/test_methods_and_attributes.cpp
@@ -19,19 +19,21 @@ using overload_cast_ = pybind11::detail::overload_cast_impl<Args...>;
 class ExampleMandA {
     ExampleMandA() { print_default_created(this); }
-    ExampleMandA(int value) : value(value) { print_created(this, value); }
+    explicit ExampleMandA(int value) : value(value) { print_created(this, value); }
     ExampleMandA(const ExampleMandA &e) : value(e.value) { print_copy_created(this); }
-    ExampleMandA(std::string&&) {}
-    ExampleMandA(ExampleMandA &&e) : value(e.value) { print_move_created(this); }
+    explicit ExampleMandA(std::string &&) {}
+    ExampleMandA(ExampleMandA &&e) noexcept : value(e.value) { print_move_created(this); }
     ~ExampleMandA() { print_destroyed(this); }
-    std::string toString() {
-        return "ExampleMandA[value=" + std::to_string(value) + "]";
-    }
+    std::string toString() const { return "ExampleMandA[value=" + std::to_string(value) + "]"; }
     void operator=(const ExampleMandA &e) { print_copy_assigned(this); value = e.value; }
-    void operator=(ExampleMandA &&e) { print_move_assigned(this); value = e.value; }
+    void operator=(ExampleMandA &&e) noexcept {
+        print_move_assigned(this);
+        value = e.value;
+    }
+    // NOLINTNEXTLINE(performance-unnecessary-value-param)
     void add1(ExampleMandA other) { value += other.value; }         // passing by value
     void add2(ExampleMandA &other) { value += other.value; }        // passing by reference
     void add3(const ExampleMandA &other) { value += other.value; }  // passing by const reference
@@ -41,6 +43,7 @@ class ExampleMandA {
     void add6(int other) { value += other; }                        // passing by value
     void add7(int &other) { value += other; }                       // passing by reference
     void add8(const int &other) { value += other; }                 // passing by const reference
+    // NOLINTNEXTLINE(readability-non-const-parameter) Deliberately non-const for testing
     void add9(int *other) { value += *other; }                      // passing by pointer
     void add10(const int *other) { value += *other; }               // passing by const pointer
@@ -48,13 +51,13 @@ class ExampleMandA {
     ExampleMandA self1() { return *this; }                          // return by value
     ExampleMandA &self2() { return *this; }                         // return by reference
-    const ExampleMandA &self3() { return *this; }                   // return by const reference
+    const ExampleMandA &self3() const { return *this; }             // return by const reference
     ExampleMandA *self4() { return this; }                          // return by pointer
-    const ExampleMandA *self5() { return this; }                    // return by const pointer
+    const ExampleMandA *self5() const { return this; }              // return by const pointer
-    int internal1() { return value; }                               // return by value
+    int internal1() const { return value; }                         // return by value
     int &internal2() { return value; }                              // return by reference
-    const int &internal3() { return value; }                        // return by const reference
+    const int &internal3() const { return value; }                  // return by const reference
     int *internal4() { return &value; }                             // return by pointer
     const int *internal5() { return &value; }                       // return by const pointer
@@ -114,13 +117,21 @@ int none1(const NoneTester &obj) { return obj.answer; }
 int none2(NoneTester *obj) { return obj ? obj->answer : -1; }
 int none3(std::shared_ptr<NoneTester> &obj) { return obj ? obj->answer : -1; }
 int none4(std::shared_ptr<NoneTester> *obj) { return obj && *obj ? (*obj)->answer : -1; }
-int none5(std::shared_ptr<NoneTester> obj) { return obj ? obj->answer : -1; }
+int none5(const std::shared_ptr<NoneTester> &obj) { return obj ? obj->answer : -1; }
+// Issue #2778: implicit casting from None to object (not pointer)
+class NoneCastTester {
+    int answer = -1;
+    NoneCastTester() = default;
+    explicit NoneCastTester(int v) : answer(v) {}
 struct StrIssue {
     int val = -1;
     StrIssue() = default;
-    StrIssue(int i) : val{i} {}
+    explicit StrIssue(int i) : val{i} {}
 // Issues #854, #910: incompatible function args when member function/pointer is in unregistered base class
@@ -148,6 +159,14 @@ struct RefQualified {
     int constRefQualified(int other) const & { return value + other; }
+// Test rvalue ref param
+struct RValueRefParam {
+    std::size_t func1(std::string&& s) { return s.size(); }
+    std::size_t func2(std::string&& s) const { return s.size(); }
+    std::size_t func3(std::string&& s) & { return s.size(); }
+    std::size_t func4(std::string&& s) const & { return s.size(); }
 TEST_SUBMODULE(methods_and_attributes, m) {
     // test_methods_and_attributes
     py::class_<ExampleMandA> emna(m, "ExampleMandA");
@@ -207,12 +226,12 @@ TEST_SUBMODULE(methods_and_attributes, m) {
         // test_no_mixed_overloads
         // Raise error if trying to mix static/non-static overloads on the same name:
         .def_static("add_mixed_overloads1", []() {
-            auto emna = py::reinterpret_borrow<py::class_<ExampleMandA>>(py::module::import("pybind11_tests.methods_and_attributes").attr("ExampleMandA"));
+            auto emna = py::reinterpret_borrow<py::class_<ExampleMandA>>(py::module_::import("pybind11_tests.methods_and_attributes").attr("ExampleMandA"));
             emna.def       ("overload_mixed1", static_cast<py::str (ExampleMandA::*)(int, int)>(&ExampleMandA::overloaded))
                 .def_static("overload_mixed1", static_cast<py::str (              *)(float   )>(&ExampleMandA::overloaded));
         .def_static("add_mixed_overloads2", []() {
-            auto emna = py::reinterpret_borrow<py::class_<ExampleMandA>>(py::module::import("pybind11_tests.methods_and_attributes").attr("ExampleMandA"));
+            auto emna = py::reinterpret_borrow<py::class_<ExampleMandA>>(py::module_::import("pybind11_tests.methods_and_attributes").attr("ExampleMandA"));
             emna.def_static("overload_mixed2", static_cast<py::str (              *)(float   )>(&ExampleMandA::overloaded))
                 .def       ("overload_mixed2", static_cast<py::str (ExampleMandA::*)(int, int)>(&ExampleMandA::overloaded));
@@ -228,36 +247,41 @@ TEST_SUBMODULE(methods_and_attributes, m) {
         .def_readonly("def_readonly", &TestProperties::value)
         .def_readwrite("def_readwrite", &TestProperties::value)
-        .def_property("def_writeonly", nullptr,
-                      [](TestProperties& s,int v) { s.value = v; } )
+        .def_property("def_writeonly", nullptr, [](TestProperties &s, int v) { s.value = v; })
         .def_property("def_property_writeonly", nullptr, &TestProperties::set)
         .def_property_readonly("def_property_readonly", &TestProperties::get)
         .def_property("def_property", &TestProperties::get, &TestProperties::set)
         .def_property("def_property_impossible", nullptr, nullptr)
         .def_readonly_static("def_readonly_static", &TestProperties::static_value)
         .def_readwrite_static("def_readwrite_static", &TestProperties::static_value)
-        .def_property_static("def_writeonly_static", nullptr,
-                             [](py::object, int v) { TestProperties::static_value = v; })
-        .def_property_readonly_static("def_property_readonly_static",
-                                      [](py::object) { return TestProperties::static_get(); })
-        .def_property_static("def_property_writeonly_static", nullptr,
-                             [](py::object, int v) { return TestProperties::static_set(v); })
-        .def_property_static("def_property_static",
-                             [](py::object) { return TestProperties::static_get(); },
-                             [](py::object, int v) { TestProperties::static_set(v); })
-        .def_property_static("static_cls",
-                             [](py::object cls) { return cls; },
-                             [](py::object cls, py::function f) { f(cls); });
+        .def_property_static("def_writeonly_static",
+                             nullptr,
+                             [](const py::object &, int v) { TestProperties::static_value = v; })
+        .def_property_readonly_static(
+            "def_property_readonly_static",
+            [](const py::object &) { return TestProperties::static_get(); })
+        .def_property_static(
+            "def_property_writeonly_static",
+            nullptr,
+            [](const py::object &, int v) { return TestProperties::static_set(v); })
+        .def_property_static(
+            "def_property_static",
+            [](const py::object &) { return TestProperties::static_get(); },
+            [](const py::object &, int v) { TestProperties::static_set(v); })
+        .def_property_static(
+            "static_cls",
+            [](py::object cls) { return cls; },
+            [](const py::object &cls, const py::function &f) { f(cls); });
     py::class_<TestPropertiesOverride, TestProperties>(m, "TestPropertiesOverride")
         .def_readonly("def_readonly", &TestPropertiesOverride::value)
         .def_readonly_static("def_readonly_static", &TestPropertiesOverride::static_value);
-    auto static_get1 = [](py::object) -> const UserType & { return TestPropRVP::sv1; };
-    auto static_get2 = [](py::object) -> const UserType & { return TestPropRVP::sv2; };
-    auto static_set1 = [](py::object, int v) { TestPropRVP::sv1.set(v); };
-    auto static_set2 = [](py::object, int v) { TestPropRVP::sv2.set(v); };
+    auto static_get1 = [](const py::object &) -> const UserType & { return TestPropRVP::sv1; };
+    auto static_get2 = [](const py::object &) -> const UserType & { return TestPropRVP::sv2; };
+    auto static_set1 = [](const py::object &, int v) { TestPropRVP::sv1.set(v); };
+    auto static_set2 = [](const py::object &, int v) { TestPropRVP::sv2.set(v); };
     auto rvp_copy = py::return_value_policy::copy;
     // test_property_return_value_policies
@@ -268,21 +292,30 @@ TEST_SUBMODULE(methods_and_attributes, m) {
         .def_property_readonly("ro_func", py::cpp_function(&TestPropRVP::get2, rvp_copy))
         .def_property("rw_ref", &TestPropRVP::get1, &TestPropRVP::set1)
         .def_property("rw_copy", &TestPropRVP::get2, &TestPropRVP::set2, rvp_copy)
-        .def_property("rw_func", py::cpp_function(&TestPropRVP::get2, rvp_copy), &TestPropRVP::set2)
+        .def_property(
+            "rw_func", py::cpp_function(&TestPropRVP::get2, rvp_copy), &TestPropRVP::set2)
         .def_property_readonly_static("static_ro_ref", static_get1)
         .def_property_readonly_static("static_ro_copy", static_get2, rvp_copy)
         .def_property_readonly_static("static_ro_func", py::cpp_function(static_get2, rvp_copy))
         .def_property_static("static_rw_ref", static_get1, static_set1)
         .def_property_static("static_rw_copy", static_get2, static_set2, rvp_copy)
-        .def_property_static("static_rw_func", py::cpp_function(static_get2, rvp_copy), static_set2)
+        .def_property_static(
+            "static_rw_func", py::cpp_function(static_get2, rvp_copy), static_set2)
         // test_property_rvalue_policy
         .def_property_readonly("rvalue", &TestPropRVP::get_rvalue)
-        .def_property_readonly_static("static_rvalue", [](py::object) { return UserType(1); });
+        .def_property_readonly_static("static_rvalue",
+                                      [](const py::object &) { return UserType(1); });
     // test_metaclass_override
     struct MetaclassOverride { };
     py::class_<MetaclassOverride>(m, "MetaclassOverride", py::metaclass((PyObject *) &PyType_Type))
-        .def_property_readonly_static("readonly", [](py::object) { return 1; });
+        .def_property_readonly_static("readonly", [](const py::object &) { return 1; });
+    // test_overload_ordering
+    m.def("overload_order", [](const std::string &) { return 1; });
+    m.def("overload_order", [](const std::string &) { return 2; });
+    m.def("overload_order", [](int) { return 3; });
+    m.def("overload_order", [](int) { return 4; }, py::prepend{});
 #if !defined(PYPY_VERSION)
     // test_dynamic_attributes
@@ -308,28 +341,43 @@ TEST_SUBMODULE(methods_and_attributes, m) {
     m.attr("debug_enabled") = false;
     m.def("bad_arg_def_named", []{
-        auto m = py::module::import("pybind11_tests");
+        auto m = py::module_::import("pybind11_tests");
         m.def("should_fail", [](int, UnregisteredType) {}, py::arg(), py::arg("a") = UnregisteredType());
     m.def("bad_arg_def_unnamed", []{
-        auto m = py::module::import("pybind11_tests");
+        auto m = py::module_::import("pybind11_tests");
         m.def("should_fail", [](int, UnregisteredType) {}, py::arg(), py::arg() = UnregisteredType());
+    // [workaround(intel)] ICC 20/21 breaks with py::arg().stuff, using py::arg{}.stuff works.
     // test_accepts_none
     py::class_<NoneTester, std::shared_ptr<NoneTester>>(m, "NoneTester")
-    m.def("no_none1", &none1, py::arg().none(false));
-    m.def("no_none2", &none2, py::arg().none(false));
-    m.def("no_none3", &none3, py::arg().none(false));
-    m.def("no_none4", &none4, py::arg().none(false));
-    m.def("no_none5", &none5, py::arg().none(false));
+    m.def("no_none1", &none1, py::arg{}.none(false));
+    m.def("no_none2", &none2, py::arg{}.none(false));
+    m.def("no_none3", &none3, py::arg{}.none(false));
+    m.def("no_none4", &none4, py::arg{}.none(false));
+    m.def("no_none5", &none5, py::arg{}.none(false));
     m.def("ok_none1", &none1);
-    m.def("ok_none2", &none2, py::arg().none(true));
+    m.def("ok_none2", &none2, py::arg{}.none(true));
     m.def("ok_none3", &none3);
-    m.def("ok_none4", &none4, py::arg().none(true));
+    m.def("ok_none4", &none4, py::arg{}.none(true));
     m.def("ok_none5", &none5);
+    m.def("no_none_kwarg", &none2, "a"_a.none(false));
+    m.def("no_none_kwarg_kw_only", &none2, py::kw_only(), "a"_a.none(false));
+    // test_casts_none
+    // Issue #2778: implicit casting from None to object (not pointer)
+    py::class_<NoneCastTester>(m, "NoneCastTester")
+          .def(py::init<>())
+          .def(py::init<int>())
+          .def(py::init([](py::none const&) { return NoneCastTester{}; }));
+    py::implicitly_convertible<py::none, NoneCastTester>();
+    m.def("ok_obj_or_none", [](NoneCastTester const& foo) { return foo.answer; });
     // test_str_issue
     // Issue #283: __str__ called on uninitialized instance when constructor arguments invalid
     py::class_<StrIssue>(m, "StrIssue")
@@ -351,14 +399,14 @@ TEST_SUBMODULE(methods_and_attributes, m) {
         .def("increase_value", &RegisteredDerived::increase_value)
         .def_readwrite("rw_value", &RegisteredDerived::rw_value)
         .def_readonly("ro_value", &RegisteredDerived::ro_value)
-        // These should trigger a static_assert if uncommented
-        //.def_readwrite("fails", &UserType::value) // should trigger a static_assert if uncommented
-        //.def_readonly("fails", &UserType::value) // should trigger a static_assert if uncommented
+        // Uncommenting the next line should trigger a static_assert:
+        // .def_readwrite("fails", &UserType::value)
+        // Uncommenting the next line should trigger a static_assert:
+        // .def_readonly("fails", &UserType::value)
         .def_property("rw_value_prop", &RegisteredDerived::get_int, &RegisteredDerived::set_int)
         .def_property_readonly("ro_value_prop", &RegisteredDerived::get_double)
         // This one is in the registered class:
-        .def("sum", &RegisteredDerived::sum)
-        ;
+        .def("sum", &RegisteredDerived::sum);
     using Adapted = decltype(py::method_adaptor<RegisteredDerived>(&RegisteredDerived::do_nothing));
     static_assert(std::is_same<Adapted, void (RegisteredDerived::*)() const>::value, "");
@@ -369,4 +417,11 @@ TEST_SUBMODULE(methods_and_attributes, m) {
         .def_readonly("value", &RefQualified::value)
         .def("refQualified", &RefQualified::refQualified)
         .def("constRefQualified", &RefQualified::constRefQualified);
+    py::class_<RValueRefParam>(m, "RValueRefParam")
+        .def(py::init<>())
+        .def("func1", &RValueRefParam::func1)
+        .def("func2", &RValueRefParam::func2)
+        .def("func3", &RValueRefParam::func3)
+        .def("func4", &RValueRefParam::func4);
diff --git a/wrap/pybind11/tests/test_methods_and_attributes.py b/wrap/pybind11/tests/test_methods_and_attributes.py
index c296b6868d..fa026f9edd 100644
--- a/wrap/pybind11/tests/test_methods_and_attributes.py
+++ b/wrap/pybind11/tests/test_methods_and_attributes.py
@@ -2,9 +2,8 @@
 import pytest
 import env  # noqa: F401
-from pybind11_tests import methods_and_attributes as m
 from pybind11_tests import ConstructorStats
+from pybind11_tests import methods_and_attributes as m
 def test_methods_and_attributes():
@@ -40,17 +39,17 @@ def test_methods_and_attributes():
     assert instance1.overloaded(0) == "(int)"
     assert instance1.overloaded(1, 1.0) == "(int, float)"
     assert instance1.overloaded(2.0, 2) == "(float, int)"
-    assert instance1.overloaded(3,   3) == "(int, int)"
-    assert instance1.overloaded(4., 4.) == "(float, float)"
+    assert instance1.overloaded(3, 3) == "(int, int)"
+    assert instance1.overloaded(4.0, 4.0) == "(float, float)"
     assert instance1.overloaded_const(-3) == "(int) const"
     assert instance1.overloaded_const(5, 5.0) == "(int, float) const"
     assert instance1.overloaded_const(6.0, 6) == "(float, int) const"
-    assert instance1.overloaded_const(7,   7) == "(int, int) const"
-    assert instance1.overloaded_const(8., 8.) == "(float, float) const"
+    assert instance1.overloaded_const(7, 7) == "(int, int) const"
+    assert instance1.overloaded_const(8.0, 8.0) == "(float, float) const"
     assert instance1.overloaded_float(1, 1) == "(float, float)"
-    assert instance1.overloaded_float(1, 1.) == "(float, float)"
-    assert instance1.overloaded_float(1., 1) == "(float, float)"
-    assert instance1.overloaded_float(1., 1.) == "(float, float)"
+    assert instance1.overloaded_float(1, 1.0) == "(float, float)"
+    assert instance1.overloaded_float(1.0, 1) == "(float, float)"
+    assert instance1.overloaded_float(1.0, 1.0) == "(float, float)"
     assert instance1.value == 320
     instance1.value = 100
@@ -103,7 +102,7 @@ def test_properties():
     assert instance.def_property == 3
     with pytest.raises(AttributeError) as excinfo:
-        dummy = instance.def_property_writeonly  # noqa: F841 unused var
+        dummy = instance.def_property_writeonly  # unused var
     assert "unreadable attribute" in str(excinfo.value)
     instance.def_property_writeonly = 4
@@ -128,7 +127,7 @@ def test_static_properties():
     assert m.TestProperties.def_readwrite_static == 2
     with pytest.raises(AttributeError) as excinfo:
-        dummy = m.TestProperties.def_writeonly_static  # noqa: F841 unused var
+        dummy = m.TestProperties.def_writeonly_static  # unused var
     assert "unreadable attribute" in str(excinfo.value)
     m.TestProperties.def_writeonly_static = 3
@@ -171,6 +170,19 @@ def test_static_properties():
     assert m.TestPropertiesOverride().def_readonly == 99
     assert m.TestPropertiesOverride.def_readonly_static == 99
+    # Only static attributes can be deleted
+    del m.TestPropertiesOverride.def_readonly_static
+    assert (
+        hasattr(m.TestPropertiesOverride, "def_readonly_static")
+        and m.TestPropertiesOverride.def_readonly_static
+        is m.TestProperties.def_readonly_static
+    )
+    assert "def_readonly_static" not in m.TestPropertiesOverride.__dict__
+    properties_override = m.TestPropertiesOverride()
+    with pytest.raises(AttributeError) as excinfo:
+        del properties_override.def_readonly
+    assert "can't delete attribute" in str(excinfo.value)
 def test_static_cls():
     """Static property getter and setters expect the type object as the their only argument"""
@@ -193,7 +205,10 @@ def test_metaclass_override():
     assert type(m.MetaclassOverride).__name__ == "type"
     assert m.MetaclassOverride.readonly == 1
-    assert type(m.MetaclassOverride.__dict__["readonly"]).__name__ == "pybind11_static_property"
+    assert (
+        type(m.MetaclassOverride.__dict__["readonly"]).__name__
+        == "pybind11_static_property"
+    )
     # Regular `type` replaces the property instead of calling `__set__()`
     m.MetaclassOverride.readonly = 2
@@ -206,22 +221,26 @@ def test_no_mixed_overloads():
     with pytest.raises(RuntimeError) as excinfo:
-    assert (str(excinfo.value) ==
-            "overloading a method with both static and instance methods is not supported; " +
-            ("compile in debug mode for more details" if not debug_enabled else
-             "error while attempting to bind static method ExampleMandA.overload_mixed1"
-             "(arg0: float) -> str")
-            )
+    assert str(
+        excinfo.value
+    ) == "overloading a method with both static and instance methods is not supported; " + (
+        "compile in debug mode for more details"
+        if not debug_enabled
+        else "error while attempting to bind static method ExampleMandA.overload_mixed1"
+        "(arg0: float) -> str"
+    )
     with pytest.raises(RuntimeError) as excinfo:
-    assert (str(excinfo.value) ==
-            "overloading a method with both static and instance methods is not supported; " +
-            ("compile in debug mode for more details" if not debug_enabled else
-             "error while attempting to bind instance method ExampleMandA.overload_mixed2"
-             "(self: pybind11_tests.methods_and_attributes.ExampleMandA, arg0: int, arg1: int)"
-             " -> str")
-            )
+    assert str(
+        excinfo.value
+    ) == "overloading a method with both static and instance methods is not supported; " + (
+        "compile in debug mode for more details"
+        if not debug_enabled
+        else "error while attempting to bind instance method ExampleMandA.overload_mixed2"
+        "(self: pybind11_tests.methods_and_attributes.ExampleMandA, arg0: int, arg1: int)"
+        " -> str"
+    )
 @pytest.mark.parametrize("access", ["ro", "rw", "static_ro", "static_rw"])
@@ -333,8 +352,8 @@ def test_bad_arg_default(msg):
     assert msg(excinfo.value) == (
         "arg(): could not convert default argument 'a: UnregisteredType' in function "
         "'should_fail' into a Python object (type not registered yet?)"
-        if debug_enabled else
-        "arg(): could not convert default argument into a Python object (type not registered "
+        if debug_enabled
+        else "arg(): could not convert default argument into a Python object (type not registered "
         "yet?). Compile in debug mode for more information."
@@ -343,8 +362,8 @@ def test_bad_arg_default(msg):
     assert msg(excinfo.value) == (
         "arg(): could not convert default argument 'UnregisteredType' in function "
         "'should_fail' into a Python object (type not registered yet?)"
-        if debug_enabled else
-        "arg(): could not convert default argument into a Python object (type not registered "
+        if debug_enabled
+        else "arg(): could not convert default argument into a Python object (type not registered "
         "yet?). Compile in debug mode for more information."
@@ -381,12 +400,15 @@ def test_accepts_none(msg):
     # The first one still raises because you can't pass None as a lvalue reference arg:
     with pytest.raises(TypeError) as excinfo:
         assert m.ok_none1(None) == -1
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         ok_none1(): incompatible function arguments. The following argument types are supported:
             1. (arg0: m.methods_and_attributes.NoneTester) -> int
         Invoked with: None
+    )
     # The rest take the argument as pointer or holder, and accept None:
     assert m.ok_none2(None) == -1
@@ -394,6 +416,30 @@ def test_accepts_none(msg):
     assert m.ok_none4(None) == -1
     assert m.ok_none5(None) == -1
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none_kwarg(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none_kwarg(a=None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none_kwarg_kw_only(None)
+    assert "incompatible function arguments" in str(excinfo.value)
+    with pytest.raises(TypeError) as excinfo:
+        m.no_none_kwarg_kw_only(a=None)
+    assert "incompatible function arguments" in str(excinfo.value)
+def test_casts_none():
+    """#2778: implicit casting from None to object (not pointer)"""
+    a = m.NoneCastTester()
+    assert m.ok_obj_or_none(a) == -1
+    a = m.NoneCastTester(4)
+    assert m.ok_obj_or_none(a) == 4
+    a = m.NoneCastTester(None)
+    assert m.ok_obj_or_none(a) == -1
+    assert m.ok_obj_or_none(None) == -1
 def test_str_issue(msg):
     """#283: __str__ called on uninitialized instance when constructor arguments invalid"""
@@ -402,13 +448,16 @@ def test_str_issue(msg):
     with pytest.raises(TypeError) as excinfo:
         str(m.StrIssue("no", "such", "constructor"))
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         __init__(): incompatible constructor arguments. The following argument types are supported:
             1. m.methods_and_attributes.StrIssue(arg0: int)
             2. m.methods_and_attributes.StrIssue()
         Invoked with: 'no', 'such', 'constructor'
+    )
 def test_unregistered_base_implementations():
@@ -438,3 +487,39 @@ def test_ref_qualified():
     assert r.value == 17
     assert r.constRefQualified(23) == 40
+def test_overload_ordering():
+    "Check to see if the normal overload order (first defined) and prepend overload order works"
+    assert m.overload_order("string") == 1
+    assert m.overload_order(0) == 4
+    # Different for Python 2 vs. 3
+    uni_name = type(u"").__name__
+    assert "1. overload_order(arg0: int) -> int" in m.overload_order.__doc__
+    assert (
+        "2. overload_order(arg0: {}) -> int".format(uni_name)
+        in m.overload_order.__doc__
+    )
+    assert (
+        "3. overload_order(arg0: {}) -> int".format(uni_name)
+        in m.overload_order.__doc__
+    )
+    assert "4. overload_order(arg0: int) -> int" in m.overload_order.__doc__
+    with pytest.raises(TypeError) as err:
+        m.overload_order(1.1)
+    assert "1. (arg0: int) -> int" in str(err.value)
+    assert "2. (arg0: {}) -> int".format(uni_name) in str(err.value)
+    assert "3. (arg0: {}) -> int".format(uni_name) in str(err.value)
+    assert "4. (arg0: int) -> int" in str(err.value)
+def test_rvalue_ref_param():
+    r = m.RValueRefParam()
+    assert r.func1("123") == 3
+    assert r.func2("1234") == 4
+    assert r.func3("12345") == 5
+    assert r.func4("123456") == 6
diff --git a/wrap/pybind11/tests/test_modules.cpp b/wrap/pybind11/tests/test_modules.cpp
index c1475fa623..ce61c1a25c 100644
--- a/wrap/pybind11/tests/test_modules.cpp
+++ b/wrap/pybind11/tests/test_modules.cpp
@@ -13,17 +13,19 @@
 TEST_SUBMODULE(modules, m) {
     // test_nested_modules
+    // This is intentionally "py::module" to verify it still can be used in place of "py::module_"
     py::module m_sub = m.def_submodule("subsubmodule");
     m_sub.def("submodule_func", []() { return "submodule_func()"; });
     // test_reference_internal
     class A {
-        A(int v) : v(v) { print_created(this, v); }
+        explicit A(int v) : v(v) { print_created(this, v); }
         ~A() { print_destroyed(this); }
         A(const A&) { print_copy_created(this); }
         A& operator=(const A &copy) { print_copy_assigned(this); v = copy.v; return *this; }
-        std::string toString() { return "A[" + std::to_string(v) + "]"; }
+        std::string toString() const { return "A[" + std::to_string(v) + "]"; }
         int v;
@@ -50,6 +52,7 @@ TEST_SUBMODULE(modules, m) {
         .def_readwrite("a1", &B::a1)  // def_readonly uses an internal reference return policy by default
         .def_readwrite("a2", &B::a2);
+    // This is intentionally "py::module" to verify it still can be used in place of "py::module_"
     m.attr("OD") = py::module::import("collections").attr("OrderedDict");
     // test_duplicate_registration
@@ -60,7 +63,8 @@ TEST_SUBMODULE(modules, m) {
         class Dupe3 { };
         class DupeException { };
-        auto dm = py::module("dummy");
+        // Go ahead and leak, until we have a non-leaking py::module_ constructor
+        auto dm = py::module_::create_extension_module("dummy", nullptr, new py::module_::module_def);
         auto failures = py::list();
         py::class_<Dupe1>(dm, "Dupe1");
diff --git a/wrap/pybind11/tests/test_modules.py b/wrap/pybind11/tests/test_modules.py
index 7e21005245..49e1ea5e30 100644
--- a/wrap/pybind11/tests/test_modules.py
+++ b/wrap/pybind11/tests/test_modules.py
@@ -1,14 +1,18 @@
 # -*- coding: utf-8 -*-
+from pybind11_tests import ConstructorStats
 from pybind11_tests import modules as m
 from pybind11_tests.modules import subsubmodule as ms
-from pybind11_tests import ConstructorStats
 def test_nested_modules():
     import pybind11_tests
     assert pybind11_tests.__name__ == "pybind11_tests"
     assert pybind11_tests.modules.__name__ == "pybind11_tests.modules"
-    assert pybind11_tests.modules.subsubmodule.__name__ == "pybind11_tests.modules.subsubmodule"
+    assert (
+        pybind11_tests.modules.subsubmodule.__name__
+        == "pybind11_tests.modules.subsubmodule"
+    )
     assert m.__name__ == "pybind11_tests.modules"
     assert ms.__name__ == "pybind11_tests.modules.subsubmodule"
@@ -35,7 +39,7 @@ def test_reference_internal():
     del b
     assert astats.alive() == 0
     assert bstats.alive() == 0
-    assert astats.values() == ['1', '2', '42', '43']
+    assert astats.values() == ["1", "2", "42", "43"]
     assert bstats.values() == []
     assert astats.default_constructions == 0
     assert bstats.default_constructions == 1
@@ -50,18 +54,20 @@ def test_reference_internal():
 def test_importing():
-    from pybind11_tests.modules import OD
     from collections import OrderedDict
+    from pybind11_tests.modules import OD
     assert OD is OrderedDict
-    assert str(OD([(1, 'a'), (2, 'b')])) == "OrderedDict([(1, 'a'), (2, 'b')])"
+    assert str(OD([(1, "a"), (2, "b")])) == "OrderedDict([(1, 'a'), (2, 'b')])"
 def test_pydoc():
     """Pydoc needs to be able to provide help() for everything inside a pybind11 module"""
-    import pybind11_tests
     import pydoc
+    import pybind11_tests
     assert pybind11_tests.__name__ == "pybind11_tests"
     assert pybind11_tests.__doc__ == "pybind11 test module"
     assert pydoc.text.docmodule(pybind11_tests)
@@ -71,3 +77,16 @@ def test_duplicate_registration():
     """Registering two things with the same name"""
     assert m.duplicate_registration() == []
+def test_builtin_key_type():
+    """Test that all the keys in the builtin modules have type str.
+    Previous versions of pybind11 would add a unicode key in python 2.
+    """
+    if hasattr(__builtins__, "keys"):
+        keys = __builtins__.keys()
+    else:  # this is to make pypy happy since builtins is different there.
+        keys = __builtins__.__dict__.keys()
+    assert {type(k) for k in keys} == {str}
diff --git a/wrap/pybind11/tests/test_multiple_inheritance.cpp b/wrap/pybind11/tests/test_multiple_inheritance.cpp
index 70e3417854..4689df4e46 100644
--- a/wrap/pybind11/tests/test_multiple_inheritance.cpp
+++ b/wrap/pybind11/tests/test_multiple_inheritance.cpp
@@ -11,10 +11,12 @@
 #include "pybind11_tests.h"
 #include "constructor_stats.h"
+namespace {
 // Many bases for testing that multiple inheritance from many classes (i.e. requiring extra
 // space for holder constructed flags) works.
 template <int N> struct BaseN {
-    BaseN(int i) : i(i) { }
+    explicit BaseN(int i) : i(i) {}
     int i;
@@ -43,13 +45,40 @@ int WithStatic2::static_value2 = 2;
 int VanillaStaticMix1::static_value = 12;
 int VanillaStaticMix2::static_value = 12;
+// test_multiple_inheritance_virtbase
+struct Base1a {
+    explicit Base1a(int i) : i(i) {}
+    int foo() const { return i; }
+    int i;
+struct Base2a {
+    explicit Base2a(int i) : i(i) {}
+    int bar() const { return i; }
+    int i;
+struct Base12a : Base1a, Base2a {
+    Base12a(int i, int j) : Base1a(i), Base2a(j) { }
+// test_mi_unaligned_base
+// test_mi_base_return
+struct I801B1 { int a = 1; I801B1() = default; I801B1(const I801B1 &) = default; virtual ~I801B1() = default; };
+struct I801B2 { int b = 2; I801B2() = default; I801B2(const I801B2 &) = default; virtual ~I801B2() = default; };
+struct I801C : I801B1, I801B2 {};
+struct I801D : I801C {}; // Indirect MI
+} // namespace
 TEST_SUBMODULE(multiple_inheritance, m) {
+    // Please do not interleave `struct` and `class` definitions with bindings code,
+    // but implement `struct`s and `class`es in the anonymous namespace above.
+    // This helps keeping the smart_holder branch in sync with master.
     // test_multiple_inheritance_mix1
     // test_multiple_inheritance_mix2
     struct Base1 {
-        Base1(int i) : i(i) { }
-        int foo() { return i; }
+        explicit Base1(int i) : i(i) {}
+        int foo() const { return i; }
         int i;
     py::class_<Base1> b1(m, "Base1");
@@ -57,8 +86,8 @@ TEST_SUBMODULE(multiple_inheritance, m) {
       .def("foo", &Base1::foo);
     struct Base2 {
-        Base2(int i) : i(i) { }
-        int bar() { return i; }
+        explicit Base2(int i) : i(i) {}
+        int bar() const { return i; }
         int i;
     py::class_<Base2> b2(m, "Base2");
@@ -79,7 +108,10 @@ TEST_SUBMODULE(multiple_inheritance, m) {
     // test_multiple_inheritance_python_many_bases
-    #define PYBIND11_BASEN(N) py::class_<BaseN<N>>(m, "BaseN" #N).def(py::init<int>()).def("f" #N, [](BaseN<N> &b) { return b.i + N; })
+#define PYBIND11_BASEN(N)                                                                         \
+    py::class_<BaseN<(N)>>(m, "BaseN" #N).def(py::init<int>()).def("f" #N, [](BaseN<N> &b) {      \
+        return b.i + (N);                                                                         \
+    })
@@ -99,41 +131,24 @@ TEST_SUBMODULE(multiple_inheritance, m) {
     // test_multiple_inheritance_virtbase
     // Test the case where not all base classes are specified, and where pybind11 requires the
     // py::multiple_inheritance flag to perform proper casting between types.
-    struct Base1a {
-        Base1a(int i) : i(i) { }
-        int foo() { return i; }
-        int i;
-    };
     py::class_<Base1a, std::shared_ptr<Base1a>>(m, "Base1a")
         .def("foo", &Base1a::foo);
-    struct Base2a {
-        Base2a(int i) : i(i) { }
-        int bar() { return i; }
-        int i;
-    };
     py::class_<Base2a, std::shared_ptr<Base2a>>(m, "Base2a")
         .def("bar", &Base2a::bar);
-    struct Base12a : Base1a, Base2a {
-        Base12a(int i, int j) : Base1a(i), Base2a(j) { }
-    };
     py::class_<Base12a, /* Base1 missing */ Base2a,
                std::shared_ptr<Base12a>>(m, "Base12a", py::multiple_inheritance())
         .def(py::init<int, int>());
     m.def("bar_base2a", [](Base2a *b) { return b->bar(); });
-    m.def("bar_base2a_sharedptr", [](std::shared_ptr<Base2a> b) { return b->bar(); });
+    m.def("bar_base2a_sharedptr", [](const std::shared_ptr<Base2a> &b) { return b->bar(); });
     // test_mi_unaligned_base
     // test_mi_base_return
     // Issue #801: invalid casting to derived type with MI bases
-    struct I801B1 { int a = 1; I801B1() = default; I801B1(const I801B1 &) = default; virtual ~I801B1() = default; };
-    struct I801B2 { int b = 2; I801B2() = default; I801B2(const I801B2 &) = default; virtual ~I801B2() = default; };
-    struct I801C : I801B1, I801B2 {};
-    struct I801D : I801C {}; // Indirect MI
     // Unregistered classes:
     struct I801B3 { int c = 3; virtual ~I801B3() = default; };
     struct I801E : I801B3, I801D {};
@@ -193,14 +208,12 @@ TEST_SUBMODULE(multiple_inheritance, m) {
         .def_readwrite_static("static_value", &VanillaStaticMix2::static_value);
-#if !(defined(PYPY_VERSION) && (PYPY_VERSION_NUM < 0x06000000))
     struct WithDict { };
     struct VanillaDictMix1 : Vanilla, WithDict { };
     struct VanillaDictMix2 : WithDict, Vanilla { };
     py::class_<WithDict>(m, "WithDict", py::dynamic_attr()).def(py::init<>());
     py::class_<VanillaDictMix1, Vanilla, WithDict>(m, "VanillaDictMix1").def(py::init<>());
     py::class_<VanillaDictMix2, WithDict, Vanilla>(m, "VanillaDictMix2").def(py::init<>());
     // test_diamond_inheritance
     // Issue #959: segfault when constructing diamond inheritance instance
@@ -217,4 +230,87 @@ TEST_SUBMODULE(multiple_inheritance, m) {
         .def("c1", [](C1 *self) { return self; });
     py::class_<D, C0, C1>(m, "D")
+    // test_pr3635_diamond_*
+    // - functions are get_{base}_{var}, return {var}
+    struct MVB {
+        MVB() = default;
+        MVB(const MVB &) = default;
+        virtual ~MVB() = default;
+        int b = 1;
+        int get_b_b() const { return b; }
+    };
+    struct MVC : virtual MVB {
+        int c = 2;
+        int get_c_b() const { return b; }
+        int get_c_c() const { return c; }
+    };
+    struct MVD0 : virtual MVC {
+        int d0 = 3;
+        int get_d0_b() const { return b; }
+        int get_d0_c() const { return c; }
+        int get_d0_d0() const { return d0; }
+    };
+    struct MVD1 : virtual MVC {
+        int d1 = 4;
+        int get_d1_b() const { return b; }
+        int get_d1_c() const { return c; }
+        int get_d1_d1() const { return d1; }
+    };
+    struct MVE : virtual MVD0, virtual MVD1 {
+        int e = 5;
+        int get_e_b() const { return b; }
+        int get_e_c() const { return c; }
+        int get_e_d0() const { return d0; }
+        int get_e_d1() const { return d1; }
+        int get_e_e() const { return e; }
+    };
+    struct MVF : virtual MVE {
+        int f = 6;
+        int get_f_b() const { return b; }
+        int get_f_c() const { return c; }
+        int get_f_d0() const { return d0; }
+        int get_f_d1() const { return d1; }
+        int get_f_e() const { return e; }
+        int get_f_f() const { return f; }
+    };
+    py::class_<MVB>(m, "MVB")
+        .def(py::init<>())
+        .def("get_b_b", &MVB::get_b_b)
+        .def_readwrite("b", &MVB::b);
+    py::class_<MVC, MVB>(m, "MVC")
+        .def(py::init<>())
+        .def("get_c_b", &MVC::get_c_b)
+        .def("get_c_c", &MVC::get_c_c)
+        .def_readwrite("c", &MVC::c);
+    py::class_<MVD0, MVC>(m, "MVD0")
+        .def(py::init<>())
+        .def("get_d0_b", &MVD0::get_d0_b)
+        .def("get_d0_c", &MVD0::get_d0_c)
+        .def("get_d0_d0", &MVD0::get_d0_d0)
+        .def_readwrite("d0", &MVD0::d0);
+    py::class_<MVD1, MVC>(m, "MVD1")
+        .def(py::init<>())
+        .def("get_d1_b", &MVD1::get_d1_b)
+        .def("get_d1_c", &MVD1::get_d1_c)
+        .def("get_d1_d1", &MVD1::get_d1_d1)
+        .def_readwrite("d1", &MVD1::d1);
+    py::class_<MVE, MVD0, MVD1>(m, "MVE")
+        .def(py::init<>())
+        .def("get_e_b", &MVE::get_e_b)
+        .def("get_e_c", &MVE::get_e_c)
+        .def("get_e_d0", &MVE::get_e_d0)
+        .def("get_e_d1", &MVE::get_e_d1)
+        .def("get_e_e", &MVE::get_e_e)
+        .def_readwrite("e", &MVE::e);
+    py::class_<MVF, MVE>(m, "MVF")
+        .def(py::init<>())
+        .def("get_f_b", &MVF::get_f_b)
+        .def("get_f_c", &MVF::get_f_c)
+        .def("get_f_d0", &MVF::get_f_d0)
+        .def("get_f_d1", &MVF::get_f_d1)
+        .def("get_f_e", &MVF::get_f_e)
+        .def("get_f_f", &MVF::get_f_f)
+        .def_readwrite("f", &MVF::f);
diff --git a/wrap/pybind11/tests/test_multiple_inheritance.py b/wrap/pybind11/tests/test_multiple_inheritance.py
index 7a0259d214..abdf25d608 100644
--- a/wrap/pybind11/tests/test_multiple_inheritance.py
+++ b/wrap/pybind11/tests/test_multiple_inheritance.py
@@ -2,7 +2,6 @@
 import pytest
 import env  # noqa: F401
 from pybind11_tests import ConstructorStats
 from pybind11_tests import multiple_inheritance as m
@@ -57,7 +56,6 @@ def __init__(self, i, j):
 @pytest.mark.skipif("env.PYPY and env.PY2")
 @pytest.mark.xfail("env.PYPY and not env.PY2")
 def test_multiple_inheritance_python():
     class MI1(m.Base1, m.Base2):
         def __init__(self, i, j):
             m.Base1.__init__(self, i)
@@ -163,7 +161,6 @@ def __init__(self, i):
 def test_multiple_inheritance_python_many_bases():
     class MIMany14(m.BaseN1, m.BaseN2, m.BaseN3, m.BaseN4):
         def __init__(self):
             m.BaseN1.__init__(self, 1)
@@ -178,8 +175,16 @@ def __init__(self):
             m.BaseN7.__init__(self, 7)
             m.BaseN8.__init__(self, 8)
-    class MIMany916(m.BaseN9, m.BaseN10, m.BaseN11, m.BaseN12, m.BaseN13, m.BaseN14, m.BaseN15,
-                    m.BaseN16):
+    class MIMany916(
+        m.BaseN9,
+        m.BaseN10,
+        m.BaseN11,
+        m.BaseN12,
+        m.BaseN13,
+        m.BaseN14,
+        m.BaseN15,
+        m.BaseN16,
+    ):
         def __init__(self):
             m.BaseN9.__init__(self, 9)
             m.BaseN10.__init__(self, 10)
@@ -225,7 +230,6 @@ def __init__(self):
 def test_multiple_inheritance_virtbase():
     class MITypePy(m.Base12a):
         def __init__(self, i, j):
             m.Base12a.__init__(self, i, j)
@@ -238,7 +242,7 @@ def __init__(self, i, j):
 def test_mi_static_properties():
     """Mixing bases with and without static properties should be possible
-     and the result should be independent of base definition order"""
+    and the result should be independent of base definition order"""
     for d in (m.VanillaStaticMix1(), m.VanillaStaticMix2()):
         assert d.vanilla() == "Vanilla"
@@ -354,3 +358,139 @@ def test_diamond_inheritance():
     assert d is d.c0().b()
     assert d is d.c1().b()
     assert d is d.c0().c1().b().c0().b()
+def test_pr3635_diamond_b():
+    o = m.MVB()
+    assert o.b == 1
+    assert o.get_b_b() == 1
+def test_pr3635_diamond_c():
+    o = m.MVC()
+    assert o.b == 1
+    assert o.c == 2
+    assert o.get_b_b() == 1
+    assert o.get_c_b() == 1
+    assert o.get_c_c() == 2
+def test_pr3635_diamond_d0():
+    o = m.MVD0()
+    assert o.b == 1
+    assert o.c == 2
+    assert o.d0 == 3
+    assert o.get_b_b() == 1
+    assert o.get_c_b() == 1
+    assert o.get_d0_b() == 1
+    assert o.get_c_c() == 2
+    assert o.get_d0_c() == 2
+    assert o.get_d0_d0() == 3
+def test_pr3635_diamond_d1():
+    o = m.MVD1()
+    assert o.b == 1
+    assert o.c == 2
+    assert o.d1 == 4
+    assert o.get_b_b() == 1
+    assert o.get_c_b() == 1
+    assert o.get_d1_b() == 1
+    assert o.get_c_c() == 2
+    assert o.get_d1_c() == 2
+    assert o.get_d1_d1() == 4
+def test_pr3635_diamond_e():
+    o = m.MVE()
+    assert o.b == 1
+    assert o.c == 2
+    assert o.d0 == 3
+    assert o.d1 == 4
+    assert o.e == 5
+    assert o.get_b_b() == 1
+    assert o.get_c_b() == 1
+    assert o.get_d0_b() == 1
+    assert o.get_d1_b() == 1
+    assert o.get_e_b() == 1
+    assert o.get_c_c() == 2
+    assert o.get_d0_c() == 2
+    assert o.get_d1_c() == 2
+    assert o.get_e_c() == 2
+    assert o.get_d0_d0() == 3
+    assert o.get_e_d0() == 3
+    assert o.get_d1_d1() == 4
+    assert o.get_e_d1() == 4
+    assert o.get_e_e() == 5
+def test_pr3635_diamond_f():
+    o = m.MVF()
+    assert o.b == 1
+    assert o.c == 2
+    assert o.d0 == 3
+    assert o.d1 == 4
+    assert o.e == 5
+    assert o.f == 6
+    assert o.get_b_b() == 1
+    assert o.get_c_b() == 1
+    assert o.get_d0_b() == 1
+    assert o.get_d1_b() == 1
+    assert o.get_e_b() == 1
+    assert o.get_f_b() == 1
+    assert o.get_c_c() == 2
+    assert o.get_d0_c() == 2
+    assert o.get_d1_c() == 2
+    assert o.get_e_c() == 2
+    assert o.get_f_c() == 2
+    assert o.get_d0_d0() == 3
+    assert o.get_e_d0() == 3
+    assert o.get_f_d0() == 3
+    assert o.get_d1_d1() == 4
+    assert o.get_e_d1() == 4
+    assert o.get_f_d1() == 4
+    assert o.get_e_e() == 5
+    assert o.get_f_e() == 5
+    assert o.get_f_f() == 6
+def test_python_inherit_from_mi():
+    """Tests extending a Python class from a single inheritor of a MI class"""
+    class PyMVF(m.MVF):
+        g = 7
+        def get_g_g(self):
+            return self.g
+    o = PyMVF()
+    assert o.b == 1
+    assert o.c == 2
+    assert o.d0 == 3
+    assert o.d1 == 4
+    assert o.e == 5
+    assert o.f == 6
+    assert o.g == 7
+    assert o.get_g_g() == 7
diff --git a/wrap/pybind11/tests/test_numpy_array.cpp b/wrap/pybind11/tests/test_numpy_array.cpp
index 33f1d7857c..30a71acc9b 100644
--- a/wrap/pybind11/tests/test_numpy_array.cpp
+++ b/wrap/pybind11/tests/test_numpy_array.cpp
@@ -13,6 +13,7 @@
 #include <pybind11/stl.h>
 #include <cstdint>
+#include <utility>
 // Size / dtype checks.
 struct DtypeCheck {
@@ -22,7 +23,7 @@ struct DtypeCheck {
 template <typename T>
 DtypeCheck get_dtype_check(const char* name) {
-    py::module np = py::module::import("numpy");
+    py::module_ np = py::module_::import("numpy");
     DtypeCheck check{};
     check.numpy = np.attr("dtype")(np.attr(name));
     check.pybind11 = py::dtype::of<T>();
@@ -89,23 +90,23 @@ template<typename... Ix> arr data_t(const arr_t& a, Ix... index) {
 template<typename... Ix> arr& mutate_data(arr& a, Ix... index) {
     auto ptr = (uint8_t *) a.mutable_data(index...);
-    for (ssize_t i = 0; i < a.nbytes() - a.offset_at(index...); i++)
+    for (py::ssize_t i = 0; i < a.nbytes() - a.offset_at(index...); i++)
         ptr[i] = (uint8_t) (ptr[i] * 2);
     return a;
 template<typename... Ix> arr_t& mutate_data_t(arr_t& a, Ix... index) {
     auto ptr = a.mutable_data(index...);
-    for (ssize_t i = 0; i < a.size() - a.index_at(index...); i++)
+    for (py::ssize_t i = 0; i < a.size() - a.index_at(index...); i++)
     return a;
-template<typename... Ix> ssize_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
-template<typename... Ix> ssize_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
-template<typename... Ix> ssize_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
-template<typename... Ix> ssize_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
-template<typename... Ix> ssize_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
+template<typename... Ix> py::ssize_t index_at(const arr& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> py::ssize_t index_at_t(const arr_t& a, Ix... idx) { return a.index_at(idx...); }
+template<typename... Ix> py::ssize_t offset_at(const arr& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> py::ssize_t offset_at_t(const arr_t& a, Ix... idx) { return a.offset_at(idx...); }
+template<typename... Ix> py::ssize_t at_t(const arr_t& a, Ix... idx) { return a.at(idx...); }
 template<typename... Ix> arr_t& mutate_at_t(arr_t& a, Ix... idx) { a.mutable_at(idx...)++; return a; }
 #define def_index_fn(name, type) \
@@ -133,7 +134,7 @@ template <typename T, typename T2> py::handle auxiliaries(T &&r, T2 &&r2) {
 static int data_i = 42;
 TEST_SUBMODULE(numpy_array, sm) {
-    try { py::module::import("numpy"); }
+    try { py::module_::import("numpy"); }
     catch (...) { return; }
     // test_dtypes
@@ -159,9 +160,9 @@ TEST_SUBMODULE(numpy_array, sm) {
     // test_array_attributes
     sm.def("ndim", [](const arr& a) { return a.ndim(); });
     sm.def("shape", [](const arr& a) { return arr(a.ndim(), a.shape()); });
-    sm.def("shape", [](const arr& a, ssize_t dim) { return a.shape(dim); });
+    sm.def("shape", [](const arr& a, py::ssize_t dim) { return a.shape(dim); });
     sm.def("strides", [](const arr& a) { return arr(a.ndim(), a.strides()); });
-    sm.def("strides", [](const arr& a, ssize_t dim) { return a.strides(dim); });
+    sm.def("strides", [](const arr& a, py::ssize_t dim) { return a.strides(dim); });
     sm.def("writeable", [](const arr& a) { return a.writeable(); });
     sm.def("size", [](const arr& a) { return a.size(); });
     sm.def("itemsize", [](const arr& a) { return a.itemsize(); });
@@ -192,7 +193,7 @@ TEST_SUBMODULE(numpy_array, sm) {
     sm.def("scalar_int", []() { return py::array(py::dtype("i"), {}, {}, &data_i); });
     // test_wrap
-    sm.def("wrap", [](py::array a) {
+    sm.def("wrap", [](const py::array &a) {
         return py::array(
             {a.shape(), a.shape() + a.ndim()},
@@ -222,9 +223,10 @@ TEST_SUBMODULE(numpy_array, sm) {
     // test_isinstance
     sm.def("isinstance_untyped", [](py::object yes, py::object no) {
-        return py::isinstance<py::array>(yes) && !py::isinstance<py::array>(no);
+        return py::isinstance<py::array>(std::move(yes))
+               && !py::isinstance<py::array>(std::move(no));
-    sm.def("isinstance_typed", [](py::object o) {
+    sm.def("isinstance_typed", [](const py::object &o) {
         return py::isinstance<py::array_t<double>>(o) && !py::isinstance<py::array_t<int>>(o);
@@ -236,7 +238,7 @@ TEST_SUBMODULE(numpy_array, sm) {
-    sm.def("converting_constructors", [](py::object o) {
+    sm.def("converting_constructors", [](const py::object &o) {
         return py::dict(
@@ -245,69 +247,78 @@ TEST_SUBMODULE(numpy_array, sm) {
     // test_overload_resolution
-    sm.def("overloaded", [](py::array_t<double>) { return "double"; });
-    sm.def("overloaded", [](py::array_t<float>) { return "float"; });
-    sm.def("overloaded", [](py::array_t<int>) { return "int"; });
-    sm.def("overloaded", [](py::array_t<unsigned short>) { return "unsigned short"; });
-    sm.def("overloaded", [](py::array_t<long long>) { return "long long"; });
-    sm.def("overloaded", [](py::array_t<std::complex<double>>) { return "double complex"; });
-    sm.def("overloaded", [](py::array_t<std::complex<float>>) { return "float complex"; });
-    sm.def("overloaded2", [](py::array_t<std::complex<double>>) { return "double complex"; });
-    sm.def("overloaded2", [](py::array_t<double>) { return "double"; });
-    sm.def("overloaded2", [](py::array_t<std::complex<float>>) { return "float complex"; });
-    sm.def("overloaded2", [](py::array_t<float>) { return "float"; });
+    sm.def("overloaded", [](const py::array_t<double> &) { return "double"; });
+    sm.def("overloaded", [](const py::array_t<float> &) { return "float"; });
+    sm.def("overloaded", [](const py::array_t<int> &) { return "int"; });
+    sm.def("overloaded", [](const py::array_t<unsigned short> &) { return "unsigned short"; });
+    sm.def("overloaded", [](const py::array_t<long long> &) { return "long long"; });
+    sm.def("overloaded",
+           [](const py::array_t<std::complex<double>> &) { return "double complex"; });
+    sm.def("overloaded", [](const py::array_t<std::complex<float>> &) { return "float complex"; });
+    sm.def("overloaded2",
+           [](const py::array_t<std::complex<double>> &) { return "double complex"; });
+    sm.def("overloaded2", [](const py::array_t<double> &) { return "double"; });
+    sm.def("overloaded2",
+           [](const py::array_t<std::complex<float>> &) { return "float complex"; });
+    sm.def("overloaded2", [](const py::array_t<float> &) { return "float"; });
+    // [workaround(intel)] ICC 20/21 breaks with py::arg().stuff, using py::arg{}.stuff works.
     // Only accept the exact types:
-    sm.def("overloaded3", [](py::array_t<int>) { return "int"; }, py::arg().noconvert());
-    sm.def("overloaded3", [](py::array_t<double>) { return "double"; }, py::arg().noconvert());
+    sm.def(
+        "overloaded3", [](const py::array_t<int> &) { return "int"; }, py::arg{}.noconvert());
+    sm.def(
+        "overloaded3",
+        [](const py::array_t<double> &) { return "double"; },
+        py::arg{}.noconvert());
     // Make sure we don't do unsafe coercion (e.g. float to int) when not using forcecast, but
     // rather that float gets converted via the safe (conversion to double) overload:
-    sm.def("overloaded4", [](py::array_t<long long, 0>) { return "long long"; });
-    sm.def("overloaded4", [](py::array_t<double, 0>) { return "double"; });
+    sm.def("overloaded4", [](const py::array_t<long long, 0> &) { return "long long"; });
+    sm.def("overloaded4", [](const py::array_t<double, 0> &) { return "double"; });
     // But we do allow conversion to int if forcecast is enabled (but only if no overload matches
     // without conversion)
-    sm.def("overloaded5", [](py::array_t<unsigned int>) { return "unsigned int"; });
-    sm.def("overloaded5", [](py::array_t<double>) { return "double"; });
+    sm.def("overloaded5", [](const py::array_t<unsigned int> &) { return "unsigned int"; });
+    sm.def("overloaded5", [](const py::array_t<double> &) { return "double"; });
     // test_greedy_string_overload
     // Issue 685: ndarray shouldn't go to std::string overload
-    sm.def("issue685", [](std::string) { return "string"; });
-    sm.def("issue685", [](py::array) { return "array"; });
-    sm.def("issue685", [](py::object) { return "other"; });
+    sm.def("issue685", [](const std::string &) { return "string"; });
+    sm.def("issue685", [](const py::array &) { return "array"; });
+    sm.def("issue685", [](const py::object &) { return "other"; });
     // test_array_unchecked_fixed_dims
     sm.def("proxy_add2", [](py::array_t<double> a, double v) {
         auto r = a.mutable_unchecked<2>();
-        for (ssize_t i = 0; i < r.shape(0); i++)
-            for (ssize_t j = 0; j < r.shape(1); j++)
+        for (py::ssize_t i = 0; i < r.shape(0); i++)
+            for (py::ssize_t j = 0; j < r.shape(1); j++)
                 r(i, j) += v;
-    }, py::arg().noconvert(), py::arg());
+    }, py::arg{}.noconvert(), py::arg());
     sm.def("proxy_init3", [](double start) {
         py::array_t<double, py::array::c_style> a({ 3, 3, 3 });
         auto r = a.mutable_unchecked<3>();
-        for (ssize_t i = 0; i < r.shape(0); i++)
-        for (ssize_t j = 0; j < r.shape(1); j++)
-        for (ssize_t k = 0; k < r.shape(2); k++)
+        for (py::ssize_t i = 0; i < r.shape(0); i++)
+        for (py::ssize_t j = 0; j < r.shape(1); j++)
+        for (py::ssize_t k = 0; k < r.shape(2); k++)
             r(i, j, k) = start++;
         return a;
     sm.def("proxy_init3F", [](double start) {
         py::array_t<double, py::array::f_style> a({ 3, 3, 3 });
         auto r = a.mutable_unchecked<3>();
-        for (ssize_t k = 0; k < r.shape(2); k++)
-        for (ssize_t j = 0; j < r.shape(1); j++)
-        for (ssize_t i = 0; i < r.shape(0); i++)
+        for (py::ssize_t k = 0; k < r.shape(2); k++)
+        for (py::ssize_t j = 0; j < r.shape(1); j++)
+        for (py::ssize_t i = 0; i < r.shape(0); i++)
             r(i, j, k) = start++;
         return a;
-    sm.def("proxy_squared_L2_norm", [](py::array_t<double> a) {
+    sm.def("proxy_squared_L2_norm", [](const py::array_t<double> &a) {
         auto r = a.unchecked<1>();
         double sumsq = 0;
-        for (ssize_t i = 0; i < r.shape(0); i++)
+        for (py::ssize_t i = 0; i < r.shape(0); i++)
             sumsq += r[i] * r(i); // Either notation works for a 1D array
         return sumsq;
@@ -318,22 +329,34 @@ TEST_SUBMODULE(numpy_array, sm) {
         return auxiliaries(r, r2);
+    sm.def("proxy_auxiliaries1_const_ref", [](py::array_t<double> a) {
+        const auto &r = a.unchecked<1>();
+        const auto &r2 = a.mutable_unchecked<1>();
+        return r(0) == r2(0) && r[0] == r2[0];
+    });
+    sm.def("proxy_auxiliaries2_const_ref", [](py::array_t<double> a) {
+        const auto &r = a.unchecked<2>();
+        const auto &r2 = a.mutable_unchecked<2>();
+        return r(0, 0) == r2(0, 0);
+    });
     // test_array_unchecked_dyn_dims
     // Same as the above, but without a compile-time dimensions specification:
     sm.def("proxy_add2_dyn", [](py::array_t<double> a, double v) {
         auto r = a.mutable_unchecked();
         if (r.ndim() != 2) throw std::domain_error("error: ndim != 2");
-        for (ssize_t i = 0; i < r.shape(0); i++)
-            for (ssize_t j = 0; j < r.shape(1); j++)
+        for (py::ssize_t i = 0; i < r.shape(0); i++)
+            for (py::ssize_t j = 0; j < r.shape(1); j++)
                 r(i, j) += v;
-    }, py::arg().noconvert(), py::arg());
+    }, py::arg{}.noconvert(), py::arg());
     sm.def("proxy_init3_dyn", [](double start) {
         py::array_t<double, py::array::c_style> a({ 3, 3, 3 });
         auto r = a.mutable_unchecked();
         if (r.ndim() != 3) throw std::domain_error("error: ndim != 3");
-        for (ssize_t i = 0; i < r.shape(0); i++)
-        for (ssize_t j = 0; j < r.shape(1); j++)
-        for (ssize_t k = 0; k < r.shape(2); k++)
+        for (py::ssize_t i = 0; i < r.shape(0); i++)
+        for (py::ssize_t j = 0; j < r.shape(1); j++)
+        for (py::ssize_t k = 0; k < r.shape(2); k++)
             r(i, j, k) = start++;
         return a;
@@ -362,7 +385,7 @@ TEST_SUBMODULE(numpy_array, sm) {
     // test_array_resize
     // reshape array to 2D without changing size
     sm.def("array_reshape2", [](py::array_t<double> a) {
-        const auto dim_sz = (ssize_t)std::sqrt(a.size());
+        const auto dim_sz = (py::ssize_t)std::sqrt(a.size());
         if (dim_sz * dim_sz != a.size())
             throw std::domain_error("array_reshape2: input array total size is not a squared integer");
         a.resize({dim_sz, dim_sz});
@@ -382,45 +405,68 @@ TEST_SUBMODULE(numpy_array, sm) {
         return a;
-    sm.def("index_using_ellipsis", [](py::array a) {
-        return a[py::make_tuple(0, py::ellipsis(), 0)];
+    sm.def("array_view",
+           [](py::array_t<uint8_t> a, const std::string &dtype) { return a.view(dtype); });
+    sm.def("reshape_initializer_list", [](py::array_t<int> a, size_t N, size_t M, size_t O) {
+        return a.reshape({N, M, O});
+    });
+    sm.def("reshape_tuple", [](py::array_t<int> a, const std::vector<int> &new_shape) {
+        return a.reshape(new_shape);
+    sm.def("index_using_ellipsis",
+           [](const py::array &a) { return a[py::make_tuple(0, py::ellipsis(), 0)]; });
     // test_argument_conversions
-    sm.def("accept_double",
-           [](py::array_t<double, 0>) {},
-           py::arg("a"));
-    sm.def("accept_double_forcecast",
-           [](py::array_t<double, py::array::forcecast>) {},
-           py::arg("a"));
-    sm.def("accept_double_c_style",
-           [](py::array_t<double, py::array::c_style>) {},
-           py::arg("a"));
-    sm.def("accept_double_c_style_forcecast",
-           [](py::array_t<double, py::array::forcecast | py::array::c_style>) {},
-           py::arg("a"));
-    sm.def("accept_double_f_style",
-           [](py::array_t<double, py::array::f_style>) {},
-           py::arg("a"));
-    sm.def("accept_double_f_style_forcecast",
-           [](py::array_t<double, py::array::forcecast | py::array::f_style>) {},
-           py::arg("a"));
-    sm.def("accept_double_noconvert",
-           [](py::array_t<double, 0>) {},
-           py::arg("a").noconvert());
-    sm.def("accept_double_forcecast_noconvert",
-           [](py::array_t<double, py::array::forcecast>) {},
-           py::arg("a").noconvert());
-    sm.def("accept_double_c_style_noconvert",
-           [](py::array_t<double, py::array::c_style>) {},
-           py::arg("a").noconvert());
-    sm.def("accept_double_c_style_forcecast_noconvert",
-           [](py::array_t<double, py::array::forcecast | py::array::c_style>) {},
-           py::arg("a").noconvert());
-    sm.def("accept_double_f_style_noconvert",
-           [](py::array_t<double, py::array::f_style>) {},
-           py::arg("a").noconvert());
-    sm.def("accept_double_f_style_forcecast_noconvert",
-           [](py::array_t<double, py::array::forcecast | py::array::f_style>) {},
-           py::arg("a").noconvert());
+    sm.def(
+        "accept_double", [](const py::array_t<double, 0> &) {}, py::arg("a"));
+    sm.def(
+        "accept_double_forcecast",
+        [](const py::array_t<double, py::array::forcecast> &) {},
+        py::arg("a"));
+    sm.def(
+        "accept_double_c_style",
+        [](const py::array_t<double, py::array::c_style> &) {},
+        py::arg("a"));
+    sm.def(
+        "accept_double_c_style_forcecast",
+        [](const py::array_t<double, py::array::forcecast | py::array::c_style> &) {},
+        py::arg("a"));
+    sm.def(
+        "accept_double_f_style",
+        [](const py::array_t<double, py::array::f_style> &) {},
+        py::arg("a"));
+    sm.def(
+        "accept_double_f_style_forcecast",
+        [](const py::array_t<double, py::array::forcecast | py::array::f_style> &) {},
+        py::arg("a"));
+    sm.def(
+        "accept_double_noconvert", [](const py::array_t<double, 0> &) {}, "a"_a.noconvert());
+    sm.def(
+        "accept_double_forcecast_noconvert",
+        [](const py::array_t<double, py::array::forcecast> &) {},
+        "a"_a.noconvert());
+    sm.def(
+        "accept_double_c_style_noconvert",
+        [](const py::array_t<double, py::array::c_style> &) {},
+        "a"_a.noconvert());
+    sm.def(
+        "accept_double_c_style_forcecast_noconvert",
+        [](const py::array_t<double, py::array::forcecast | py::array::c_style> &) {},
+        "a"_a.noconvert());
+    sm.def(
+        "accept_double_f_style_noconvert",
+        [](const py::array_t<double, py::array::f_style> &) {},
+        "a"_a.noconvert());
+    sm.def(
+        "accept_double_f_style_forcecast_noconvert",
+        [](const py::array_t<double, py::array::forcecast | py::array::f_style> &) {},
+        "a"_a.noconvert());
+    // Check that types returns correct npy format descriptor
+    sm.def("test_fmt_desc_float", [](const py::array_t<float> &) {});
+    sm.def("test_fmt_desc_double", [](const py::array_t<double> &) {});
+    sm.def("test_fmt_desc_const_float", [](const py::array_t<const float> &) {});
+    sm.def("test_fmt_desc_const_double", [](const py::array_t<const double> &) {});
diff --git a/wrap/pybind11/tests/test_numpy_array.py b/wrap/pybind11/tests/test_numpy_array.py
index a36e707c1d..e4138f0239 100644
--- a/wrap/pybind11/tests/test_numpy_array.py
+++ b/wrap/pybind11/tests/test_numpy_array.py
@@ -2,7 +2,6 @@
 import pytest
 import env  # noqa: F401
 from pybind11_tests import numpy_array as m
 np = pytest.importorskip("numpy")
@@ -19,33 +18,36 @@ def test_dtypes():
         assert check.numpy == check.pybind11, check
         if check.numpy.num != check.pybind11.num:
-            print("NOTE: typenum mismatch for {}: {} != {}".format(
-                check, check.numpy.num, check.pybind11.num))
+            print(
+                "NOTE: typenum mismatch for {}: {} != {}".format(
+                    check, check.numpy.num, check.pybind11.num
+                )
+            )
 def arr():
-    return np.array([[1, 2, 3], [4, 5, 6]], '=u2')
+    return np.array([[1, 2, 3], [4, 5, 6]], "=u2")
 def test_array_attributes():
-    a = np.array(0, 'f8')
+    a = np.array(0, "f8")
     assert m.ndim(a) == 0
     assert all(m.shape(a) == [])
     assert all(m.strides(a) == [])
     with pytest.raises(IndexError) as excinfo:
         m.shape(a, 0)
-    assert str(excinfo.value) == 'invalid axis: 0 (ndim = 0)'
+    assert str(excinfo.value) == "invalid axis: 0 (ndim = 0)"
     with pytest.raises(IndexError) as excinfo:
         m.strides(a, 0)
-    assert str(excinfo.value) == 'invalid axis: 0 (ndim = 0)'
+    assert str(excinfo.value) == "invalid axis: 0 (ndim = 0)"
     assert m.writeable(a)
     assert m.size(a) == 1
     assert m.itemsize(a) == 8
     assert m.nbytes(a) == 8
     assert m.owndata(a)
-    a = np.array([[1, 2, 3], [4, 5, 6]], 'u2').view()
+    a = np.array([[1, 2, 3], [4, 5, 6]], "u2").view()
     a.flags.writeable = False
     assert m.ndim(a) == 2
     assert all(m.shape(a) == [2, 3])
@@ -56,10 +58,10 @@ def test_array_attributes():
     assert m.strides(a, 1) == 2
     with pytest.raises(IndexError) as excinfo:
         m.shape(a, 2)
-    assert str(excinfo.value) == 'invalid axis: 2 (ndim = 2)'
+    assert str(excinfo.value) == "invalid axis: 2 (ndim = 2)"
     with pytest.raises(IndexError) as excinfo:
         m.strides(a, 2)
-    assert str(excinfo.value) == 'invalid axis: 2 (ndim = 2)'
+    assert str(excinfo.value) == "invalid axis: 2 (ndim = 2)"
     assert not m.writeable(a)
     assert m.size(a) == 6
     assert m.itemsize(a) == 2
@@ -67,7 +69,9 @@ def test_array_attributes():
     assert not m.owndata(a)
-@pytest.mark.parametrize('args, ret', [([], 0), ([0], 0), ([1], 3), ([0, 1], 1), ([1, 2], 5)])
+    "args, ret", [([], 0), ([0], 0), ([1], 3), ([0, 1], 1), ([1, 2], 5)]
 def test_index_offset(arr, args, ret):
     assert m.index_at(arr, *args) == ret
     assert m.index_at_t(arr, *args) == ret
@@ -76,31 +80,46 @@ def test_index_offset(arr, args, ret):
 def test_dim_check_fail(arr):
-    for func in (m.index_at, m.index_at_t, m.offset_at, m.offset_at_t, m.data, m.data_t,
-                 m.mutate_data, m.mutate_data_t):
+    for func in (
+        m.index_at,
+        m.index_at_t,
+        m.offset_at,
+        m.offset_at_t,
+        m.data,
+        m.data_t,
+        m.mutate_data,
+        m.mutate_data_t,
+    ):
         with pytest.raises(IndexError) as excinfo:
             func(arr, 1, 2, 3)
-        assert str(excinfo.value) == 'too many indices for an array: 3 (ndim = 2)'
+        assert str(excinfo.value) == "too many indices for an array: 3 (ndim = 2)"
-@pytest.mark.parametrize('args, ret',
-                         [([], [1, 2, 3, 4, 5, 6]),
-                          ([1], [4, 5, 6]),
-                          ([0, 1], [2, 3, 4, 5, 6]),
-                          ([1, 2], [6])])
+    "args, ret",
+    [
+        ([], [1, 2, 3, 4, 5, 6]),
+        ([1], [4, 5, 6]),
+        ([0, 1], [2, 3, 4, 5, 6]),
+        ([1, 2], [6]),
+    ],
 def test_data(arr, args, ret):
     from sys import byteorder
     assert all(m.data_t(arr, *args) == ret)
-    assert all(m.data(arr, *args)[(0 if byteorder == 'little' else 1)::2] == ret)
-    assert all(m.data(arr, *args)[(1 if byteorder == 'little' else 0)::2] == 0)
+    assert all(m.data(arr, *args)[(0 if byteorder == "little" else 1) :: 2] == ret)
+    assert all(m.data(arr, *args)[(1 if byteorder == "little" else 0) :: 2] == 0)
-@pytest.mark.parametrize('dim', [0, 1, 3])
+@pytest.mark.parametrize("dim", [0, 1, 3])
 def test_at_fail(arr, dim):
     for func in m.at_t, m.mutate_at_t:
         with pytest.raises(IndexError) as excinfo:
             func(arr, *([0] * dim))
-        assert str(excinfo.value) == 'index dimension mismatch: {} (ndim = 2)'.format(dim)
+        assert str(excinfo.value) == "index dimension mismatch: {} (ndim = 2)".format(
+            dim
+        )
 def test_at(arr):
@@ -113,10 +132,14 @@ def test_at(arr):
 def test_mutate_readonly(arr):
     arr.flags.writeable = False
-    for func, args in (m.mutate_data, ()), (m.mutate_data_t, ()), (m.mutate_at_t, (0, 0)):
+    for func, args in (
+        (m.mutate_data, ()),
+        (m.mutate_data_t, ()),
+        (m.mutate_at_t, (0, 0)),
+    ):
         with pytest.raises(ValueError) as excinfo:
             func(arr, *args)
-        assert str(excinfo.value) == 'array is not writeable'
+        assert str(excinfo.value) == "array is not writeable"
 def test_mutate_data(arr):
@@ -134,14 +157,22 @@ def test_mutate_data(arr):
 def test_bounds_check(arr):
-    for func in (m.index_at, m.index_at_t, m.data, m.data_t,
-                 m.mutate_data, m.mutate_data_t, m.at_t, m.mutate_at_t):
+    for func in (
+        m.index_at,
+        m.index_at_t,
+        m.data,
+        m.data_t,
+        m.mutate_data,
+        m.mutate_data_t,
+        m.at_t,
+        m.mutate_at_t,
+    ):
         with pytest.raises(IndexError) as excinfo:
             func(arr, 2, 0)
-        assert str(excinfo.value) == 'index 2 is out of bounds for axis 0 with size 2'
+        assert str(excinfo.value) == "index 2 is out of bounds for axis 0 with size 2"
         with pytest.raises(IndexError) as excinfo:
             func(arr, 0, 4)
-        assert str(excinfo.value) == 'index 4 is out of bounds for axis 1 with size 3'
+        assert str(excinfo.value) == "index 4 is out of bounds for axis 1 with size 3"
 def test_make_c_f_array():
@@ -163,10 +194,11 @@ def test_make_empty_shaped_array():
 def test_wrap():
     def assert_references(a, b, base=None):
         from distutils.version import LooseVersion
         if base is None:
             base = a
         assert a is not b
-        assert a.__array_interface__['data'][0] == b.__array_interface__['data'][0]
+        assert a.__array_interface__["data"][0] == b.__array_interface__["data"][0]
         assert a.shape == b.shape
         assert a.strides == b.strides
         assert a.flags.c_contiguous == b.flags.c_contiguous
@@ -189,12 +221,12 @@ def assert_references(a, b, base=None):
     a2 = m.wrap(a1)
     assert_references(a1, a2)
-    a1 = np.array([[1, 2], [3, 4]], dtype=np.float32, order='F')
+    a1 = np.array([[1, 2], [3, 4]], dtype=np.float32, order="F")
     assert a1.flags.owndata and a1.base is None
     a2 = m.wrap(a1)
     assert_references(a1, a2)
-    a1 = np.array([[1, 2], [3, 4]], dtype=np.float32, order='C')
+    a1 = np.array([[1, 2], [3, 4]], dtype=np.float32, order="C")
     a1.flags.writeable = False
     a2 = m.wrap(a1)
     assert_references(a1, a2)
@@ -224,11 +256,14 @@ def test_numpy_view(capture):
         assert np.all(ac_view_1 == np.array([1, 2], dtype=np.int32))
         del ac
-    assert capture == """
+    assert (
+        capture
+        == """
+    )
     ac_view_1[0] = 4
     ac_view_1[1] = 3
     assert ac_view_2[0] == 4
@@ -238,9 +273,12 @@ def test_numpy_view(capture):
         del ac_view_2
-    assert capture == """
+    assert (
+        capture
+        == """
+    )
 def test_cast_numpy_int64_to_uint64():
@@ -271,20 +309,22 @@ def test_constructors():
 def test_overload_resolution(msg):
     # Exact overload matches:
-    assert m.overloaded(np.array([1], dtype='float64')) == 'double'
-    assert m.overloaded(np.array([1], dtype='float32')) == 'float'
-    assert m.overloaded(np.array([1], dtype='ushort')) == 'unsigned short'
-    assert m.overloaded(np.array([1], dtype='intc')) == 'int'
-    assert m.overloaded(np.array([1], dtype='longlong')) == 'long long'
-    assert m.overloaded(np.array([1], dtype='complex')) == 'double complex'
-    assert m.overloaded(np.array([1], dtype='csingle')) == 'float complex'
+    assert m.overloaded(np.array([1], dtype="float64")) == "double"
+    assert m.overloaded(np.array([1], dtype="float32")) == "float"
+    assert m.overloaded(np.array([1], dtype="ushort")) == "unsigned short"
+    assert m.overloaded(np.array([1], dtype="intc")) == "int"
+    assert m.overloaded(np.array([1], dtype="longlong")) == "long long"
+    assert m.overloaded(np.array([1], dtype="complex")) == "double complex"
+    assert m.overloaded(np.array([1], dtype="csingle")) == "float complex"
     # No exact match, should call first convertible version:
-    assert m.overloaded(np.array([1], dtype='uint8')) == 'double'
+    assert m.overloaded(np.array([1], dtype="uint8")) == "double"
     with pytest.raises(TypeError) as excinfo:
         m.overloaded("not an array")
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         overloaded(): incompatible function arguments. The following argument types are supported:
             1. (arg0: numpy.ndarray[numpy.float64]) -> str
             2. (arg0: numpy.ndarray[numpy.float32]) -> str
@@ -296,15 +336,16 @@ def test_overload_resolution(msg):
         Invoked with: 'not an array'
+    )
-    assert m.overloaded2(np.array([1], dtype='float64')) == 'double'
-    assert m.overloaded2(np.array([1], dtype='float32')) == 'float'
-    assert m.overloaded2(np.array([1], dtype='complex64')) == 'float complex'
-    assert m.overloaded2(np.array([1], dtype='complex128')) == 'double complex'
-    assert m.overloaded2(np.array([1], dtype='float32')) == 'float'
+    assert m.overloaded2(np.array([1], dtype="float64")) == "double"
+    assert m.overloaded2(np.array([1], dtype="float32")) == "float"
+    assert m.overloaded2(np.array([1], dtype="complex64")) == "float complex"
+    assert m.overloaded2(np.array([1], dtype="complex128")) == "double complex"
+    assert m.overloaded2(np.array([1], dtype="float32")) == "float"
-    assert m.overloaded3(np.array([1], dtype='float64')) == 'double'
-    assert m.overloaded3(np.array([1], dtype='intc')) == 'int'
+    assert m.overloaded3(np.array([1], dtype="float64")) == "double"
+    assert m.overloaded3(np.array([1], dtype="intc")) == "int"
     expected_exc = """
         overloaded3(): incompatible function arguments. The following argument types are supported:
             1. (arg0: numpy.ndarray[numpy.int32]) -> str
@@ -313,47 +354,49 @@ def test_overload_resolution(msg):
         Invoked with: """
     with pytest.raises(TypeError) as excinfo:
-        m.overloaded3(np.array([1], dtype='uintc'))
-    assert msg(excinfo.value) == expected_exc + repr(np.array([1], dtype='uint32'))
+        m.overloaded3(np.array([1], dtype="uintc"))
+    assert msg(excinfo.value) == expected_exc + repr(np.array([1], dtype="uint32"))
     with pytest.raises(TypeError) as excinfo:
-        m.overloaded3(np.array([1], dtype='float32'))
-    assert msg(excinfo.value) == expected_exc + repr(np.array([1.], dtype='float32'))
+        m.overloaded3(np.array([1], dtype="float32"))
+    assert msg(excinfo.value) == expected_exc + repr(np.array([1.0], dtype="float32"))
     with pytest.raises(TypeError) as excinfo:
-        m.overloaded3(np.array([1], dtype='complex'))
-    assert msg(excinfo.value) == expected_exc + repr(np.array([1. + 0.j]))
+        m.overloaded3(np.array([1], dtype="complex"))
+    assert msg(excinfo.value) == expected_exc + repr(np.array([1.0 + 0.0j]))
     # Exact matches:
-    assert m.overloaded4(np.array([1], dtype='double')) == 'double'
-    assert m.overloaded4(np.array([1], dtype='longlong')) == 'long long'
+    assert m.overloaded4(np.array([1], dtype="double")) == "double"
+    assert m.overloaded4(np.array([1], dtype="longlong")) == "long long"
     # Non-exact matches requiring conversion.  Since float to integer isn't a
     # save conversion, it should go to the double overload, but short can go to
     # either (and so should end up on the first-registered, the long long).
-    assert m.overloaded4(np.array([1], dtype='float32')) == 'double'
-    assert m.overloaded4(np.array([1], dtype='short')) == 'long long'
+    assert m.overloaded4(np.array([1], dtype="float32")) == "double"
+    assert m.overloaded4(np.array([1], dtype="short")) == "long long"
-    assert m.overloaded5(np.array([1], dtype='double')) == 'double'
-    assert m.overloaded5(np.array([1], dtype='uintc')) == 'unsigned int'
-    assert m.overloaded5(np.array([1], dtype='float32')) == 'unsigned int'
+    assert m.overloaded5(np.array([1], dtype="double")) == "double"
+    assert m.overloaded5(np.array([1], dtype="uintc")) == "unsigned int"
+    assert m.overloaded5(np.array([1], dtype="float32")) == "unsigned int"
 def test_greedy_string_overload():
     """Tests fix for #685 - ndarray shouldn't go to std::string overload"""
     assert m.issue685("abc") == "string"
-    assert m.issue685(np.array([97, 98, 99], dtype='b')) == "array"
+    assert m.issue685(np.array([97, 98, 99], dtype="b")) == "array"
     assert m.issue685(123) == "other"
 def test_array_unchecked_fixed_dims(msg):
-    z1 = np.array([[1, 2], [3, 4]], dtype='float64')
+    z1 = np.array([[1, 2], [3, 4]], dtype="float64")
     m.proxy_add2(z1, 10)
     assert np.all(z1 == [[11, 12], [13, 14]])
     with pytest.raises(ValueError) as excinfo:
-        m.proxy_add2(np.array([1., 2, 3]), 5.0)
-    assert msg(excinfo.value) == "array has incorrect number of dimensions: 1; expected 2"
+        m.proxy_add2(np.array([1.0, 2, 3]), 5.0)
+    assert (
+        msg(excinfo.value) == "array has incorrect number of dimensions: 1; expected 2"
+    )
-    expect_c = np.ndarray(shape=(3, 3, 3), buffer=np.array(range(3, 30)), dtype='int')
+    expect_c = np.ndarray(shape=(3, 3, 3), buffer=np.array(range(3, 30)), dtype="int")
     assert np.all(m.proxy_init3(3.0) == expect_c)
     expect_f = np.transpose(expect_c)
     assert np.all(m.proxy_init3F(3.0) == expect_f)
@@ -364,13 +407,16 @@ def test_array_unchecked_fixed_dims(msg):
     assert m.proxy_auxiliaries2(z1) == [11, 11, True, 2, 8, 2, 2, 4, 32]
     assert m.proxy_auxiliaries2(z1) == m.array_auxiliaries2(z1)
+    assert m.proxy_auxiliaries1_const_ref(z1[0, :])
+    assert m.proxy_auxiliaries2_const_ref(z1)
-def test_array_unchecked_dyn_dims(msg):
-    z1 = np.array([[1, 2], [3, 4]], dtype='float64')
+def test_array_unchecked_dyn_dims():
+    z1 = np.array([[1, 2], [3, 4]], dtype="float64")
     m.proxy_add2_dyn(z1, 10)
     assert np.all(z1 == [[11, 12], [13, 14]])
-    expect_c = np.ndarray(shape=(3, 3, 3), buffer=np.array(range(3, 30)), dtype='int')
+    expect_c = np.ndarray(shape=(3, 3, 3), buffer=np.array(range(3, 30)), dtype="int")
     assert np.all(m.proxy_init3_dyn(3.0) == expect_c)
     assert m.proxy_auxiliaries2_dyn(z1) == [11, 11, True, 2, 8, 2, 2, 4, 32]
@@ -380,15 +426,15 @@ def test_array_unchecked_dyn_dims(msg):
 def test_array_failure():
     with pytest.raises(ValueError) as excinfo:
-    assert str(excinfo.value) == 'cannot create a pybind11::array from a nullptr'
+    assert str(excinfo.value) == "cannot create a pybind11::array from a nullptr"
     with pytest.raises(ValueError) as excinfo:
-    assert str(excinfo.value) == 'cannot create a pybind11::array_t from a nullptr'
+    assert str(excinfo.value) == "cannot create a pybind11::array_t from a nullptr"
     with pytest.raises(ValueError) as excinfo:
-    assert str(excinfo.value) == 'negative dimensions are not allowed'
+    assert str(excinfo.value) == "negative dimensions are not allowed"
 def test_initializer_list():
@@ -398,36 +444,76 @@ def test_initializer_list():
     assert m.array_initializer_list4().shape == (1, 2, 3, 4)
-def test_array_resize(msg):
-    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='float64')
+def test_array_resize():
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype="float64")
-    assert(a.size == 9)
-    assert(np.all(a == [[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
+    assert a.size == 9
+    assert np.all(a == [[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     # total size change should succced with refcheck off
     m.array_resize3(a, 4, False)
-    assert(a.size == 64)
+    assert a.size == 64
     # ... and fail with refcheck on
         m.array_resize3(a, 3, True)
     except ValueError as e:
-        assert(str(e).startswith("cannot resize an array"))
+        assert str(e).startswith("cannot resize an array")
     # transposed array doesn't own data
     b = a.transpose()
         m.array_resize3(b, 3, False)
     except ValueError as e:
-        assert(str(e).startswith("cannot resize this array: it does not own its data"))
+        assert str(e).startswith("cannot resize this array: it does not own its data")
     # ... but reshape should be fine
-    assert(b.shape == (8, 8))
+    assert b.shape == (8, 8)
-def test_array_create_and_resize(msg):
+def test_array_create_and_resize():
     a = m.create_and_resize(2)
-    assert(a.size == 4)
-    assert(np.all(a == 42.))
+    assert a.size == 4
+    assert np.all(a == 42.0)
+def test_array_view():
+    a = np.ones(100 * 4).astype("uint8")
+    a_float_view = m.array_view(a, "float32")
+    assert a_float_view.shape == (100 * 1,)  # 1 / 4 bytes = 8 / 32
+    a_int16_view = m.array_view(a, "int16")  # 1 / 2 bytes = 16 / 32
+    assert a_int16_view.shape == (100 * 2,)
+def test_array_view_invalid():
+    a = np.ones(100 * 4).astype("uint8")
+    with pytest.raises(TypeError):
+        m.array_view(a, "deadly_dtype")
+def test_reshape_initializer_list():
+    a = np.arange(2 * 7 * 3) + 1
+    x = m.reshape_initializer_list(a, 2, 7, 3)
+    assert x.shape == (2, 7, 3)
+    assert list(x[1][4]) == [34, 35, 36]
+    with pytest.raises(ValueError) as excinfo:
+        m.reshape_initializer_list(a, 1, 7, 3)
+    assert str(excinfo.value) == "cannot reshape array of size 42 into shape (1,7,3)"
+def test_reshape_tuple():
+    a = np.arange(3 * 7 * 2) + 1
+    x = m.reshape_tuple(a, (3, 7, 2))
+    assert x.shape == (3, 7, 2)
+    assert list(x[1][4]) == [23, 24]
+    y = m.reshape_tuple(x, (x.size,))
+    assert y.shape == (42,)
+    with pytest.raises(ValueError) as excinfo:
+        m.reshape_tuple(a, (3, 7, 1))
+    assert str(excinfo.value) == "cannot reshape array of size 42 into shape (3,7,1)"
+    with pytest.raises(ValueError) as excinfo:
+        m.reshape_tuple(a, ())
+    assert str(excinfo.value) == "cannot reshape array of size 42 into shape ()"
 def test_index_using_ellipsis():
@@ -435,17 +521,30 @@ def test_index_using_ellipsis():
     assert a.shape == (6,)
+    "test_func",
+    [
+        m.test_fmt_desc_float,
+        m.test_fmt_desc_double,
+        m.test_fmt_desc_const_float,
+        m.test_fmt_desc_const_double,
+    ],
+def test_format_descriptors_for_floating_point_types(test_func):
+    assert "numpy.ndarray[numpy.float" in test_func.__doc__
 @pytest.mark.parametrize("forcecast", [False, True])
-@pytest.mark.parametrize("contiguity", [None, 'C', 'F'])
+@pytest.mark.parametrize("contiguity", [None, "C", "F"])
 @pytest.mark.parametrize("noconvert", [False, True])
     "ignore:Casting complex values to real discards the imaginary part:numpy.ComplexWarning"
 def test_argument_conversions(forcecast, contiguity, noconvert):
     function_name = "accept_double"
-    if contiguity == 'C':
+    if contiguity == "C":
         function_name += "_c_style"
-    elif contiguity == 'F':
+    elif contiguity == "F":
         function_name += "_f_style"
     if forcecast:
         function_name += "_forcecast"
@@ -453,37 +552,39 @@ def test_argument_conversions(forcecast, contiguity, noconvert):
         function_name += "_noconvert"
     function = getattr(m, function_name)
-    for dtype in [np.dtype('float32'), np.dtype('float64'), np.dtype('complex128')]:
-        for order in ['C', 'F']:
+    for dtype in [np.dtype("float32"), np.dtype("float64"), np.dtype("complex128")]:
+        for order in ["C", "F"]:
             for shape in [(2, 2), (1, 3, 1, 1), (1, 1, 1), (0,)]:
                 if not noconvert:
                     # If noconvert is not passed, only complex128 needs to be truncated and
                     # "cannot be safely obtained". So without `forcecast`, the argument shouldn't
                     # be accepted.
-                    should_raise = dtype.name == 'complex128' and not forcecast
+                    should_raise = dtype.name == "complex128" and not forcecast
                     # If noconvert is passed, only float64 and the matching order is accepted.
                     # If at most one dimension has a size greater than 1, the array is also
                     # trivially contiguous.
                     trivially_contiguous = sum(1 for d in shape if d > 1) <= 1
-                    should_raise = (
-                        dtype.name != 'float64' or
-                        (contiguity is not None and
-                         contiguity != order and
-                         not trivially_contiguous)
+                    should_raise = dtype.name != "float64" or (
+                        contiguity is not None
+                        and contiguity != order
+                        and not trivially_contiguous
                 array = np.zeros(shape, dtype=dtype, order=order)
                 if not should_raise:
-                    with pytest.raises(TypeError, match="incompatible function arguments"):
+                    with pytest.raises(
+                        TypeError, match="incompatible function arguments"
+                    ):
 def test_dtype_refcount_leak():
     from sys import getrefcount
     dtype = np.dtype(np.float_)
     a = np.array([1], dtype=dtype)
     before = getrefcount(dtype)
diff --git a/wrap/pybind11/tests/test_numpy_dtypes.cpp b/wrap/pybind11/tests/test_numpy_dtypes.cpp
index 467e0253f7..bf4f4cee74 100644
--- a/wrap/pybind11/tests/test_numpy_dtypes.cpp
+++ b/wrap/pybind11/tests/test_numpy_dtypes.cpp
@@ -108,9 +108,11 @@ PYBIND11_PACKED(struct EnumStruct {
 std::ostream& operator<<(std::ostream& os, const StringStruct& v) {
     os << "a='";
-    for (size_t i = 0; i < 3 && v.a[i]; i++) os << v.a[i];
+    for (size_t i = 0; i < 3 && (v.a[i] != 0); i++)
+        os << v.a[i];
     os << "',b='";
-    for (size_t i = 0; i < 3 && v.b[i]; i++) os << v.b[i];
+    for (size_t i = 0; i < 3 && (v.b[i] != 0); i++)
+        os << v.b[i];
     return os << "'";
@@ -146,11 +148,13 @@ py::array mkarray_via_buffer(size_t n) {
                                      1, { n }, { sizeof(T) }));
-#define SET_TEST_VALS(s, i) do { \
-    s.bool_ = (i) % 2 != 0; \
-    s.uint_ = (uint32_t) (i); \
-    s.float_ = (float) (i) * 1.5f; \
-    s.ldbl_ = (long double) (i) * -2.5L; } while (0)
+#define SET_TEST_VALS(s, i)                                                                       \
+    do {                                                                                          \
+        (s).bool_ = (i) % 2 != 0;                                                                 \
+        (s).uint_ = (uint32_t) (i);                                                               \
+        (s).float_ = (float) (i) *1.5f;                                                           \
+        (s).ldbl_ = (long double) (i) * -2.5L;                                                    \
+    } while (0)
 template <typename S>
 py::array_t<S, 0> create_recarray(size_t n) {
@@ -168,7 +172,7 @@ py::list print_recarray(py::array_t<S, 0> arr) {
     const auto req = arr.request();
     const auto ptr = static_cast<S*>(req.ptr);
     auto l = py::list();
-    for (ssize_t i = 0; i < req.size; i++) {
+    for (py::ssize_t i = 0; i < req.size; i++) {
         std::stringstream ss;
         ss << ptr[i];
@@ -180,8 +184,8 @@ py::array_t<int32_t, 0> test_array_ctors(int i) {
     using arr_t = py::array_t<int32_t, 0>;
     std::vector<int32_t> data { 1, 2, 3, 4, 5, 6 };
-    std::vector<ssize_t> shape { 3, 2 };
-    std::vector<ssize_t> strides { 8, 4 };
+    std::vector<py::ssize_t> shape { 3, 2 };
+    std::vector<py::ssize_t> strides { 8, 4 };
     auto ptr = data.data();
     auto vptr = (void *) ptr;
@@ -255,11 +259,31 @@ struct A {};
 struct B {};
 TEST_SUBMODULE(numpy_dtypes, m) {
-    try { py::module::import("numpy"); }
+    try { py::module_::import("numpy"); }
     catch (...) { return; }
     // typeinfo may be registered before the dtype descriptor for scalar casts to work...
-    py::class_<SimpleStruct>(m, "SimpleStruct");
+    py::class_<SimpleStruct>(m, "SimpleStruct")
+        // Explicit construct to ensure zero-valued initialization.
+        .def(py::init([]() { return SimpleStruct(); }))
+        .def_readwrite("bool_", &SimpleStruct::bool_)
+        .def_readwrite("uint_", &SimpleStruct::uint_)
+        .def_readwrite("float_", &SimpleStruct::float_)
+        .def_readwrite("ldbl_", &SimpleStruct::ldbl_)
+        .def("astuple",
+             [](const SimpleStruct &self) {
+                 return py::make_tuple(self.bool_, self.uint_, self.float_, self.ldbl_);
+             })
+        .def_static("fromtuple", [](const py::tuple &tup) {
+            if (py::len(tup) != 4) {
+                throw py::cast_error("Invalid size");
+            }
+            return SimpleStruct{
+                tup[0].cast<bool>(),
+                tup[1].cast<uint32_t>(),
+                tup[2].cast<float>(),
+                tup[3].cast<long double>()};
+        });
     PYBIND11_NUMPY_DTYPE(SimpleStruct, bool_, uint_, float_, ldbl_);
     PYBIND11_NUMPY_DTYPE(SimpleStructReordered, bool_, uint_, float_, ldbl_);
@@ -339,6 +363,14 @@ TEST_SUBMODULE(numpy_dtypes, m) {
     // test_dtype
+    std::vector<const char *> dtype_names{
+        "byte", "short", "intc", "int_", "longlong",
+        "ubyte", "ushort", "uintc", "uint", "ulonglong",
+        "half", "single", "double", "longdouble",
+        "csingle", "cdouble", "clongdouble",
+        "bool_", "datetime64", "timedelta64", "object_"
+    };
     m.def("print_dtypes", []() {
         py::list l;
         for (const py::handle &d : {
@@ -357,6 +389,18 @@ TEST_SUBMODULE(numpy_dtypes, m) {
         return l;
     m.def("test_dtype_ctors", &test_dtype_ctors);
+    m.def("test_dtype_kind", [dtype_names]() {
+        py::list list;
+        for (auto& dt_name : dtype_names)
+            list.append(py::dtype(dt_name).kind());
+        return list;
+    });
+    m.def("test_dtype_char_", [dtype_names]() {
+        py::list list;
+        for (auto& dt_name : dtype_names)
+            list.append(py::dtype(dt_name).char_());
+        return list;
+    });
     m.def("test_dtype_methods", []() {
         py::list list;
         auto dt1 = py::dtype::of<int32_t>();
@@ -379,7 +423,7 @@ TEST_SUBMODULE(numpy_dtypes, m) {
         if (non_empty) {
             auto req = arr.request();
             auto ptr = static_cast<StringStruct*>(req.ptr);
-            for (ssize_t i = 0; i < req.size * req.itemsize; i++)
+            for (py::ssize_t i = 0; i < req.size * req.itemsize; i++)
                 static_cast<char*>(req.ptr)[i] = 0;
             ptr[1].a[0] = 'a'; ptr[1].b[0] = 'a';
             ptr[2].a[0] = 'a'; ptr[2].b[0] = 'a';
@@ -462,10 +506,16 @@ TEST_SUBMODULE(numpy_dtypes, m) {
     m.def("buffer_to_dtype", [](py::buffer& buf) { return py::dtype(buf.request()); });
     // test_scalar_conversion
-    m.def("f_simple", [](SimpleStruct s) { return s.uint_ * 10; });
+    auto f_simple = [](SimpleStruct s) { return s.uint_ * 10; };
+    m.def("f_simple", f_simple);
     m.def("f_packed", [](PackedStruct s) { return s.uint_ * 10; });
     m.def("f_nested", [](NestedStruct s) { return s.a.uint_ * 10; });
+    // test_vectorize
+    m.def("f_simple_vectorized", py::vectorize(f_simple));
+    auto f_simple_pass_thru = [](SimpleStruct s) { return s; };
+    m.def("f_simple_pass_thru_vectorized", py::vectorize(f_simple_pass_thru));
     // test_register_dtype
     m.def("register_dtype", []() { PYBIND11_NUMPY_DTYPE(SimpleStruct, bool_, uint_, float_, ldbl_); });
diff --git a/wrap/pybind11/tests/test_numpy_dtypes.py b/wrap/pybind11/tests/test_numpy_dtypes.py
index 417d6f1cff..06e578329e 100644
--- a/wrap/pybind11/tests/test_numpy_dtypes.py
+++ b/wrap/pybind11/tests/test_numpy_dtypes.py
@@ -4,63 +4,82 @@
 import pytest
 import env  # noqa: F401
 from pybind11_tests import numpy_dtypes as m
 np = pytest.importorskip("numpy")
 def simple_dtype():
-    ld = np.dtype('longdouble')
-    return np.dtype({'names': ['bool_', 'uint_', 'float_', 'ldbl_'],
-                     'formats': ['?', 'u4', 'f4', 'f{}'.format(ld.itemsize)],
-                     'offsets': [0, 4, 8, (16 if ld.alignment > 4 else 12)]})
+    ld = np.dtype("longdouble")
+    return np.dtype(
+        {
+            "names": ["bool_", "uint_", "float_", "ldbl_"],
+            "formats": ["?", "u4", "f4", "f{}".format(ld.itemsize)],
+            "offsets": [0, 4, 8, (16 if ld.alignment > 4 else 12)],
+        }
+    )
 def packed_dtype():
-    return np.dtype([('bool_', '?'), ('uint_', 'u4'), ('float_', 'f4'), ('ldbl_', 'g')])
+    return np.dtype([("bool_", "?"), ("uint_", "u4"), ("float_", "f4"), ("ldbl_", "g")])
 def dt_fmt():
     from sys import byteorder
-    e = '<' if byteorder == 'little' else '>'
-    return ("{{'names':['bool_','uint_','float_','ldbl_'],"
-            " 'formats':['?','" + e + "u4','" + e + "f4','" + e + "f{}'],"
-            " 'offsets':[0,4,8,{}], 'itemsize':{}}}")
+    e = "<" if byteorder == "little" else ">"
+    return (
+        "{{'names':['bool_','uint_','float_','ldbl_'],"
+        " 'formats':['?','" + e + "u4','" + e + "f4','" + e + "f{}'],"
+        " 'offsets':[0,4,8,{}], 'itemsize':{}}}"
+    )
 def simple_dtype_fmt():
-    ld = np.dtype('longdouble')
+    ld = np.dtype("longdouble")
     simple_ld_off = 12 + 4 * (ld.alignment > 4)
     return dt_fmt().format(ld.itemsize, simple_ld_off, simple_ld_off + ld.itemsize)
 def packed_dtype_fmt():
     from sys import byteorder
     return "[('bool_', '?'), ('uint_', '{e}u4'), ('float_', '{e}f4'), ('ldbl_', '{e}f{}')]".format(
-        np.dtype('longdouble').itemsize, e='<' if byteorder == 'little' else '>')
+        np.dtype("longdouble").itemsize, e="<" if byteorder == "little" else ">"
+    )
 def partial_ld_offset():
-    return 12 + 4 * (np.dtype('uint64').alignment > 4) + 8 + 8 * (
-        np.dtype('longdouble').alignment > 8)
+    return (
+        12
+        + 4 * (np.dtype("uint64").alignment > 4)
+        + 8
+        + 8 * (np.dtype("longdouble").alignment > 8)
+    )
 def partial_dtype_fmt():
-    ld = np.dtype('longdouble')
+    ld = np.dtype("longdouble")
     partial_ld_off = partial_ld_offset()
-    return dt_fmt().format(ld.itemsize, partial_ld_off, partial_ld_off + ld.itemsize)
+    partial_size = partial_ld_off + ld.itemsize
+    partial_end_padding = partial_size % np.dtype("uint64").alignment
+    return dt_fmt().format(
+        ld.itemsize, partial_ld_off, partial_size + partial_end_padding
+    )
 def partial_nested_fmt():
-    ld = np.dtype('longdouble')
+    ld = np.dtype("longdouble")
     partial_nested_off = 8 + 8 * (ld.alignment > 8)
     partial_ld_off = partial_ld_offset()
-    partial_nested_size = partial_nested_off * 2 + partial_ld_off + ld.itemsize
+    partial_size = partial_ld_off + ld.itemsize
+    partial_end_padding = partial_size % np.dtype("uint64").alignment
+    partial_nested_size = partial_nested_off * 2 + partial_size + partial_end_padding
     return "{{'names':['a'], 'formats':[{}], 'offsets':[{}], 'itemsize':{}}}".format(
-        partial_dtype_fmt(), partial_nested_off, partial_nested_size)
+        partial_dtype_fmt(), partial_nested_off, partial_nested_size
+    )
 def assert_equal(actual, expected_data, expected_dtype):
@@ -70,15 +89,21 @@ def assert_equal(actual, expected_data, expected_dtype):
 def test_format_descriptors():
     with pytest.raises(RuntimeError) as excinfo:
-    assert re.match('^NumPy type info missing for .*UnboundStruct.*$', str(excinfo.value))
+    assert re.match(
+        "^NumPy type info missing for .*UnboundStruct.*$", str(excinfo.value)
+    )
-    ld = np.dtype('longdouble')
-    ldbl_fmt = ('4x' if ld.alignment > 4 else '') + ld.char
+    ld = np.dtype("longdouble")
+    ldbl_fmt = ("4x" if ld.alignment > 4 else "") + ld.char
     ss_fmt = "^T{?:bool_:3xI:uint_:f:float_:" + ldbl_fmt + ":ldbl_:}"
-    dbl = np.dtype('double')
-    partial_fmt = ("^T{?:bool_:3xI:uint_:f:float_:" +
-                   str(4 * (dbl.alignment > 4) + dbl.itemsize + 8 * (ld.alignment > 8)) +
-                   "xg:ldbl_:}")
+    dbl = np.dtype("double")
+    end_padding = ld.itemsize % np.dtype("uint64").alignment
+    partial_fmt = (
+        "^T{?:bool_:3xI:uint_:f:float_:"
+        + str(4 * (dbl.alignment > 4) + dbl.itemsize + 8 * (ld.alignment > 8))
+        + "xg:ldbl_:"
+        + (str(end_padding) + "x}" if end_padding > 0 else "}")
+    )
     nested_extra = str(max(8, ld.alignment))
     assert m.print_format_descriptors() == [
@@ -88,14 +113,15 @@ def test_format_descriptors():
         "^T{" + nested_extra + "x" + partial_fmt + ":a:" + nested_extra + "x}",
         "^T{(3)4s:a:(2)i:b:(3)B:c:1x(4, 2)f:d:}",
-        '^T{q:e1:B:e2:}',
-        '^T{Zf:cflt:Zd:cdbl:}'
+        "^T{q:e1:B:e2:}",
+        "^T{Zf:cflt:Zd:cdbl:}",
 def test_dtype(simple_dtype):
     from sys import byteorder
-    e = '<' if byteorder == 'little' else '>'
+    e = "<" if byteorder == "little" else ">"
     assert m.print_dtypes() == [
@@ -104,30 +130,63 @@ def test_dtype(simple_dtype):
         "[('a', 'S3'), ('b', 'S3')]",
-        ("{{'names':['a','b','c','d'], " +
-         "'formats':[('S4', (3,)),('" + e + "i4', (2,)),('u1', (3,)),('" + e + "f4', (4, 2))], " +
-         "'offsets':[0,12,20,24], 'itemsize':56}}").format(e=e),
+        (
+            "{{'names':['a','b','c','d'], "
+            + "'formats':[('S4', (3,)),('"
+            + e
+            + "i4', (2,)),('u1', (3,)),('"
+            + e
+            + "f4', (4, 2))], "
+            + "'offsets':[0,12,20,24], 'itemsize':56}}"
+        ).format(e=e),
         "[('e1', '" + e + "i8'), ('e2', 'u1')]",
         "[('x', 'i1'), ('y', '" + e + "u8')]",
-        "[('cflt', '" + e + "c8'), ('cdbl', '" + e + "c16')]"
+        "[('cflt', '" + e + "c8'), ('cdbl', '" + e + "c16')]",
-    d1 = np.dtype({'names': ['a', 'b'], 'formats': ['int32', 'float64'],
-                   'offsets': [1, 10], 'itemsize': 20})
-    d2 = np.dtype([('a', 'i4'), ('b', 'f4')])
-    assert m.test_dtype_ctors() == [np.dtype('int32'), np.dtype('float64'),
-                                    np.dtype('bool'), d1, d1, np.dtype('uint32'), d2]
+    d1 = np.dtype(
+        {
+            "names": ["a", "b"],
+            "formats": ["int32", "float64"],
+            "offsets": [1, 10],
+            "itemsize": 20,
+        }
+    )
+    d2 = np.dtype([("a", "i4"), ("b", "f4")])
+    assert m.test_dtype_ctors() == [
+        np.dtype("int32"),
+        np.dtype("float64"),
+        np.dtype("bool"),
+        d1,
+        d1,
+        np.dtype("uint32"),
+        d2,
+    ]
-    assert m.test_dtype_methods() == [np.dtype('int32'), simple_dtype, False, True,
-                                      np.dtype('int32').itemsize, simple_dtype.itemsize]
+    assert m.test_dtype_methods() == [
+        np.dtype("int32"),
+        simple_dtype,
+        False,
+        True,
+        np.dtype("int32").itemsize,
+        simple_dtype.itemsize,
+    ]
+    assert m.trailing_padding_dtype() == m.buffer_to_dtype(
+        np.zeros(1, m.trailing_padding_dtype())
+    )
-    assert m.trailing_padding_dtype() == m.buffer_to_dtype(np.zeros(1, m.trailing_padding_dtype()))
+    assert m.test_dtype_kind() == list("iiiiiuuuuuffffcccbMmO")
+    assert m.test_dtype_char_() == list("bhilqBHILQefdgFDG?MmO")
 def test_recarray(simple_dtype, packed_dtype):
     elements = [(False, 0, 0.0, -0.0), (True, 1, 1.5, -2.5), (False, 2, 3.0, -5.0)]
-    for func, dtype in [(m.create_rec_simple, simple_dtype), (m.create_rec_packed, packed_dtype)]:
+    for func, dtype in [
+        (m.create_rec_simple, simple_dtype),
+        (m.create_rec_packed, packed_dtype),
+    ]:
         arr = func(0)
         assert arr.dtype == dtype
         assert_equal(arr, [], simple_dtype)
@@ -138,20 +197,24 @@ def test_recarray(simple_dtype, packed_dtype):
         assert_equal(arr, elements, simple_dtype)
         assert_equal(arr, elements, packed_dtype)
+        # Show what recarray's look like in NumPy.
+        assert type(arr[0]) == np.void
+        assert type(arr[0].item()) == tuple
         if dtype == simple_dtype:
             assert m.print_rec_simple(arr) == [
-                "s:0,2,3,-5"
+                "s:0,2,3,-5",
             assert m.print_rec_packed(arr) == [
-                "p:0,2,3,-5"
+                "p:0,2,3,-5",
-    nested_dtype = np.dtype([('a', simple_dtype), ('b', packed_dtype)])
+    nested_dtype = np.dtype([("a", simple_dtype), ("b", packed_dtype)])
     arr = m.create_rec_nested(0)
     assert arr.dtype == nested_dtype
@@ -159,33 +222,39 @@ def test_recarray(simple_dtype, packed_dtype):
     arr = m.create_rec_nested(3)
     assert arr.dtype == nested_dtype
-    assert_equal(arr, [((False, 0, 0.0, -0.0), (True, 1, 1.5, -2.5)),
-                       ((True, 1, 1.5, -2.5), (False, 2, 3.0, -5.0)),
-                       ((False, 2, 3.0, -5.0), (True, 3, 4.5, -7.5))], nested_dtype)
+    assert_equal(
+        arr,
+        [
+            ((False, 0, 0.0, -0.0), (True, 1, 1.5, -2.5)),
+            ((True, 1, 1.5, -2.5), (False, 2, 3.0, -5.0)),
+            ((False, 2, 3.0, -5.0), (True, 3, 4.5, -7.5)),
+        ],
+        nested_dtype,
+    )
     assert m.print_rec_nested(arr) == [
-        "n:a=s:0,2,3,-5;b=p:1,3,4.5,-7.5"
+        "n:a=s:0,2,3,-5;b=p:1,3,4.5,-7.5",
     arr = m.create_rec_partial(3)
     assert str(arr.dtype) == partial_dtype_fmt()
     partial_dtype = arr.dtype
-    assert '' not in arr.dtype.fields
+    assert "" not in arr.dtype.fields
     assert partial_dtype.itemsize > simple_dtype.itemsize
     assert_equal(arr, elements, simple_dtype)
     assert_equal(arr, elements, packed_dtype)
     arr = m.create_rec_partial_nested(3)
     assert str(arr.dtype) == partial_nested_fmt()
-    assert '' not in arr.dtype.fields
-    assert '' not in arr.dtype.fields['a'][0].fields
+    assert "" not in arr.dtype.fields
+    assert "" not in arr.dtype.fields["a"][0].fields
     assert arr.dtype.itemsize > partial_dtype.itemsize
-    np.testing.assert_equal(arr['a'], m.create_rec_partial(3))
+    np.testing.assert_equal(arr["a"], m.create_rec_partial(3))
 def test_array_constructors():
-    data = np.arange(1, 7, dtype='int32')
+    data = np.arange(1, 7, dtype="int32")
     for i in range(8):
         np.testing.assert_array_equal(m.test_array_ctors(10 + i), data.reshape((3, 2)))
         np.testing.assert_array_equal(m.test_array_ctors(20 + i), data.reshape((3, 2)))
@@ -201,82 +270,92 @@ def test_string_array():
-        "a='abc',b='abc'"
+        "a='abc',b='abc'",
     dtype = arr.dtype
-    assert arr['a'].tolist() == [b'', b'a', b'ab', b'abc']
-    assert arr['b'].tolist() == [b'', b'a', b'ab', b'abc']
+    assert arr["a"].tolist() == [b"", b"a", b"ab", b"abc"]
+    assert arr["b"].tolist() == [b"", b"a", b"ab", b"abc"]
     arr = m.create_string_array(False)
     assert dtype == arr.dtype
 def test_array_array():
     from sys import byteorder
-    e = '<' if byteorder == 'little' else '>'
+    e = "<" if byteorder == "little" else ">"
     arr = m.create_array_array(3)
     assert str(arr.dtype) == (
-        "{{'names':['a','b','c','d'], " +
-        "'formats':[('S4', (3,)),('" + e + "i4', (2,)),('u1', (3,)),('{e}f4', (4, 2))], " +
-        "'offsets':[0,12,20,24], 'itemsize':56}}").format(e=e)
+        "{{'names':['a','b','c','d'], "
+        + "'formats':[('S4', (3,)),('"
+        + e
+        + "i4', (2,)),('u1', (3,)),('{e}f4', (4, 2))], "
+        + "'offsets':[0,12,20,24], 'itemsize':56}}"
+    ).format(e=e)
     assert m.print_array_array(arr) == [
-        "a={{A,B,C,D},{K,L,M,N},{U,V,W,X}},b={0,1}," +
-        "c={0,1,2},d={{0,1},{10,11},{20,21},{30,31}}",
-        "a={{W,X,Y,Z},{G,H,I,J},{Q,R,S,T}},b={1000,1001}," +
-        "c={10,11,12},d={{100,101},{110,111},{120,121},{130,131}}",
-        "a={{S,T,U,V},{C,D,E,F},{M,N,O,P}},b={2000,2001}," +
-        "c={20,21,22},d={{200,201},{210,211},{220,221},{230,231}}",
+        "a={{A,B,C,D},{K,L,M,N},{U,V,W,X}},b={0,1},"
+        + "c={0,1,2},d={{0,1},{10,11},{20,21},{30,31}}",
+        "a={{W,X,Y,Z},{G,H,I,J},{Q,R,S,T}},b={1000,1001},"
+        + "c={10,11,12},d={{100,101},{110,111},{120,121},{130,131}}",
+        "a={{S,T,U,V},{C,D,E,F},{M,N,O,P}},b={2000,2001},"
+        + "c={20,21,22},d={{200,201},{210,211},{220,221},{230,231}}",
-    assert arr['a'].tolist() == [[b'ABCD', b'KLMN', b'UVWX'],
-                                 [b'WXYZ', b'GHIJ', b'QRST'],
-                                 [b'STUV', b'CDEF', b'MNOP']]
-    assert arr['b'].tolist() == [[0, 1], [1000, 1001], [2000, 2001]]
+    assert arr["a"].tolist() == [
+        [b"ABCD", b"KLMN", b"UVWX"],
+        [b"WXYZ", b"GHIJ", b"QRST"],
+        [b"STUV", b"CDEF", b"MNOP"],
+    ]
+    assert arr["b"].tolist() == [[0, 1], [1000, 1001], [2000, 2001]]
     assert m.create_array_array(0).dtype == arr.dtype
 def test_enum_array():
     from sys import byteorder
-    e = '<' if byteorder == 'little' else '>'
+    e = "<" if byteorder == "little" else ">"
     arr = m.create_enum_array(3)
     dtype = arr.dtype
-    assert dtype == np.dtype([('e1', e + 'i8'), ('e2', 'u1')])
-    assert m.print_enum_array(arr) == [
-        "e1=A,e2=X",
-        "e1=B,e2=Y",
-        "e1=A,e2=X"
-    ]
-    assert arr['e1'].tolist() == [-1, 1, -1]
-    assert arr['e2'].tolist() == [1, 2, 1]
+    assert dtype == np.dtype([("e1", e + "i8"), ("e2", "u1")])
+    assert m.print_enum_array(arr) == ["e1=A,e2=X", "e1=B,e2=Y", "e1=A,e2=X"]
+    assert arr["e1"].tolist() == [-1, 1, -1]
+    assert arr["e2"].tolist() == [1, 2, 1]
     assert m.create_enum_array(0).dtype == dtype
 def test_complex_array():
     from sys import byteorder
-    e = '<' if byteorder == 'little' else '>'
+    e = "<" if byteorder == "little" else ">"
     arr = m.create_complex_array(3)
     dtype = arr.dtype
-    assert dtype == np.dtype([('cflt', e + 'c8'), ('cdbl', e + 'c16')])
+    assert dtype == np.dtype([("cflt", e + "c8"), ("cdbl", e + "c16")])
     assert m.print_complex_array(arr) == [
-        "c:(2,2.25),(2.5,2.75)"
+        "c:(2,2.25),(2.5,2.75)",
-    assert arr['cflt'].tolist() == [0.0 + 0.25j, 1.0 + 1.25j, 2.0 + 2.25j]
-    assert arr['cdbl'].tolist() == [0.5 + 0.75j, 1.5 + 1.75j, 2.5 + 2.75j]
+    assert arr["cflt"].tolist() == [0.0 + 0.25j, 1.0 + 1.25j, 2.0 + 2.25j]
+    assert arr["cdbl"].tolist() == [0.5 + 0.75j, 1.5 + 1.75j, 2.5 + 2.75j]
     assert m.create_complex_array(0).dtype == dtype
 def test_signature(doc):
-    assert doc(m.create_rec_nested) == \
-        "create_rec_nested(arg0: int) -> numpy.ndarray[NestedStruct]"
+    assert (
+        doc(m.create_rec_nested)
+        == "create_rec_nested(arg0: int) -> numpy.ndarray[NestedStruct]"
+    )
 def test_scalar_conversion():
     n = 3
-    arrays = [m.create_rec_simple(n), m.create_rec_packed(n),
-              m.create_rec_nested(n), m.create_enum_array(n)]
+    arrays = [
+        m.create_rec_simple(n),
+        m.create_rec_packed(n),
+        m.create_rec_nested(n),
+        m.create_enum_array(n),
+    ]
     funcs = [m.f_simple, m.f_packed, m.f_nested]
     for i, func in enumerate(funcs):
@@ -286,18 +365,68 @@ def test_scalar_conversion():
                 with pytest.raises(TypeError) as excinfo:
-                assert 'incompatible function arguments' in str(excinfo.value)
+                assert "incompatible function arguments" in str(excinfo.value)
+def test_vectorize():
+    n = 3
+    array = m.create_rec_simple(n)
+    values = m.f_simple_vectorized(array)
+    np.testing.assert_array_equal(values, [0, 10, 20])
+    array_2 = m.f_simple_pass_thru_vectorized(array)
+    np.testing.assert_array_equal(array, array_2)
+def test_cls_and_dtype_conversion(simple_dtype):
+    s = m.SimpleStruct()
+    assert s.astuple() == (False, 0, 0.0, 0.0)
+    assert m.SimpleStruct.fromtuple(s.astuple()).astuple() == s.astuple()
+    s.uint_ = 2
+    assert m.f_simple(s) == 20
+    # Try as recarray of shape==(1,).
+    s_recarray = np.array([(False, 2, 0.0, 0.0)], dtype=simple_dtype)
+    # Show that this will work for vectorized case.
+    np.testing.assert_array_equal(m.f_simple_vectorized(s_recarray), [20])
+    # Show as a scalar that inherits from np.generic.
+    s_scalar = s_recarray[0]
+    assert isinstance(s_scalar, np.void)
+    assert m.f_simple(s_scalar) == 20
+    # Show that an *array* scalar (np.ndarray.shape == ()) does not convert.
+    # More specifically, conversion to SimpleStruct is not implicit.
+    s_recarray_scalar = s_recarray.reshape(())
+    assert isinstance(s_recarray_scalar, np.ndarray)
+    assert s_recarray_scalar.dtype == simple_dtype
+    with pytest.raises(TypeError) as excinfo:
+        m.f_simple(s_recarray_scalar)
+    assert "incompatible function arguments" in str(excinfo.value)
+    # Explicitly convert to m.SimpleStruct.
+    assert m.f_simple(m.SimpleStruct.fromtuple(s_recarray_scalar.item())) == 20
+    # Show that an array of dtype=object does *not* convert.
+    s_array_object = np.array([s])
+    assert s_array_object.dtype == object
+    with pytest.raises(TypeError) as excinfo:
+        m.f_simple_vectorized(s_array_object)
+    assert "incompatible function arguments" in str(excinfo.value)
+    # Explicitly convert to `np.array(..., dtype=simple_dtype)`
+    s_array = np.array([s.astuple()], dtype=simple_dtype)
+    np.testing.assert_array_equal(m.f_simple_vectorized(s_array), [20])
 def test_register_dtype():
     with pytest.raises(RuntimeError) as excinfo:
-    assert 'dtype is already registered' in str(excinfo.value)
+    assert "dtype is already registered" in str(excinfo.value)
 def test_str_leak():
     from sys import getrefcount
     fmt = "f4"
     start = getrefcount(fmt)
diff --git a/wrap/pybind11/tests/test_numpy_vectorize.cpp b/wrap/pybind11/tests/test_numpy_vectorize.cpp
index e76e462cbf..eb5281fb1d 100644
--- a/wrap/pybind11/tests/test_numpy_vectorize.cpp
+++ b/wrap/pybind11/tests/test_numpy_vectorize.cpp
@@ -11,13 +11,15 @@
 #include "pybind11_tests.h"
 #include <pybind11/numpy.h>
+#include <utility>
 double my_func(int x, float y, double z) {
     py::print("my_func(x:int={}, y:float={:.0f}, z:float={:.0f})"_s.format(x, y, z));
     return (float) x*y*z;
 TEST_SUBMODULE(numpy_vectorize, m) {
-    try { py::module::import("numpy"); }
+    try { py::module_::import("numpy"); }
     catch (...) { return; }
     // test_vectorize, test_docs, test_array_collapse
@@ -25,11 +27,10 @@ TEST_SUBMODULE(numpy_vectorize, m) {
     m.def("vectorized_func", py::vectorize(my_func));
     // Vectorize a lambda function with a capture object (e.g. to exclude some arguments from the vectorization)
-    m.def("vectorized_func2",
-        [](py::array_t<int> x, py::array_t<float> y, float z) {
-            return py::vectorize([z](int x, float y) { return my_func(x, y, z); })(x, y);
-        }
-    );
+    m.def("vectorized_func2", [](py::array_t<int> x, py::array_t<float> y, float z) {
+        return py::vectorize([z](int x, float y) { return my_func(x, y, z); })(std::move(x),
+                                                                               std::move(y));
+    });
     // Vectorize a complex-valued function
     m.def("vectorized_func3", py::vectorize(
@@ -38,29 +39,40 @@ TEST_SUBMODULE(numpy_vectorize, m) {
     // test_type_selection
     // NumPy function which only accepts specific data types
-    m.def("selective_func", [](py::array_t<int, py::array::c_style>) { return "Int branch taken."; });
-    m.def("selective_func", [](py::array_t<float, py::array::c_style>) { return "Float branch taken."; });
-    m.def("selective_func", [](py::array_t<std::complex<float>, py::array::c_style>) { return "Complex float branch taken."; });
+    // A lot of these no lints could be replaced with const refs, and probably should at some point.
+    m.def("selective_func",
+          [](const py::array_t<int, py::array::c_style> &) { return "Int branch taken."; });
+    m.def("selective_func",
+          [](const py::array_t<float, py::array::c_style> &) { return "Float branch taken."; });
+    m.def("selective_func", [](const py::array_t<std::complex<float>, py::array::c_style> &) {
+        return "Complex float branch taken.";
+    });
     // test_passthrough_arguments
     // Passthrough test: references and non-pod types should be automatically passed through (in the
     // function definition below, only `b`, `d`, and `g` are vectorized):
     struct NonPODClass {
-        NonPODClass(int v) : value{v} {}
+        explicit NonPODClass(int v) : value{v} {}
         int value;
-    py::class_<NonPODClass>(m, "NonPODClass").def(py::init<int>());
-    m.def("vec_passthrough", py::vectorize(
-        [](double *a, double b, py::array_t<double> c, const int &d, int &e, NonPODClass f, const double g) {
-            return *a + b + c.at(0) + d + e + f.value + g;
-        }
-    ));
+    py::class_<NonPODClass>(m, "NonPODClass")
+        .def(py::init<int>())
+        .def_readwrite("value", &NonPODClass::value);
+    m.def("vec_passthrough",
+          py::vectorize([](const double *a,
+                           double b,
+                           // Changing this broke things
+                           // NOLINTNEXTLINE(performance-unnecessary-value-param)
+                           py::array_t<double> c,
+                           const int &d,
+                           int &e,
+                           NonPODClass f,
+                           const double g) { return *a + b + c.at(0) + d + e + f.value + g; }));
     // test_method_vectorization
     struct VectorizeTestClass {
-        VectorizeTestClass(int v) : value{v} {};
-        float method(int x, float y) { return y + (float) (x + value); }
+        explicit VectorizeTestClass(int v) : value{v} {};
+        float method(int x, float y) const { return y + (float) (x + value); }
         int value = 0;
     py::class_<VectorizeTestClass> vtc(m, "VectorizeTestClass");
@@ -76,14 +88,16 @@ TEST_SUBMODULE(numpy_vectorize, m) {
         .value("f_trivial", py::detail::broadcast_trivial::f_trivial)
         .value("c_trivial", py::detail::broadcast_trivial::c_trivial)
         .value("non_trivial", py::detail::broadcast_trivial::non_trivial);
-    m.def("vectorized_is_trivial", [](
-                py::array_t<int, py::array::forcecast> arg1,
-                py::array_t<float, py::array::forcecast> arg2,
-                py::array_t<double, py::array::forcecast> arg3
-                ) {
-        ssize_t ndim;
-        std::vector<ssize_t> shape;
-        std::array<py::buffer_info, 3> buffers {{ arg1.request(), arg2.request(), arg3.request() }};
-        return py::detail::broadcast(buffers, ndim, shape);
-    });
+    m.def("vectorized_is_trivial",
+          [](const py::array_t<int, py::array::forcecast> &arg1,
+             const py::array_t<float, py::array::forcecast> &arg2,
+             const py::array_t<double, py::array::forcecast> &arg3) {
+              py::ssize_t ndim = 0;
+              std::vector<py::ssize_t> shape;
+              std::array<py::buffer_info, 3> buffers{
+                  {arg1.request(), arg2.request(), arg3.request()}};
+              return py::detail::broadcast(buffers, ndim, shape);
+          });
+    m.def("add_to", py::vectorize([](NonPODClass& x, int a) { x.value += a; }));
diff --git a/wrap/pybind11/tests/test_numpy_vectorize.py b/wrap/pybind11/tests/test_numpy_vectorize.py
index 54e44cd8d3..de5c9a607d 100644
--- a/wrap/pybind11/tests/test_numpy_vectorize.py
+++ b/wrap/pybind11/tests/test_numpy_vectorize.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import pytest
 from pybind11_tests import numpy_vectorize as m
 np = pytest.importorskip("numpy")
@@ -17,28 +18,40 @@ def test_vectorize(capture):
         assert capture == "my_func(x:int=1, y:float=2, z:float=3)"
         with capture:
             assert np.allclose(f(np.array([1, 3]), np.array([2, 4]), 3), [6, 36])
-        assert capture == """
+        assert (
+            capture
+            == """
             my_func(x:int=1, y:float=2, z:float=3)
             my_func(x:int=3, y:float=4, z:float=3)
+        )
         with capture:
-            a = np.array([[1, 2], [3, 4]], order='F')
-            b = np.array([[10, 20], [30, 40]], order='F')
+            a = np.array([[1, 2], [3, 4]], order="F")
+            b = np.array([[10, 20], [30, 40]], order="F")
             c = 3
             result = f(a, b, c)
             assert np.allclose(result, a * b * c)
             assert result.flags.f_contiguous
         # All inputs are F order and full or singletons, so we the result is in col-major order:
-        assert capture == """
+        assert (
+            capture
+            == """
             my_func(x:int=1, y:float=10, z:float=3)
             my_func(x:int=3, y:float=30, z:float=3)
             my_func(x:int=2, y:float=20, z:float=3)
             my_func(x:int=4, y:float=40, z:float=3)
+        )
         with capture:
-            a, b, c = np.array([[1, 3, 5], [7, 9, 11]]), np.array([[2, 4, 6], [8, 10, 12]]), 3
+            a, b, c = (
+                np.array([[1, 3, 5], [7, 9, 11]]),
+                np.array([[2, 4, 6], [8, 10, 12]]),
+                3,
+            )
             assert np.allclose(f(a, b, c), a * b * c)
-        assert capture == """
+        assert (
+            capture
+            == """
             my_func(x:int=1, y:float=2, z:float=3)
             my_func(x:int=3, y:float=4, z:float=3)
             my_func(x:int=5, y:float=6, z:float=3)
@@ -46,10 +59,13 @@ def test_vectorize(capture):
             my_func(x:int=9, y:float=10, z:float=3)
             my_func(x:int=11, y:float=12, z:float=3)
+        )
         with capture:
             a, b, c = np.array([[1, 2, 3], [4, 5, 6]]), np.array([2, 3, 4]), 2
             assert np.allclose(f(a, b, c), a * b * c)
-        assert capture == """
+        assert (
+            capture
+            == """
             my_func(x:int=1, y:float=2, z:float=2)
             my_func(x:int=2, y:float=3, z:float=2)
             my_func(x:int=3, y:float=4, z:float=2)
@@ -57,10 +73,13 @@ def test_vectorize(capture):
             my_func(x:int=5, y:float=3, z:float=2)
             my_func(x:int=6, y:float=4, z:float=2)
+        )
         with capture:
             a, b, c = np.array([[1, 2, 3], [4, 5, 6]]), np.array([[2], [3]]), 2
             assert np.allclose(f(a, b, c), a * b * c)
-        assert capture == """
+        assert (
+            capture
+            == """
             my_func(x:int=1, y:float=2, z:float=2)
             my_func(x:int=2, y:float=2, z:float=2)
             my_func(x:int=3, y:float=2, z:float=2)
@@ -68,10 +87,17 @@ def test_vectorize(capture):
             my_func(x:int=5, y:float=3, z:float=2)
             my_func(x:int=6, y:float=3, z:float=2)
+        )
         with capture:
-            a, b, c = np.array([[1, 2, 3], [4, 5, 6]], order='F'), np.array([[2], [3]]), 2
+            a, b, c = (
+                np.array([[1, 2, 3], [4, 5, 6]], order="F"),
+                np.array([[2], [3]]),
+                2,
+            )
             assert np.allclose(f(a, b, c), a * b * c)
-        assert capture == """
+        assert (
+            capture
+            == """
             my_func(x:int=1, y:float=2, z:float=2)
             my_func(x:int=2, y:float=2, z:float=2)
             my_func(x:int=3, y:float=2, z:float=2)
@@ -79,36 +105,53 @@ def test_vectorize(capture):
             my_func(x:int=5, y:float=3, z:float=2)
             my_func(x:int=6, y:float=3, z:float=2)
+        )
         with capture:
             a, b, c = np.array([[1, 2, 3], [4, 5, 6]])[::, ::2], np.array([[2], [3]]), 2
             assert np.allclose(f(a, b, c), a * b * c)
-        assert capture == """
+        assert (
+            capture
+            == """
             my_func(x:int=1, y:float=2, z:float=2)
             my_func(x:int=3, y:float=2, z:float=2)
             my_func(x:int=4, y:float=3, z:float=2)
             my_func(x:int=6, y:float=3, z:float=2)
+        )
         with capture:
-            a, b, c = np.array([[1, 2, 3], [4, 5, 6]], order='F')[::, ::2], np.array([[2], [3]]), 2
+            a, b, c = (
+                np.array([[1, 2, 3], [4, 5, 6]], order="F")[::, ::2],
+                np.array([[2], [3]]),
+                2,
+            )
             assert np.allclose(f(a, b, c), a * b * c)
-        assert capture == """
+        assert (
+            capture
+            == """
             my_func(x:int=1, y:float=2, z:float=2)
             my_func(x:int=3, y:float=2, z:float=2)
             my_func(x:int=4, y:float=3, z:float=2)
             my_func(x:int=6, y:float=3, z:float=2)
+        )
 def test_type_selection():
     assert m.selective_func(np.array([1], dtype=np.int32)) == "Int branch taken."
     assert m.selective_func(np.array([1.0], dtype=np.float32)) == "Float branch taken."
-    assert m.selective_func(np.array([1.0j], dtype=np.complex64)) == "Complex float branch taken."
+    assert (
+        m.selective_func(np.array([1.0j], dtype=np.complex64))
+        == "Complex float branch taken."
+    )
 def test_docs(doc):
-    assert doc(m.vectorized_func) == """
+    assert (
+        doc(m.vectorized_func)
+        == """
         vectorized_func(arg0: numpy.ndarray[numpy.int32], arg1: numpy.ndarray[numpy.float32], arg2: numpy.ndarray[numpy.float64]) -> object
     """  # noqa: E501 line too long
+    )
 def test_trivial_broadcasting():
@@ -116,16 +159,24 @@ def test_trivial_broadcasting():
     assert vectorized_is_trivial(1, 2, 3) == trivial.c_trivial
     assert vectorized_is_trivial(np.array(1), np.array(2), 3) == trivial.c_trivial
-    assert vectorized_is_trivial(np.array([1, 3]), np.array([2, 4]), 3) == trivial.c_trivial
+    assert (
+        vectorized_is_trivial(np.array([1, 3]), np.array([2, 4]), 3)
+        == trivial.c_trivial
+    )
     assert trivial.c_trivial == vectorized_is_trivial(
-        np.array([[1, 3, 5], [7, 9, 11]]), np.array([[2, 4, 6], [8, 10, 12]]), 3)
-    assert vectorized_is_trivial(
-        np.array([[1, 2, 3], [4, 5, 6]]), np.array([2, 3, 4]), 2) == trivial.non_trivial
-    assert vectorized_is_trivial(
-        np.array([[1, 2, 3], [4, 5, 6]]), np.array([[2], [3]]), 2) == trivial.non_trivial
-    z1 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype='int32')
-    z2 = np.array(z1, dtype='float32')
-    z3 = np.array(z1, dtype='float64')
+        np.array([[1, 3, 5], [7, 9, 11]]), np.array([[2, 4, 6], [8, 10, 12]]), 3
+    )
+    assert (
+        vectorized_is_trivial(np.array([[1, 2, 3], [4, 5, 6]]), np.array([2, 3, 4]), 2)
+        == trivial.non_trivial
+    )
+    assert (
+        vectorized_is_trivial(np.array([[1, 2, 3], [4, 5, 6]]), np.array([[2], [3]]), 2)
+        == trivial.non_trivial
+    )
+    z1 = np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype="int32")
+    z2 = np.array(z1, dtype="float32")
+    z3 = np.array(z1, dtype="float64")
     assert vectorized_is_trivial(z1, z2, z3) == trivial.c_trivial
     assert vectorized_is_trivial(1, z2, z3) == trivial.c_trivial
     assert vectorized_is_trivial(z1, 1, z3) == trivial.c_trivial
@@ -135,7 +186,7 @@ def test_trivial_broadcasting():
     assert vectorized_is_trivial(1, 1, z3[::2, ::2]) == trivial.non_trivial
     assert vectorized_is_trivial(z1, 1, z3[1::4, 1::4]) == trivial.c_trivial
-    y1 = np.array(z1, order='F')
+    y1 = np.array(z1, order="F")
     y2 = np.array(y1)
     y3 = np.array(y1)
     assert vectorized_is_trivial(y1, y2, y3) == trivial.f_trivial
@@ -156,30 +207,41 @@ def test_trivial_broadcasting():
 def test_passthrough_arguments(doc):
     assert doc(m.vec_passthrough) == (
-        "vec_passthrough(" + ", ".join([
-            "arg0: float",
-            "arg1: numpy.ndarray[numpy.float64]",
-            "arg2: numpy.ndarray[numpy.float64]",
-            "arg3: numpy.ndarray[numpy.int32]",
-            "arg4: int",
-            "arg5: m.numpy_vectorize.NonPODClass",
-            "arg6: numpy.ndarray[numpy.float64]"]) + ") -> object")
-    b = np.array([[10, 20, 30]], dtype='float64')
+        "vec_passthrough("
+        + ", ".join(
+            [
+                "arg0: float",
+                "arg1: numpy.ndarray[numpy.float64]",
+                "arg2: numpy.ndarray[numpy.float64]",
+                "arg3: numpy.ndarray[numpy.int32]",
+                "arg4: int",
+                "arg5: m.numpy_vectorize.NonPODClass",
+                "arg6: numpy.ndarray[numpy.float64]",
+            ]
+        )
+        + ") -> object"
+    )
+    b = np.array([[10, 20, 30]], dtype="float64")
     c = np.array([100, 200])  # NOT a vectorized argument
-    d = np.array([[1000], [2000], [3000]], dtype='int')
-    g = np.array([[1000000, 2000000, 3000000]], dtype='int')  # requires casting
+    d = np.array([[1000], [2000], [3000]], dtype="int")
+    g = np.array([[1000000, 2000000, 3000000]], dtype="int")  # requires casting
     assert np.all(
-        m.vec_passthrough(1, b, c, d, 10000, m.NonPODClass(100000), g) ==
-        np.array([[1111111, 2111121, 3111131],
-                  [1112111, 2112121, 3112131],
-                  [1113111, 2113121, 3113131]]))
+        m.vec_passthrough(1, b, c, d, 10000, m.NonPODClass(100000), g)
+        == np.array(
+            [
+                [1111111, 2111121, 3111131],
+                [1112111, 2112121, 3112131],
+                [1113111, 2113121, 3113131],
+            ]
+        )
+    )
 def test_method_vectorization():
     o = m.VectorizeTestClass(3)
-    x = np.array([1, 2], dtype='int')
-    y = np.array([[10], [20]], dtype='float32')
+    x = np.array([1, 2], dtype="int")
+    y = np.array([[10], [20]], dtype="float32")
     assert np.all(o.method(x, y) == [[14, 15], [24, 25]])
@@ -188,7 +250,18 @@ def test_array_collapse():
     assert not isinstance(m.vectorized_func(np.array(1), 2, 3), np.ndarray)
     z = m.vectorized_func([1], 2, 3)
     assert isinstance(z, np.ndarray)
-    assert z.shape == (1, )
+    assert z.shape == (1,)
     z = m.vectorized_func(1, [[[2]]], 3)
     assert isinstance(z, np.ndarray)
     assert z.shape == (1, 1, 1)
+def test_vectorized_noreturn():
+    x = m.NonPODClass(0)
+    assert x.value == 0
+    m.add_to(x, [1, 2, 3, 4])
+    assert x.value == 10
+    m.add_to(x, 1)
+    assert x.value == 11
+    m.add_to(x, [[1, 1], [2, 3]])
+    assert x.value == 18
diff --git a/wrap/pybind11/tests/test_opaque_types.cpp b/wrap/pybind11/tests/test_opaque_types.cpp
index 594c45a089..804de6d4ff 100644
--- a/wrap/pybind11/tests/test_opaque_types.cpp
+++ b/wrap/pybind11/tests/test_opaque_types.cpp
@@ -44,7 +44,7 @@ TEST_SUBMODULE(opaque_types, m) {
     m.def("print_opaque_list", [](const StringList &l) {
         std::string ret = "Opaque list: [";
         bool first = true;
-        for (auto entry : l) {
+        for (const auto &entry : l) {
             if (!first)
                 ret += ", ";
             ret += entry;
@@ -64,4 +64,10 @@ TEST_SUBMODULE(opaque_types, m) {
         result->push_back("some value");
         return std::unique_ptr<StringList>(result);
+    // test unions
+    py::class_<IntFloat>(m, "IntFloat")
+        .def(py::init<>())
+        .def_readwrite("i", &IntFloat::i)
+        .def_readwrite("f", &IntFloat::f);
diff --git a/wrap/pybind11/tests/test_opaque_types.py b/wrap/pybind11/tests/test_opaque_types.py
index 3f2392775d..5495cb6b4a 100644
--- a/wrap/pybind11/tests/test_opaque_types.py
+++ b/wrap/pybind11/tests/test_opaque_types.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 import pytest
-from pybind11_tests import opaque_types as m
 from pybind11_tests import ConstructorStats, UserType
+from pybind11_tests import opaque_types as m
 def test_string_list():
@@ -32,12 +33,15 @@ def test_pointers(msg):
     with pytest.raises(TypeError) as excinfo:
         m.get_void_ptr_value([1, 2, 3])  # This should not work
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         get_void_ptr_value(): incompatible function arguments. The following argument types are supported:
             1. (arg0: capsule) -> int
         Invoked with: [1, 2, 3]
     """  # noqa: E501 line too long
+    )
     assert m.return_null_str() is None
     assert m.get_null_str_value(m.return_null_str()) is not None
@@ -45,3 +49,11 @@ def test_pointers(msg):
     ptr = m.return_unique_ptr()
     assert "StringList" in repr(ptr)
     assert m.print_opaque_list(ptr) == "Opaque list: [some value]"
+def test_unions():
+    int_float_union = m.IntFloat()
+    int_float_union.i = 42
+    assert int_float_union.i == 42
+    int_float_union.f = 3.0
+    assert int_float_union.f == 3.0
diff --git a/wrap/pybind11/tests/test_operator_overloading.cpp b/wrap/pybind11/tests/test_operator_overloading.cpp
index d55495471a..ffa059d5b5 100644
--- a/wrap/pybind11/tests/test_operator_overloading.cpp
+++ b/wrap/pybind11/tests/test_operator_overloading.cpp
@@ -7,18 +7,28 @@
     BSD-style license that can be found in the LICENSE file.
-#include "pybind11_tests.h"
 #include "constructor_stats.h"
-#include <pybind11/operators.h>
+#include "pybind11_tests.h"
 #include <functional>
+#include <pybind11/operators.h>
+#include <pybind11/stl.h>
 class Vector2 {
     Vector2(float x, float y) : x(x), y(y) { print_created(this, toString()); }
     Vector2(const Vector2 &v) : x(v.x), y(v.y) { print_copy_created(this); }
-    Vector2(Vector2 &&v) : x(v.x), y(v.y) { print_move_created(this); v.x = v.y = 0; }
+    Vector2(Vector2 &&v) noexcept : x(v.x), y(v.y) {
+        print_move_created(this);
+        v.x = v.y = 0;
+    }
     Vector2 &operator=(const Vector2 &v) { x = v.x; y = v.y; print_copy_assigned(this); return *this; }
-    Vector2 &operator=(Vector2 &&v) { x = v.x; y = v.y; v.x = v.y = 0; print_move_assigned(this); return *this; }
+    Vector2 &operator=(Vector2 &&v) noexcept {
+        x   = v.x;
+        y   = v.y;
+        v.x = v.y = 0;
+        print_move_assigned(this);
+        return *this;
+    }
     ~Vector2() { print_destroyed(this); }
     std::string toString() const { return "[" + std::to_string(x) + ", " + std::to_string(y) + "]"; }
@@ -62,6 +72,12 @@ int operator+(const C2 &, const C2 &) { return 22; }
 int operator+(const C2 &, const C1 &) { return 21; }
 int operator+(const C1 &, const C2 &) { return 12; }
+struct HashMe {
+    std::string member;
+bool operator==(const HashMe &lhs, const HashMe &rhs) { return lhs.member == rhs.member; }
 // Note: Specializing explicit within `namespace std { ... }` is done due to a
 // bug in GCC<7. If you are supporting compilers later than this, consider
 // specializing `using template<> struct std::hash<...>` in the global
@@ -73,6 +89,14 @@ namespace std {
         // Not a good hash function, but easy to test
         size_t operator()(const Vector2 &) { return 4; }
+    // HashMe has a hash function in C++ but no `__hash__` for Python.
+    template <>
+    struct hash<HashMe> {
+        std::size_t operator()(const HashMe &selector) const {
+            return std::hash<std::string>()(selector.member);
+        }
+    };
 } // namespace std
 // Not a good abs function, but easy to test.
@@ -80,8 +104,8 @@ std::string abs(const Vector2&) {
     return "abs(Vector2)";
-// MSVC warns about unknown pragmas, and warnings are errors.
-#ifndef _MSC_VER
+// MSVC & Intel warns about unknown pragmas, and warnings are errors.
+#if !defined(_MSC_VER) && !defined(__INTEL_COMPILER)
   #pragma GCC diagnostic push
   // clang 7.0.0 and Apple LLVM 10.0.1 introduce `-Wself-assign-overloaded` to
   // `-Wall`, which is used here for overloading (e.g. `py::self += py::self `).
@@ -89,7 +113,7 @@ std::string abs(const Vector2&) {
   // Taken from: https://github.com/RobotLocomotion/drake/commit/aaf84b46
   // TODO(eric): This could be resolved using a function / functor (e.g. `py::self()`).
   #if defined(__APPLE__) && defined(__clang__)
-    #if (__clang_major__ >= 10) && (__clang_minor__ >= 0) && (__clang_patchlevel__ >= 1)
+    #if (__clang_major__ >= 10)
       #pragma GCC diagnostic ignored "-Wself-assign-overloaded"
   #elif defined(__clang__)
@@ -219,8 +243,12 @@ TEST_SUBMODULE(operators, m) {
         .def("__hash__", &Hashable::hash)
         .def(py::self == py::self);
-#ifndef _MSC_VER
+    // define __eq__ but not __hash__
+    py::class_<HashMe>(m, "HashMe").def(py::self == py::self);
+    m.def("get_unhashable_HashMe_set", []() { return std::unordered_set<HashMe>{{"one"}}; });
+#if !defined(_MSC_VER) && !defined(__INTEL_COMPILER)
   #pragma GCC diagnostic pop
diff --git a/wrap/pybind11/tests/test_operator_overloading.py b/wrap/pybind11/tests/test_operator_overloading.py
index 39e3aee271..8cf375b6da 100644
--- a/wrap/pybind11/tests/test_operator_overloading.py
+++ b/wrap/pybind11/tests/test_operator_overloading.py
@@ -1,7 +1,9 @@
 # -*- coding: utf-8 -*-
 import pytest
-from pybind11_tests import operators as m
+import env
 from pybind11_tests import ConstructorStats
+from pybind11_tests import operators as m
 def test_operator_overloading():
@@ -56,23 +58,23 @@ def test_operator_overloading():
     del v3
     assert cstats.alive() == 0
     assert cstats.values() == [
-        '[1.000000, 2.000000]',
-        '[3.000000, -1.000000]',
-        '[1.000000, 2.000000]',
-        '[-3.000000, 1.000000]',
-        '[4.000000, 1.000000]',
-        '[-2.000000, 3.000000]',
-        '[-7.000000, -6.000000]',
-        '[9.000000, 10.000000]',
-        '[8.000000, 16.000000]',
-        '[0.125000, 0.250000]',
-        '[7.000000, 6.000000]',
-        '[9.000000, 10.000000]',
-        '[8.000000, 16.000000]',
-        '[8.000000, 4.000000]',
-        '[3.000000, -2.000000]',
-        '[3.000000, -0.500000]',
-        '[6.000000, -2.000000]',
+        "[1.000000, 2.000000]",
+        "[3.000000, -1.000000]",
+        "[1.000000, 2.000000]",
+        "[-3.000000, 1.000000]",
+        "[4.000000, 1.000000]",
+        "[-2.000000, 3.000000]",
+        "[-7.000000, -6.000000]",
+        "[9.000000, 10.000000]",
+        "[8.000000, 16.000000]",
+        "[0.125000, 0.250000]",
+        "[7.000000, 6.000000]",
+        "[9.000000, 10.000000]",
+        "[8.000000, 16.000000]",
+        "[8.000000, 4.000000]",
+        "[3.000000, -2.000000]",
+        "[3.000000, -0.500000]",
+        "[6.000000, -2.000000]",
     assert cstats.default_constructions == 0
     assert cstats.copy_constructions == 0
@@ -134,8 +136,9 @@ def test_overriding_eq_reset_hash():
     assert m.Comparable(15) is not m.Comparable(15)
     assert m.Comparable(15) == m.Comparable(15)
-    with pytest.raises(TypeError):
-        hash(m.Comparable(15))  # TypeError: unhashable type: 'm.Comparable'
+    with pytest.raises(TypeError) as excinfo:
+        hash(m.Comparable(15))
+    assert str(excinfo.value).startswith("unhashable type:")
     for hashable in (m.Hashable, m.Hashable2):
         assert hashable(15) is not hashable(15)
@@ -143,3 +146,10 @@ def test_overriding_eq_reset_hash():
         assert hash(hashable(15)) == 15
         assert hash(hashable(15)) == hash(hashable(15))
+def test_return_set_of_unhashable():
+    with pytest.raises(TypeError) as excinfo:
+        m.get_unhashable_HashMe_set()
+    if not env.PY2:
+        assert str(excinfo.value.__cause__).startswith("unhashable type:")
diff --git a/wrap/pybind11/tests/test_pickling.cpp b/wrap/pybind11/tests/test_pickling.cpp
index 9dc63bda3b..b77636dd1a 100644
--- a/wrap/pybind11/tests/test_pickling.cpp
+++ b/wrap/pybind11/tests/test_pickling.cpp
@@ -1,7 +1,9 @@
+// clang-format off
     tests/test_pickling.cpp -- pickle support
     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+    Copyright (c) 2021 The Pybind Development Team.
     All rights reserved. Use of this source code is governed by a
     BSD-style license that can be found in the LICENSE file.
@@ -9,11 +11,63 @@
 #include "pybind11_tests.h"
+// clang-format on
+#include <memory>
+#include <stdexcept>
+#include <utility>
+namespace exercise_trampoline {
+struct SimpleBase {
+    int num               = 0;
+    virtual ~SimpleBase() = default;
+    // For compatibility with old clang versions:
+    SimpleBase()                   = default;
+    SimpleBase(const SimpleBase &) = default;
+struct SimpleBaseTrampoline : SimpleBase {};
+struct SimpleCppDerived : SimpleBase {};
+void wrap(py::module m) {
+    py::class_<SimpleBase, SimpleBaseTrampoline>(m, "SimpleBase")
+        .def(py::init<>())
+        .def_readwrite("num", &SimpleBase::num)
+        .def(py::pickle(
+            [](const py::object &self) {
+                py::dict d;
+                if (py::hasattr(self, "__dict__"))
+                    d = self.attr("__dict__");
+                return py::make_tuple(self.attr("num"), d);
+            },
+            [](const py::tuple &t) {
+                if (t.size() != 2)
+                    throw std::runtime_error("Invalid state!");
+                auto cpp_state = std::unique_ptr<SimpleBase>(new SimpleBaseTrampoline);
+                cpp_state->num = t[0].cast<int>();
+                auto py_state  = t[1].cast<py::dict>();
+                return std::make_pair(std::move(cpp_state), py_state);
+            }));
+    m.def("make_SimpleCppDerivedAsBase",
+          []() { return std::unique_ptr<SimpleBase>(new SimpleCppDerived); });
+    m.def("check_dynamic_cast_SimpleCppDerived", [](const SimpleBase *base_ptr) {
+        return dynamic_cast<const SimpleCppDerived *>(base_ptr) != nullptr;
+    });
+} // namespace exercise_trampoline
+// clang-format off
 TEST_SUBMODULE(pickling, m) {
     // test_roundtrip
     class Pickleable {
-        Pickleable(const std::string &value) : m_value(value) { }
+        explicit Pickleable(const std::string &value) : m_value(value) { }
         const std::string &value() const { return m_value; }
         void setExtra1(int extra1) { m_extra1 = extra1; }
@@ -31,7 +85,8 @@ TEST_SUBMODULE(pickling, m) {
         using Pickleable::Pickleable;
-    py::class_<Pickleable>(m, "Pickleable")
+    py::class_<Pickleable> pyPickleable(m, "Pickleable");
+    pyPickleable
         .def("value", &Pickleable::value)
         .def("extra1", &Pickleable::extra1)
@@ -43,8 +98,9 @@ TEST_SUBMODULE(pickling, m) {
         .def("__getstate__", [](const Pickleable &p) {
             /* Return a tuple that fully encodes the state of the object */
             return py::make_tuple(p.value(), p.extra1(), p.extra2());
-        })
-        .def("__setstate__", [](Pickleable &p, py::tuple t) {
+        });
+    ignoreOldStyleInitWarnings([&pyPickleable]() {
+        pyPickleable.def("__setstate__", [](Pickleable &p, const py::tuple &t) {
             if (t.size() != 3)
                 throw std::runtime_error("Invalid state!");
             /* Invoke the constructor (need to use in-place version) */
@@ -54,6 +110,7 @@ TEST_SUBMODULE(pickling, m) {
+    });
     py::class_<PickleableNew, Pickleable>(m, "PickleableNew")
@@ -61,7 +118,7 @@ TEST_SUBMODULE(pickling, m) {
             [](const PickleableNew &p) {
                 return py::make_tuple(p.value(), p.extra1(), p.extra2());
-            [](py::tuple t) {
+            [](const py::tuple &t) {
                 if (t.size() != 3)
                     throw std::runtime_error("Invalid state!");
                 auto p = PickleableNew(t[0].cast<std::string>());
@@ -69,14 +126,13 @@ TEST_SUBMODULE(pickling, m) {
                 return p;
-            }
-        ));
+            }));
 #if !defined(PYPY_VERSION)
     // test_roundtrip_with_dict
     class PickleableWithDict {
-        PickleableWithDict(const std::string &value) : value(value) { }
+        explicit PickleableWithDict(const std::string &value) : value(value) { }
         std::string value;
         int extra;
@@ -87,19 +143,20 @@ TEST_SUBMODULE(pickling, m) {
         using PickleableWithDict::PickleableWithDict;
-    py::class_<PickleableWithDict>(m, "PickleableWithDict", py::dynamic_attr())
-        .def(py::init<std::string>())
+    py::class_<PickleableWithDict> pyPickleableWithDict(m, "PickleableWithDict", py::dynamic_attr());
+    pyPickleableWithDict.def(py::init<std::string>())
         .def_readwrite("value", &PickleableWithDict::value)
         .def_readwrite("extra", &PickleableWithDict::extra)
-        .def("__getstate__", [](py::object self) {
+        .def("__getstate__", [](const py::object &self) {
             /* Also include __dict__ in state */
             return py::make_tuple(self.attr("value"), self.attr("extra"), self.attr("__dict__"));
-        })
-        .def("__setstate__", [](py::object self, py::tuple t) {
+        });
+    ignoreOldStyleInitWarnings([&pyPickleableWithDict]() {
+        pyPickleableWithDict.def("__setstate__", [](const py::object &self, const py::tuple &t) {
             if (t.size() != 3)
                 throw std::runtime_error("Invalid state!");
             /* Cast and construct */
-            auto& p = self.cast<PickleableWithDict&>();
+            auto &p = self.cast<PickleableWithDict &>();
             new (&p) PickleableWithDict(t[0].cast<std::string>());
             /* Assign C++ state */
@@ -108,11 +165,12 @@ TEST_SUBMODULE(pickling, m) {
             /* Assign Python state */
             self.attr("__dict__") = t[2];
+    });
     py::class_<PickleableWithDictNew, PickleableWithDict>(m, "PickleableWithDictNew")
-            [](py::object self) {
+            [](const py::object &self) {
                 return py::make_tuple(self.attr("value"), self.attr("extra"), self.attr("__dict__"));
             [](const py::tuple &t) {
@@ -124,7 +182,8 @@ TEST_SUBMODULE(pickling, m) {
                 auto py_state = t[2].cast<py::dict>();
                 return std::make_pair(cpp_state, py_state);
-            }
-        ));
+            }));
+    exercise_trampoline::wrap(m);
diff --git a/wrap/pybind11/tests/test_pickling.py b/wrap/pybind11/tests/test_pickling.py
index 9aee70505d..9f68f37dcf 100644
--- a/wrap/pybind11/tests/test_pickling.py
+++ b/wrap/pybind11/tests/test_pickling.py
@@ -1,8 +1,7 @@
 # -*- coding: utf-8 -*-
 import pytest
-import env  # noqa: F401
+import env
 from pybind11_tests import pickling as m
@@ -42,5 +41,42 @@ def test_roundtrip_with_dict(cls_name):
 def test_enum_pickle():
     from pybind11_tests import enums as e
     data = pickle.dumps(e.EOne, 2)
     assert e.EOne == pickle.loads(data)
+# exercise_trampoline
+class SimplePyDerived(m.SimpleBase):
+    pass
+def test_roundtrip_simple_py_derived():
+    p = SimplePyDerived()
+    p.num = 202
+    p.stored_in_dict = 303
+    data = pickle.dumps(p, pickle.HIGHEST_PROTOCOL)
+    p2 = pickle.loads(data)
+    assert isinstance(p2, SimplePyDerived)
+    assert p2.num == 202
+    assert p2.stored_in_dict == 303
+def test_roundtrip_simple_cpp_derived():
+    p = m.make_SimpleCppDerivedAsBase()
+    assert m.check_dynamic_cast_SimpleCppDerived(p)
+    p.num = 404
+    if not env.PYPY:
+        # To ensure that this unit test is not accidentally invalidated.
+        with pytest.raises(AttributeError):
+            # Mimics the `setstate` C++ implementation.
+            setattr(p, "__dict__", {})  # noqa: B010
+    data = pickle.dumps(p, pickle.HIGHEST_PROTOCOL)
+    p2 = pickle.loads(data)
+    assert isinstance(p2, m.SimpleBase)
+    assert p2.num == 404
+    # Issue #3062: pickleable base C++ classes can incur object slicing
+    #              if derived typeid is not registered with pybind11
+    assert not m.check_dynamic_cast_SimpleCppDerived(p2)
diff --git a/wrap/pybind11/tests/test_pytypes.cpp b/wrap/pybind11/tests/test_pytypes.cpp
index 4ef1b9ff0b..85cb98fcbf 100644
--- a/wrap/pybind11/tests/test_pytypes.cpp
+++ b/wrap/pybind11/tests/test_pytypes.cpp
@@ -7,17 +7,28 @@
     BSD-style license that can be found in the LICENSE file.
+#include <utility>
 #include "pybind11_tests.h"
 TEST_SUBMODULE(pytypes, m) {
+    // test_bool
+    m.def("get_bool", []{return py::bool_(false);});
     // test_int
     m.def("get_int", []{return py::int_(0);});
     // test_iterator
     m.def("get_iterator", []{return py::iterator();});
     // test_iterable
     m.def("get_iterable", []{return py::iterable();});
+    // test_float
+    m.def("get_float", []{return py::float_(0.0f);});
     // test_list
+    m.def("list_no_args", []() { return py::list{}; });
+    m.def("list_ssize_t", []() { return py::list{(py::ssize_t) 0}; });
+    m.def("list_size_t", []() { return py::list{(py::size_t) 0}; });
+    m.def("list_insert_ssize_t", [](py::list *l) { return l->insert((py::ssize_t) 1, 83); });
+    m.def("list_insert_size_t", [](py::list *l) { return l->insert((py::size_t) 3, 57); });
     m.def("get_list", []() {
         py::list list;
@@ -27,16 +38,14 @@ TEST_SUBMODULE(pytypes, m) {
         list.insert(2, "inserted-2");
         return list;
-    m.def("print_list", [](py::list list) {
+    m.def("print_list", [](const py::list &list) {
         int index = 0;
         for (auto item : list)
             py::print("list item {}: {}"_s.format(index++, item));
     // test_none
     m.def("get_none", []{return py::none();});
-    m.def("print_none", [](py::none none) {
-        py::print("none: {}"_s.format(none));
-    });
+    m.def("print_none", [](const py::none &none) { py::print("none: {}"_s.format(none)); });
     // test_set
     m.def("get_set", []() {
@@ -46,20 +55,17 @@ TEST_SUBMODULE(pytypes, m) {
         return set;
-    m.def("print_set", [](py::set set) {
+    m.def("print_set", [](const py::set &set) {
         for (auto item : set)
             py::print("key:", item);
-    m.def("set_contains", [](py::set set, py::object key) {
-        return set.contains(key);
-    });
-    m.def("set_contains", [](py::set set, const char* key) {
-        return set.contains(key);
-    });
+    m.def("set_contains",
+          [](const py::set &set, const py::object &key) { return set.contains(key); });
+    m.def("set_contains", [](const py::set &set, const char *key) { return set.contains(key); });
     // test_dict
     m.def("get_dict", []() { return py::dict("key"_a="value"); });
-    m.def("print_dict", [](py::dict dict) {
+    m.def("print_dict", [](const py::dict &dict) {
         for (auto item : dict)
             py::print("key: {}, value={}"_s.format(item.first, item.second));
@@ -68,19 +74,38 @@ TEST_SUBMODULE(pytypes, m) {
         auto d2 = py::dict("z"_a=3, **d1);
         return d2;
-    m.def("dict_contains", [](py::dict dict, py::object val) {
-        return dict.contains(val);
-    });
-    m.def("dict_contains", [](py::dict dict, const char* val) {
-        return dict.contains(val);
+    m.def("dict_contains",
+          [](const py::dict &dict, py::object val) { return dict.contains(val); });
+    m.def("dict_contains",
+          [](const py::dict &dict, const char *val) { return dict.contains(val); });
+    // test_tuple
+    m.def("tuple_no_args", []() { return py::tuple{}; });
+    m.def("tuple_ssize_t", []() { return py::tuple{(py::ssize_t) 0}; });
+    m.def("tuple_size_t", []() { return py::tuple{(py::size_t) 0}; });
+    m.def("get_tuple", []() { return py::make_tuple(42, py::none(), "spam"); });
+#if PY_VERSION_HEX >= 0x03030000
+    // test_simple_namespace
+    m.def("get_simple_namespace", []() {
+        auto ns = py::module_::import("types").attr("SimpleNamespace")("attr"_a=42, "x"_a="foo", "wrong"_a=1);
+        py::delattr(ns, "wrong");
+        py::setattr(ns, "right", py::int_(2));
+        return ns;
     // test_str
+    m.def("str_from_char_ssize_t", []() { return py::str{"red", (py::ssize_t) 3}; });
+    m.def("str_from_char_size_t", []() { return py::str{"blue", (py::size_t) 4}; });
     m.def("str_from_string", []() { return py::str(std::string("baz")); });
     m.def("str_from_bytes", []() { return py::str(py::bytes("boo", 3)); });
     m.def("str_from_object", [](const py::object& obj) { return py::str(obj); });
     m.def("repr_from_object", [](const py::object& obj) { return py::repr(obj); });
     m.def("str_from_handle", [](py::handle h) { return py::str(h); });
+    m.def("str_from_string_from_str", [](const py::str& obj) {
+        return py::str(static_cast<std::string>(obj));
+    });
     m.def("str_format", []() {
         auto s1 = "{} + {} = {}"_s.format(1, 2, 3);
@@ -89,9 +114,17 @@ TEST_SUBMODULE(pytypes, m) {
     // test_bytes
+    m.def("bytes_from_char_ssize_t", []() { return py::bytes{"green", (py::ssize_t) 5}; });
+    m.def("bytes_from_char_size_t", []() { return py::bytes{"purple", (py::size_t) 6}; });
     m.def("bytes_from_string", []() { return py::bytes(std::string("foo")); });
     m.def("bytes_from_str", []() { return py::bytes(py::str("bar", 3)); });
+    // test bytearray
+    m.def("bytearray_from_char_ssize_t", []() { return py::bytearray{"$%", (py::ssize_t) 2}; });
+    m.def("bytearray_from_char_size_t", []() { return py::bytearray{"@$!", (py::size_t) 3}; });
+    m.def("bytearray_from_string", []() { return py::bytearray(std::string("foo")); });
+    m.def("bytearray_size", []() { return py::bytearray("foo").size(); });
     // test_capsule
     m.def("return_capsule_with_destructor", []() {
         py::print("creating capsule");
@@ -108,7 +141,7 @@ TEST_SUBMODULE(pytypes, m) {
     m.def("return_capsule_with_name_and_destructor", []() {
-        auto capsule = py::capsule((void *) 1234, "pointer type description", [](PyObject *ptr) {
+        auto capsule = py::capsule((void *) 12345, "pointer type description", [](PyObject *ptr) {
             if (ptr) {
                 auto name = PyCapsule_GetName(ptr);
                 py::print("destructing capsule ({}, '{}')"_s.format(
@@ -116,19 +149,30 @@ TEST_SUBMODULE(pytypes, m) {
-        void *contents = capsule;
-        py::print("created capsule ({}, '{}')"_s.format((size_t) contents, capsule.name()));
+        capsule.set_pointer((void *) 1234);
+        // Using get_pointer<T>()
+        void* contents1 = static_cast<void*>(capsule);
+        void* contents2 = capsule.get_pointer();
+        void* contents3 = capsule.get_pointer<void>();
+        auto result1 = reinterpret_cast<size_t>(contents1);
+        auto result2 = reinterpret_cast<size_t>(contents2);
+        auto result3 = reinterpret_cast<size_t>(contents3);
+        py::print("created capsule ({}, '{}')"_s.format(result1 & result2 & result3, capsule.name()));
         return capsule;
     // test_accessors
-    m.def("accessor_api", [](py::object o) {
+    m.def("accessor_api", [](const py::object &o) {
         auto d = py::dict();
         d["basic_attr"] = o.attr("basic_attr");
         auto l = py::list();
-        for (const auto &item : o.attr("begin_end")) {
+        for (auto item : o.attr("begin_end")) {
         d["begin_end"] = l;
@@ -163,7 +207,7 @@ TEST_SUBMODULE(pytypes, m) {
         return d;
-    m.def("tuple_accessor", [](py::tuple existing_t) {
+    m.def("tuple_accessor", [](const py::tuple &existing_t) {
         try {
             existing_t[0] = 1;
         } catch (const py::error_already_set &) {
@@ -199,6 +243,7 @@ TEST_SUBMODULE(pytypes, m) {
     m.def("default_constructors", []() {
         return py::dict(
+            "bytearray"_a=py::bytearray(),
@@ -210,9 +255,10 @@ TEST_SUBMODULE(pytypes, m) {
-    m.def("converting_constructors", [](py::dict d) {
+    m.def("converting_constructors", [](const py::dict &d) {
         return py::dict(
+            "bytearray"_a=py::bytearray(d["bytearray"]),
@@ -225,10 +271,11 @@ TEST_SUBMODULE(pytypes, m) {
-    m.def("cast_functions", [](py::dict d) {
+    m.def("cast_functions", [](const py::dict &d) {
         // When converting between Python types, obj.cast<T>() should be the same as T(obj)
         return py::dict(
+            "bytearray"_a=d["bytearray"].cast<py::bytearray>(),
@@ -241,7 +288,24 @@ TEST_SUBMODULE(pytypes, m) {
-    m.def("convert_to_pybind11_str", [](py::object o) { return py::str(o); });
+    m.def("convert_to_pybind11_str", [](const py::object &o) { return py::str(o); });
+    m.def("nonconverting_constructor",
+          [](const std::string &type, py::object value, bool move) -> py::object {
+              if (type == "bytes") {
+                  return move ? py::bytes(std::move(value)) : py::bytes(value);
+              }
+              if (type == "none") {
+                  return move ? py::none(std::move(value)) : py::none(value);
+              }
+              if (type == "ellipsis") {
+                  return move ? py::ellipsis(std::move(value)) : py::ellipsis(value);
+              }
+              if (type == "type") {
+                  return move ? py::type(std::move(value)) : py::type(value);
+              }
+              throw std::runtime_error("Invalid type");
+          });
     m.def("get_implicit_casting", []() {
         py::dict d;
@@ -289,7 +353,7 @@ TEST_SUBMODULE(pytypes, m) {
         py::print("no new line here", "end"_a=" -- ");
         py::print("next print");
-        auto py_stderr = py::module::import("sys").attr("stderr");
+        auto py_stderr = py::module_::import("sys").attr("stderr");
         py::print("this goes to stderr", "file"_a=py_stderr);
         py::print("flush", "flush"_a=true);
@@ -299,9 +363,9 @@ TEST_SUBMODULE(pytypes, m) {
     m.def("print_failure", []() { py::print(42, UnregisteredType()); });
-    m.def("hash_function", [](py::object obj) { return py::hash(obj); });
+    m.def("hash_function", [](py::object obj) { return py::hash(std::move(obj)); });
-    m.def("test_number_protocol", [](py::object a, py::object b) {
+    m.def("test_number_protocol", [](const py::object &a, const py::object &b) {
         py::list l;
@@ -321,9 +385,7 @@ TEST_SUBMODULE(pytypes, m) {
         return l;
-    m.def("test_list_slicing", [](py::list a) {
-        return a[py::slice(0, -1, 2)];
-    });
+    m.def("test_list_slicing", [](const py::list &a) { return a[py::slice(0, -1, 2)]; });
     // See #2361
     m.def("issue2361_str_implicit_copy_none", []() {
@@ -335,13 +397,10 @@ TEST_SUBMODULE(pytypes, m) {
         return is_this_none;
-    m.def("test_memoryview_object", [](py::buffer b) {
-        return py::memoryview(b);
-    });
+    m.def("test_memoryview_object", [](const py::buffer &b) { return py::memoryview(b); });
-    m.def("test_memoryview_buffer_info", [](py::buffer b) {
-        return py::memoryview(b.request());
-    });
+    m.def("test_memoryview_buffer_info",
+          [](const py::buffer &b) { return py::memoryview(b.request()); });
     m.def("test_memoryview_from_buffer", [](bool is_unsigned) {
         static const int16_t si16[] = { 3, 1, 4, 1, 5 };
@@ -349,9 +408,7 @@ TEST_SUBMODULE(pytypes, m) {
         if (is_unsigned)
             return py::memoryview::from_buffer(
                 ui16, { 4 }, { sizeof(uint16_t) });
-        else
-            return py::memoryview::from_buffer(
-                si16, { 5 }, { sizeof(int16_t) });
+        return py::memoryview::from_buffer(si16, {5}, {sizeof(int16_t)});
     m.def("test_memoryview_from_buffer_nativeformat", []() {
@@ -380,7 +437,128 @@ TEST_SUBMODULE(pytypes, m) {
     m.def("test_memoryview_from_memory", []() {
         const char* buf = "\xff\xe1\xab\x37";
         return py::memoryview::from_memory(
-            buf, static_cast<ssize_t>(strlen(buf)));
+            buf, static_cast<py::ssize_t>(strlen(buf)));
+    // test_builtin_functions
+    m.def("get_len", [](py::handle h) { return py::len(h); });
+    m.attr("PYBIND11_STR_LEGACY_PERMISSIVE") = true;
+    m.def("isinstance_pybind11_bytes",
+          [](py::object o) { return py::isinstance<py::bytes>(std::move(o)); });
+    m.def("isinstance_pybind11_str",
+          [](py::object o) { return py::isinstance<py::str>(std::move(o)); });
+    m.def("pass_to_pybind11_bytes", [](py::bytes b) { return py::len(std::move(b)); });
+    m.def("pass_to_pybind11_str", [](py::str s) { return py::len(std::move(s)); });
+    m.def("pass_to_std_string", [](const std::string &s) { return s.size(); });
+    // test_weakref
+    m.def("weakref_from_handle",
+          [](py::handle h) { return py::weakref(h); });
+    m.def("weakref_from_handle_and_function",
+          [](py::handle h, py::function f) { return py::weakref(h, std::move(f)); });
+    m.def("weakref_from_object", [](const py::object &o) { return py::weakref(o); });
+    m.def("weakref_from_object_and_function",
+          [](py::object o, py::function f) { return py::weakref(std::move(o), std::move(f)); });
+// See PR #3263 for background (https://github.com/pybind/pybind11/pull/3263):
+// pytypes.h could be changed to enforce the "most correct" user code below, by removing
+// `const` from iterator `reference` using type aliases, but that will break existing
+// user code.
+#if (defined(__APPLE__) && defined(__clang__)) || defined(PYPY_VERSION)
+// This is "most correct" and enforced on these platforms.
+#    define PYBIND11_AUTO_IT auto it
+// This works on many platforms and is (unfortunately) reflective of existing user code.
+// NOLINTNEXTLINE(bugprone-macro-parentheses)
+#    define PYBIND11_AUTO_IT auto &it
+    m.def("tuple_iterator", []() {
+        auto tup = py::make_tuple(5, 7);
+        int tup_sum = 0;
+        for (PYBIND11_AUTO_IT : tup) {
+            tup_sum += it.cast<int>();
+        }
+        return tup_sum;
+    });
+    m.def("dict_iterator", []() {
+        py::dict dct;
+        dct[py::int_(3)] = 5;
+        dct[py::int_(7)] = 11;
+        int kv_sum = 0;
+        for (PYBIND11_AUTO_IT : dct) {
+            kv_sum += it.first.cast<int>() * 100 + it.second.cast<int>();
+        }
+        return kv_sum;
+    });
+    m.def("passed_iterator", [](const py::iterator &py_it) {
+        int elem_sum = 0;
+        for (PYBIND11_AUTO_IT : py_it) {
+            elem_sum += it.cast<int>();
+        }
+        return elem_sum;
+    });
+#undef PYBIND11_AUTO_IT
+    // Tests below this line are for pybind11 IMPLEMENTATION DETAILS:
+    m.def("sequence_item_get_ssize_t", [](const py::object &o) {
+        return py::detail::accessor_policies::sequence_item::get(o, (py::ssize_t) 1);
+    });
+    m.def("sequence_item_set_ssize_t", [](const py::object &o) {
+        auto s = py::str{"peppa", 5};
+        py::detail::accessor_policies::sequence_item::set(o, (py::ssize_t) 1, s);
+    });
+    m.def("sequence_item_get_size_t", [](const py::object &o) {
+        return py::detail::accessor_policies::sequence_item::get(o, (py::size_t) 2);
+    });
+    m.def("sequence_item_set_size_t", [](const py::object &o) {
+        auto s = py::str{"george", 6};
+        py::detail::accessor_policies::sequence_item::set(o, (py::size_t) 2, s);
+    });
+    m.def("list_item_get_ssize_t", [](const py::object &o) {
+        return py::detail::accessor_policies::list_item::get(o, (py::ssize_t) 3);
+    });
+    m.def("list_item_set_ssize_t", [](const py::object &o) {
+        auto s = py::str{"rebecca", 7};
+        py::detail::accessor_policies::list_item::set(o, (py::ssize_t) 3, s);
+    });
+    m.def("list_item_get_size_t", [](const py::object &o) {
+        return py::detail::accessor_policies::list_item::get(o, (py::size_t) 4);
+    });
+    m.def("list_item_set_size_t", [](const py::object &o) {
+        auto s = py::str{"richard", 7};
+        py::detail::accessor_policies::list_item::set(o, (py::size_t) 4, s);
+    });
+    m.def("tuple_item_get_ssize_t", [](const py::object &o) {
+        return py::detail::accessor_policies::tuple_item::get(o, (py::ssize_t) 5);
+    });
+    m.def("tuple_item_set_ssize_t", []() {
+        auto s0 = py::str{"emely", 5};
+        auto s1 = py::str{"edmond", 6};
+        auto o = py::tuple{2};
+        py::detail::accessor_policies::tuple_item::set(o, (py::ssize_t) 0, s0);
+        py::detail::accessor_policies::tuple_item::set(o, (py::ssize_t) 1, s1);
+        return o;
+    });
+    m.def("tuple_item_get_size_t", [](const py::object &o) {
+        return py::detail::accessor_policies::tuple_item::get(o, (py::size_t) 6);
+    });
+    m.def("tuple_item_set_size_t", []() {
+        auto s0 = py::str{"candy", 5};
+        auto s1 = py::str{"cat", 3};
+        auto o = py::tuple{2};
+        py::detail::accessor_policies::tuple_item::set(o, (py::size_t) 1, s1);
+        py::detail::accessor_policies::tuple_item::set(o, (py::size_t) 0, s0);
+        return o;
+    });
diff --git a/wrap/pybind11/tests/test_pytypes.py b/wrap/pybind11/tests/test_pytypes.py
index 0618cd54c9..2cd6c3f03a 100644
--- a/wrap/pybind11/tests/test_pytypes.py
+++ b/wrap/pybind11/tests/test_pytypes.py
@@ -1,12 +1,17 @@
 # -*- coding: utf-8 -*-
 from __future__ import division
-import pytest
 import sys
-import env  # noqa: F401
+import pytest
-from pybind11_tests import pytypes as m
+import env
 from pybind11_tests import debug_enabled
+from pybind11_tests import pytypes as m
+def test_bool(doc):
+    assert doc(m.get_bool) == "get_bool() -> bool"
 def test_int(doc):
@@ -21,20 +26,36 @@ def test_iterable(doc):
     assert doc(m.get_iterable) == "get_iterable() -> Iterable"
+def test_float(doc):
+    assert doc(m.get_float) == "get_float() -> float"
 def test_list(capture, doc):
+    assert m.list_no_args() == []
+    assert m.list_ssize_t() == []
+    assert m.list_size_t() == []
+    lins = [1, 2]
+    m.list_insert_ssize_t(lins)
+    assert lins == [1, 83, 2]
+    m.list_insert_size_t(lins)
+    assert lins == [1, 83, 2, 57]
     with capture:
         lst = m.get_list()
         assert lst == ["inserted-0", "overwritten", "inserted-2"]
-    assert capture.unordered == """
+    assert (
+        capture.unordered
+        == """
         Entry at position 0: value
         list item 0: inserted-0
         list item 1: overwritten
         list item 2: inserted-2
         list item 3: value2
+    )
     assert doc(m.get_list) == "get_list() -> list"
     assert doc(m.print_list) == "print_list(arg0: list) -> None"
@@ -52,14 +73,17 @@ def test_set(capture, doc):
     with capture:
-    assert capture.unordered == """
+    assert (
+        capture.unordered
+        == """
         key: key1
         key: key2
         key: key3
         key: key4
+    )
-    assert not m.set_contains(set([]), 42)
+    assert not m.set_contains(set(), 42)
     assert m.set_contains({42}, 42)
     assert m.set_contains({"foo"}, "foo")
@@ -74,10 +98,13 @@ def test_dict(capture, doc):
     with capture:
         d["key2"] = "value2"
-    assert capture.unordered == """
+    assert (
+        capture.unordered
+        == """
         key: key, value=value
         key: key2, value=value2
+    )
     assert not m.dict_contains({}, 42)
     assert m.dict_contains({42: None}, 42)
@@ -89,7 +116,25 @@ def test_dict(capture, doc):
     assert m.dict_keyword_constructor() == {"x": 1, "y": 2, "z": 3}
+def test_tuple():
+    assert m.tuple_no_args() == ()
+    assert m.tuple_ssize_t() == ()
+    assert m.tuple_size_t() == ()
+    assert m.get_tuple() == (42, None, "spam")
+def test_simple_namespace():
+    ns = m.get_simple_namespace()
+    assert ns.attr == 42
+    assert ns.x == "foo"
+    assert ns.right == 2
+    assert not hasattr(ns, "wrong")
 def test_str(doc):
+    assert m.str_from_char_ssize_t().encode().decode() == "red"
+    assert m.str_from_char_size_t().encode().decode() == "blue"
     assert m.str_from_string().encode().decode() == "baz"
     assert m.str_from_bytes().encode().decode() == "boo"
@@ -111,18 +156,31 @@ def __repr__(self):
     assert s1 == s2
     malformed_utf8 = b"\x80"
-    assert m.str_from_object(malformed_utf8) is malformed_utf8  # To be fixed; see #2380
+    if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
+        assert m.str_from_object(malformed_utf8) is malformed_utf8
+    elif env.PY2:
+        with pytest.raises(UnicodeDecodeError):
+            m.str_from_object(malformed_utf8)
+    else:
+        assert m.str_from_object(malformed_utf8) == "b'\\x80'"
     if env.PY2:
-        # with pytest.raises(UnicodeDecodeError):
-        #     m.str_from_object(malformed_utf8)
         with pytest.raises(UnicodeDecodeError):
-        # assert m.str_from_object(malformed_utf8) == "b'\\x80'"
         assert m.str_from_handle(malformed_utf8) == "b'\\x80'"
+    assert m.str_from_string_from_str("this is a str") == "this is a str"
+    ucs_surrogates_str = u"\udcc3"
+    if env.PY2:
+        assert u"\udcc3" == m.str_from_string_from_str(ucs_surrogates_str)
+    else:
+        with pytest.raises(UnicodeEncodeError):
+            m.str_from_string_from_str(ucs_surrogates_str)
 def test_bytes(doc):
+    assert m.bytes_from_char_ssize_t().decode() == "green"
+    assert m.bytes_from_char_size_t().decode() == "purple"
     assert m.bytes_from_string().decode() == "foo"
     assert m.bytes_from_str().decode() == "bar"
@@ -131,34 +189,50 @@ def test_bytes(doc):
+def test_bytearray(doc):
+    assert m.bytearray_from_char_ssize_t().decode() == "$%"
+    assert m.bytearray_from_char_size_t().decode() == "@$!"
+    assert m.bytearray_from_string().decode() == "foo"
+    assert m.bytearray_size() == len("foo")
 def test_capsule(capture):
     with capture:
         a = m.return_capsule_with_destructor()
         del a
-    assert capture.unordered == """
+    assert (
+        capture.unordered
+        == """
         creating capsule
         destructing capsule
+    )
     with capture:
         a = m.return_capsule_with_destructor_2()
         del a
-    assert capture.unordered == """
+    assert (
+        capture.unordered
+        == """
         creating capsule
         destructing capsule: 1234
+    )
     with capture:
         a = m.return_capsule_with_name_and_destructor()
         del a
-    assert capture.unordered == """
+    assert (
+        capture.unordered
+        == """
         created capsule (1234, 'pointer type description')
         destructing capsule (1234, 'pointer type description')
+    )
 def test_accessors():
@@ -202,7 +276,7 @@ def func(self, x, *args):
 def test_constructors():
     """C++ default and converting constructors are equivalent to type calls in Python"""
-    types = [bytes, str, bool, int, float, tuple, list, dict, set]
+    types = [bytes, bytearray, str, bool, int, float, tuple, list, dict, set]
     expected = {t.__name__: t() for t in types}
     if env.PY2:
         # Note that bytes.__name__ == 'str' in Python 2.
@@ -212,7 +286,8 @@ def test_constructors():
     assert m.default_constructors() == expected
     data = {
-        bytes: b'41',  # Currently no supported or working conversions.
+        bytes: b"41",  # Currently no supported or working conversions.
+        bytearray: bytearray(b"41"),
         str: 42,
         bool: "Not empty",
         int: "42",
@@ -221,14 +296,14 @@ def test_constructors():
         list: range(3),
         dict: [("two", 2), ("one", 1), ("three", 3)],
         set: [4, 4, 5, 6, 6, 6],
-        memoryview: b'abc'
+        memoryview: b"abc",
     inputs = {k.__name__: v for k, v in data.items()}
     expected = {k.__name__: k(v) for k, v in data.items()}
     if env.PY2:  # Similar to the above. See comments above.
-        inputs["bytes"] = b'41'
+        inputs["bytes"] = b"41"
         inputs["str"] = 42
-        expected["bytes"] = b'41'
+        expected["bytes"] = b"41"
         expected["str"] = u"42"
     assert m.converting_constructors(inputs) == expected
@@ -245,16 +320,33 @@ def test_constructors():
         assert noconv2[k] is expected[k]
+def test_non_converting_constructors():
+    non_converting_test_cases = [
+        ("bytes", range(10)),
+        ("none", 42),
+        ("ellipsis", 42),
+        ("type", 42),
+    ]
+    for t, v in non_converting_test_cases:
+        for move in [True, False]:
+            with pytest.raises(TypeError) as excinfo:
+                m.nonconverting_constructor(t, v, move)
+            expected_error = "Object of type '{}' is not an instance of '{}'".format(
+                type(v).__name__, t
+            )
+            assert str(excinfo.value) == expected_error
 def test_pybind11_str_raw_str():
     # specifically to exercise pybind11::str::raw_str
     cvt = m.convert_to_pybind11_str
     assert cvt(u"Str") == u"Str"
-    assert cvt(b'Bytes') == u"Bytes" if env.PY2 else "b'Bytes'"
+    assert cvt(b"Bytes") == u"Bytes" if env.PY2 else "b'Bytes'"
     assert cvt(None) == u"None"
     assert cvt(False) == u"False"
     assert cvt(True) == u"True"
     assert cvt(42) == u"42"
-    assert cvt(2**65) == u"36893488147419103232"
+    assert cvt(2 ** 65) == u"36893488147419103232"
     assert cvt(-1.50) == u"-1.5"
     assert cvt(()) == u"()"
     assert cvt((18,)) == u"(18,)"
@@ -268,30 +360,54 @@ def test_pybind11_str_raw_str():
     valid_orig = u"DZ"
     valid_utf8 = valid_orig.encode("utf-8")
     valid_cvt = cvt(valid_utf8)
-    assert type(valid_cvt) == bytes  # Probably surprising.
-    assert valid_cvt == b'\xc7\xb1'
+    if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
+        assert valid_cvt is valid_utf8
+    else:
+        assert type(valid_cvt) is unicode if env.PY2 else str  # noqa: F821
+        if env.PY2:
+            assert valid_cvt == valid_orig
+        else:
+            assert valid_cvt == "b'\\xc7\\xb1'"
-    malformed_utf8 = b'\x80'
-    malformed_cvt = cvt(malformed_utf8)
-    assert type(malformed_cvt) == bytes  # Probably surprising.
-    assert malformed_cvt == b'\x80'
+    malformed_utf8 = b"\x80"
+    if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
+        assert cvt(malformed_utf8) is malformed_utf8
+    else:
+        if env.PY2:
+            with pytest.raises(UnicodeDecodeError):
+                cvt(malformed_utf8)
+        else:
+            malformed_cvt = cvt(malformed_utf8)
+            assert type(malformed_cvt) is str
+            assert malformed_cvt == "b'\\x80'"
 def test_implicit_casting():
     """Tests implicit casting when assigning or appending to dicts and lists."""
     z = m.get_implicit_casting()
-    assert z['d'] == {
-        'char*_i1': 'abc', 'char*_i2': 'abc', 'char*_e': 'abc', 'char*_p': 'abc',
-        'str_i1': 'str', 'str_i2': 'str1', 'str_e': 'str2', 'str_p': 'str3',
-        'int_i1': 42, 'int_i2': 42, 'int_e': 43, 'int_p': 44
+    assert z["d"] == {
+        "char*_i1": "abc",
+        "char*_i2": "abc",
+        "char*_e": "abc",
+        "char*_p": "abc",
+        "str_i1": "str",
+        "str_i2": "str1",
+        "str_e": "str2",
+        "str_p": "str3",
+        "int_i1": 42,
+        "int_i2": 42,
+        "int_e": 43,
+        "int_p": 44,
-    assert z['l'] == [3, 6, 9, 12, 15]
+    assert z["l"] == [3, 6, 9, 12, 15]
 def test_print(capture):
     with capture:
-    assert capture == """
+    assert (
+        capture
+        == """
         Hello, World!
         1 2.0 three True -- multiple args
@@ -299,14 +415,15 @@ def test_print(capture):
         py::print + str.format = this
+    )
     assert capture.stderr == "this goes to stderr"
     with pytest.raises(RuntimeError) as excinfo:
-    assert str(excinfo.value) == "make_tuple(): unable to convert " + (
-        "argument of type 'UnregisteredType' to Python object"
-        if debug_enabled else
-        "arguments to Python object (compile in debug mode for details)"
+    assert str(excinfo.value) == "Unable to convert call argument " + (
+        "'1' of type 'UnregisteredType' to Python object"
+        if debug_enabled
+        else "to Python object (compile in debug mode for details)"
@@ -328,8 +445,23 @@ class Unhashable(object):
 def test_number_protocol():
     for a, b in [(1, 1), (3, 5)]:
-        li = [a == b, a != b, a < b, a <= b, a > b, a >= b, a + b,
-              a - b, a * b, a / b, a | b, a & b, a ^ b, a >> b, a << b]
+        li = [
+            a == b,
+            a != b,
+            a < b,
+            a <= b,
+            a > b,
+            a >= b,
+            a + b,
+            a - b,
+            a * b,
+            a / b,
+            a | b,
+            a & b,
+            a ^ b,
+            a >> b,
+            a << b,
+        ]
         assert m.test_number_protocol(a, b) == li
@@ -343,16 +475,20 @@ def test_issue2361():
     assert m.issue2361_str_implicit_copy_none() == "None"
     with pytest.raises(TypeError) as excinfo:
         assert m.issue2361_dict_implicit_copy_none()
-    assert "'NoneType' object is not iterable" in str(excinfo.value)
-@pytest.mark.parametrize('method, args, fmt, expected_view', [
-    (m.test_memoryview_object, (b'red',), 'B', b'red'),
-    (m.test_memoryview_buffer_info, (b'green',), 'B', b'green'),
-    (m.test_memoryview_from_buffer, (False,), 'h', [3, 1, 4, 1, 5]),
-    (m.test_memoryview_from_buffer, (True,), 'H', [2, 7, 1, 8]),
-    (m.test_memoryview_from_buffer_nativeformat, (), '@i', [4, 7, 5]),
+    assert "NoneType" in str(excinfo.value)
+    assert "iterable" in str(excinfo.value)
+    "method, args, fmt, expected_view",
+    [
+        (m.test_memoryview_object, (b"red",), "B", b"red"),
+        (m.test_memoryview_buffer_info, (b"green",), "B", b"green"),
+        (m.test_memoryview_from_buffer, (False,), "h", [3, 1, 4, 1, 5]),
+        (m.test_memoryview_from_buffer, (True,), "H", [2, 7, 1, 8]),
+        (m.test_memoryview_from_buffer_nativeformat, (), "@i", [4, 7, 5]),
+    ],
 def test_memoryview(method, args, fmt, expected_view):
     view = method(*args)
     assert isinstance(view, memoryview)
@@ -361,17 +497,20 @@ def test_memoryview(method, args, fmt, expected_view):
         view_as_list = list(view)
         # Using max to pick non-zero byte (big-endian vs little-endian).
-        view_as_list = [max([ord(c) for c in s]) for s in view]
+        view_as_list = [max(ord(c) for c in s) for s in view]
     assert view_as_list == list(expected_view)
 @pytest.mark.xfail("env.PYPY", reason="getrefcount is not available")
-@pytest.mark.parametrize('method', [
-    m.test_memoryview_object,
-    m.test_memoryview_buffer_info,
+    "method",
+    [
+        m.test_memoryview_object,
+        m.test_memoryview_buffer_info,
+    ],
 def test_memoryview_refcount(method):
-    buf = b'\x0a\x0b\x0c\x0d'
+    buf = b"\x0a\x0b\x0c\x0d"
     ref_before = sys.getrefcount(buf)
     view = method(buf)
     ref_after = sys.getrefcount(buf)
@@ -382,13 +521,13 @@ def test_memoryview_refcount(method):
 def test_memoryview_from_buffer_empty_shape():
     view = m.test_memoryview_from_buffer_empty_shape()
     assert isinstance(view, memoryview)
-    assert view.format == 'B'
+    assert view.format == "B"
     if env.PY2:
         # Python 2 behavior is weird, but Python 3 (the future) is fine.
         # PyPy3 has <memoryview, while CPython 2 has <memory
-        assert bytes(view).startswith(b'<memory')
+        assert bytes(view).startswith(b"<memory")
-        assert bytes(view) == b''
+        assert bytes(view) == b""
 def test_test_memoryview_from_buffer_invalid_strides():
@@ -408,5 +547,113 @@ def test_test_memoryview_from_buffer_nullptr():
 def test_memoryview_from_memory():
     view = m.test_memoryview_from_memory()
     assert isinstance(view, memoryview)
-    assert view.format == 'B'
-    assert bytes(view) == b'\xff\xe1\xab\x37'
+    assert view.format == "B"
+    assert bytes(view) == b"\xff\xe1\xab\x37"
+def test_builtin_functions():
+    assert m.get_len([i for i in range(42)]) == 42
+    with pytest.raises(TypeError) as exc_info:
+        m.get_len(i for i in range(42))
+    assert str(exc_info.value) in [
+        "object of type 'generator' has no len()",
+        "'generator' has no length",
+    ]  # PyPy
+def test_isinstance_string_types():
+    assert m.isinstance_pybind11_bytes(b"")
+    assert not m.isinstance_pybind11_bytes(u"")
+    assert m.isinstance_pybind11_str(u"")
+    if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
+        assert m.isinstance_pybind11_str(b"")
+    else:
+        assert not m.isinstance_pybind11_str(b"")
+def test_pass_bytes_or_unicode_to_string_types():
+    assert m.pass_to_pybind11_bytes(b"Bytes") == 5
+    with pytest.raises(TypeError):
+        m.pass_to_pybind11_bytes(u"Str")
+    if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE") or env.PY2:
+        assert m.pass_to_pybind11_str(b"Bytes") == 5
+    else:
+        with pytest.raises(TypeError):
+            m.pass_to_pybind11_str(b"Bytes")
+    assert m.pass_to_pybind11_str(u"Str") == 3
+    assert m.pass_to_std_string(b"Bytes") == 5
+    assert m.pass_to_std_string(u"Str") == 3
+    malformed_utf8 = b"\x80"
+    if hasattr(m, "PYBIND11_STR_LEGACY_PERMISSIVE"):
+        assert m.pass_to_pybind11_str(malformed_utf8) == 1
+    elif env.PY2:
+        with pytest.raises(UnicodeDecodeError):
+            m.pass_to_pybind11_str(malformed_utf8)
+    else:
+        with pytest.raises(TypeError):
+            m.pass_to_pybind11_str(malformed_utf8)
+    "create_weakref, create_weakref_with_callback",
+    [
+        (m.weakref_from_handle, m.weakref_from_handle_and_function),
+        (m.weakref_from_object, m.weakref_from_object_and_function),
+    ],
+def test_weakref(create_weakref, create_weakref_with_callback):
+    from weakref import getweakrefcount
+    # Apparently, you cannot weakly reference an object()
+    class WeaklyReferenced(object):
+        pass
+    def callback(wr):
+        # No `nonlocal` in Python 2
+        callback.called = True
+    obj = WeaklyReferenced()
+    assert getweakrefcount(obj) == 0
+    wr = create_weakref(obj)
+    assert getweakrefcount(obj) == 1
+    obj = WeaklyReferenced()
+    assert getweakrefcount(obj) == 0
+    callback.called = False
+    wr = create_weakref_with_callback(obj, callback)  # noqa: F841
+    assert getweakrefcount(obj) == 1
+    assert not callback.called
+    del obj
+    pytest.gc_collect()
+    assert callback.called
+def test_cpp_iterators():
+    assert m.tuple_iterator() == 12
+    assert m.dict_iterator() == 305 + 711
+    assert m.passed_iterator(iter((-7, 3))) == -4
+def test_implementation_details():
+    lst = [39, 43, 92, 49, 22, 29, 93, 98, 26, 57, 8]
+    tup = tuple(lst)
+    assert m.sequence_item_get_ssize_t(lst) == 43
+    assert m.sequence_item_set_ssize_t(lst) is None
+    assert lst[1] == "peppa"
+    assert m.sequence_item_get_size_t(lst) == 92
+    assert m.sequence_item_set_size_t(lst) is None
+    assert lst[2] == "george"
+    assert m.list_item_get_ssize_t(lst) == 49
+    assert m.list_item_set_ssize_t(lst) is None
+    assert lst[3] == "rebecca"
+    assert m.list_item_get_size_t(lst) == 22
+    assert m.list_item_set_size_t(lst) is None
+    assert lst[4] == "richard"
+    assert m.tuple_item_get_ssize_t(tup) == 29
+    assert m.tuple_item_set_ssize_t() == ("emely", "edmond")
+    assert m.tuple_item_get_size_t(tup) == 93
+    assert m.tuple_item_set_size_t() == ("candy", "cat")
diff --git a/wrap/pybind11/tests/test_sequences_and_iterators.cpp b/wrap/pybind11/tests/test_sequences_and_iterators.cpp
index 545dc45d08..a378128ae2 100644
--- a/wrap/pybind11/tests/test_sequences_and_iterators.cpp
+++ b/wrap/pybind11/tests/test_sequences_and_iterators.cpp
@@ -14,12 +14,19 @@
 #include <pybind11/stl.h>
 #include <algorithm>
+#include <utility>
+#include <vector>
+#include <optional>
 template<typename T>
 class NonZeroIterator {
     const T* ptr_;
-    NonZeroIterator(const T* ptr) : ptr_(ptr) {}
+    explicit NonZeroIterator(const T *ptr) : ptr_(ptr) {}
     const T& operator*() const { return *ptr_; }
     NonZeroIterator& operator++() { ++ptr_; return *this; }
@@ -31,6 +38,40 @@ bool operator==(const NonZeroIterator<std::pair<A, B>>& it, const NonZeroSentine
     return !(*it).first || !(*it).second;
+/* Iterator where dereferencing returns prvalues instead of references. */
+template<typename T>
+class NonRefIterator {
+    const T* ptr_;
+    explicit NonRefIterator(const T *ptr) : ptr_(ptr) {}
+    T operator*() const { return T(*ptr_); }
+    NonRefIterator& operator++() { ++ptr_; return *this; }
+    bool operator==(const NonRefIterator &other) const { return ptr_ == other.ptr_; }
+class NonCopyableInt {
+    explicit NonCopyableInt(int value) : value_(value) {}
+    NonCopyableInt(const NonCopyableInt &) = delete;
+    NonCopyableInt(NonCopyableInt &&other) noexcept : value_(other.value_) {
+        other.value_ = -1;  // detect when an unwanted move occurs
+    }
+    NonCopyableInt &operator=(const NonCopyableInt &) = delete;
+    NonCopyableInt &operator=(NonCopyableInt &&other) noexcept {
+        value_ = other.value_;
+        other.value_ = -1;  // detect when an unwanted move occurs
+        return *this;
+    }
+    int get() const { return value_; }
+    void set(int value) { value_ = value; }
+    ~NonCopyableInt() = default;
+    int value_;
+using NonCopyableIntPair = std::pair<NonCopyableInt, NonCopyableInt>;
 template <typename PythonType>
 py::list test_random_access_iterator(PythonType x) {
     if (x.size() < 5)
@@ -76,32 +117,43 @@ TEST_SUBMODULE(sequences_and_iterators, m) {
     // test_sliceable
     class Sliceable{
-      Sliceable(int n): size(n) {}
-      int start,stop,step;
-      int size;
+        explicit Sliceable(int n) : size(n) {}
+        int start, stop, step;
+        int size;
-    py::class_<Sliceable>(m,"Sliceable")
+    py::class_<Sliceable>(m, "Sliceable")
-        .def("__getitem__",[](const Sliceable &s, py::slice slice) {
-          ssize_t start, stop, step, slicelength;
-          if (!slice.compute(s.size, &start, &stop, &step, &slicelength))
-              throw py::error_already_set();
-          int istart = static_cast<int>(start);
-          int istop =  static_cast<int>(stop);
-          int istep =  static_cast<int>(step);
-          return std::make_tuple(istart,istop,istep);
-        })
-        ;
+        .def("__getitem__", [](const Sliceable &s, const py::slice &slice) {
+            py::ssize_t start = 0, stop = 0, step = 0, slicelength = 0;
+            if (!slice.compute(s.size, &start, &stop, &step, &slicelength))
+                throw py::error_already_set();
+            int istart = static_cast<int>(start);
+            int istop  = static_cast<int>(stop);
+            int istep  = static_cast<int>(step);
+            return std::make_tuple(istart, istop, istep);
+        });
+    m.def("make_forward_slice_size_t", []() { return py::slice(0, -1, 1); });
+    m.def("make_reversed_slice_object", []() { return py::slice(py::none(), py::none(), py::int_(-1)); });
+    m.attr("has_optional") = true;
+    m.def("make_reversed_slice_size_t_optional_verbose", []() { return py::slice(std::nullopt, std::nullopt, -1); });
+    // Warning: The following spelling may still compile if optional<> is not present and give wrong answers.
+    // Please use with caution.
+    m.def("make_reversed_slice_size_t_optional", []() { return py::slice({}, {}, -1); });
+    m.attr("has_optional") = false;
     // test_sequence
     class Sequence {
-        Sequence(size_t size) : m_size(size) {
+        explicit Sequence(size_t size) : m_size(size) {
             print_created(this, "of size", m_size);
             m_data = new float[size];
             memset(m_data, 0, sizeof(float) * size);
-        Sequence(const std::vector<float> &value) : m_size(value.size()) {
+        explicit Sequence(const std::vector<float> &value) : m_size(value.size()) {
             print_created(this, "of size", m_size, "from std::vector");
             m_data = new float[m_size];
             memcpy(m_data, &value[0], sizeof(float) * m_size);
@@ -111,7 +163,7 @@ TEST_SUBMODULE(sequences_and_iterators, m) {
             m_data = new float[m_size];
             memcpy(m_data, s.m_data, sizeof(float)*m_size);
-        Sequence(Sequence &&s) : m_size(s.m_size), m_data(s.m_data) {
+        Sequence(Sequence &&s) noexcept : m_size(s.m_size), m_data(s.m_data) {
             s.m_size = 0;
             s.m_data = nullptr;
@@ -130,7 +182,7 @@ TEST_SUBMODULE(sequences_and_iterators, m) {
             return *this;
-        Sequence &operator=(Sequence &&s) {
+        Sequence &operator=(Sequence &&s) noexcept {
             if (&s != this) {
                 delete[] m_data;
                 m_size = s.m_size;
@@ -179,43 +231,54 @@ TEST_SUBMODULE(sequences_and_iterators, m) {
     py::class_<Sequence>(m, "Sequence")
-        .def(py::init<const std::vector<float>&>())
+        .def(py::init<const std::vector<float> &>())
         /// Bare bones interface
-        .def("__getitem__", [](const Sequence &s, size_t i) {
-            if (i >= s.size()) throw py::index_error();
-            return s[i];
-        })
-        .def("__setitem__", [](Sequence &s, size_t i, float v) {
-            if (i >= s.size()) throw py::index_error();
-            s[i] = v;
-        })
+        .def("__getitem__",
+             [](const Sequence &s, size_t i) {
+                 if (i >= s.size())
+                     throw py::index_error();
+                 return s[i];
+             })
+        .def("__setitem__",
+             [](Sequence &s, size_t i, float v) {
+                 if (i >= s.size())
+                     throw py::index_error();
+                 s[i] = v;
+             })
         .def("__len__", &Sequence::size)
         /// Optional sequence protocol operations
-        .def("__iter__", [](const Sequence &s) { return py::make_iterator(s.begin(), s.end()); },
-                         py::keep_alive<0, 1>() /* Essential: keep object alive while iterator exists */)
+        .def(
+            "__iter__",
+            [](const Sequence &s) { return py::make_iterator(s.begin(), s.end()); },
+            py::keep_alive<0, 1>() /* Essential: keep object alive while iterator exists */)
         .def("__contains__", [](const Sequence &s, float v) { return s.contains(v); })
         .def("__reversed__", [](const Sequence &s) -> Sequence { return s.reversed(); })
         /// Slicing protocol (optional)
-        .def("__getitem__", [](const Sequence &s, py::slice slice) -> Sequence* {
-            size_t start, stop, step, slicelength;
-            if (!slice.compute(s.size(), &start, &stop, &step, &slicelength))
-                throw py::error_already_set();
-            auto *seq = new Sequence(slicelength);
-            for (size_t i = 0; i < slicelength; ++i) {
-                (*seq)[i] = s[start]; start += step;
-            }
-            return seq;
-        })
-        .def("__setitem__", [](Sequence &s, py::slice slice, const Sequence &value) {
-            size_t start, stop, step, slicelength;
-            if (!slice.compute(s.size(), &start, &stop, &step, &slicelength))
-                throw py::error_already_set();
-            if (slicelength != value.size())
-                throw std::runtime_error("Left and right hand size of slice assignment have different sizes!");
-            for (size_t i = 0; i < slicelength; ++i) {
-                s[start] = value[i]; start += step;
-            }
-        })
+        .def("__getitem__",
+             [](const Sequence &s, const py::slice &slice) -> Sequence * {
+                 size_t start = 0, stop = 0, step = 0, slicelength = 0;
+                 if (!slice.compute(s.size(), &start, &stop, &step, &slicelength))
+                     throw py::error_already_set();
+                 auto *seq = new Sequence(slicelength);
+                 for (size_t i = 0; i < slicelength; ++i) {
+                     (*seq)[i] = s[start];
+                     start += step;
+                 }
+                 return seq;
+             })
+        .def("__setitem__",
+             [](Sequence &s, const py::slice &slice, const Sequence &value) {
+                 size_t start = 0, stop = 0, step = 0, slicelength = 0;
+                 if (!slice.compute(s.size(), &start, &stop, &step, &slicelength))
+                     throw py::error_already_set();
+                 if (slicelength != value.size())
+                     throw std::runtime_error(
+                         "Left and right hand size of slice assignment have different sizes!");
+                 for (size_t i = 0; i < slicelength; ++i) {
+                     s[start] = value[i];
+                     start += step;
+                 }
+             })
         /// Comparisons
         .def(py::self == py::self)
         .def(py::self != py::self)
@@ -228,11 +291,11 @@ TEST_SUBMODULE(sequences_and_iterators, m) {
     class StringMap {
         StringMap() = default;
-        StringMap(std::unordered_map<std::string, std::string> init)
+        explicit StringMap(std::unordered_map<std::string, std::string> init)
             : map(std::move(init)) {}
-        void set(std::string key, std::string val) { map[key] = val; }
-        std::string get(std::string key) const { return map.at(key); }
+        void set(const std::string &key, std::string val) { map[key] = std::move(val); }
+        std::string get(const std::string &key) const { return map.at(key); }
         size_t size() const { return map.size(); }
         std::unordered_map<std::string, std::string> map;
@@ -243,38 +306,117 @@ TEST_SUBMODULE(sequences_and_iterators, m) {
     py::class_<StringMap>(m, "StringMap")
         .def(py::init<std::unordered_map<std::string, std::string>>())
-        .def("__getitem__", [](const StringMap &map, std::string key) {
-                try { return map.get(key); }
-                catch (const std::out_of_range&) {
-                    throw py::key_error("key '" + key + "' does not exist");
-                }
-        })
+        .def("__getitem__",
+             [](const StringMap &map, const std::string &key) {
+                 try {
+                     return map.get(key);
+                 } catch (const std::out_of_range &) {
+                     throw py::key_error("key '" + key + "' does not exist");
+                 }
+             })
         .def("__setitem__", &StringMap::set)
         .def("__len__", &StringMap::size)
-        .def("__iter__", [](const StringMap &map) { return py::make_key_iterator(map.begin(), map.end()); },
-                py::keep_alive<0, 1>())
-        .def("items", [](const StringMap &map) { return py::make_iterator(map.begin(), map.end()); },
-                py::keep_alive<0, 1>())
-        ;
+        .def(
+            "__iter__",
+            [](const StringMap &map) { return py::make_key_iterator(map.begin(), map.end()); },
+            py::keep_alive<0, 1>())
+        .def(
+            "items",
+            [](const StringMap &map) { return py::make_iterator(map.begin(), map.end()); },
+            py::keep_alive<0, 1>())
+        .def(
+            "values",
+            [](const StringMap &map) { return py::make_value_iterator(map.begin(), map.end()); },
+            py::keep_alive<0, 1>());
     // test_generalized_iterators
     class IntPairs {
-        IntPairs(std::vector<std::pair<int, int>> data) : data_(std::move(data)) {}
+        explicit IntPairs(std::vector<std::pair<int, int>> data) : data_(std::move(data)) {}
         const std::pair<int, int>* begin() const { return data_.data(); }
+        // .end() only required for py::make_iterator(self) overload
+        const std::pair<int, int>* end() const { return data_.data() + data_.size(); }
         std::vector<std::pair<int, int>> data_;
     py::class_<IntPairs>(m, "IntPairs")
         .def(py::init<std::vector<std::pair<int, int>>>())
         .def("nonzero", [](const IntPairs& s) {
-                return py::make_iterator(NonZeroIterator<std::pair<int, int>>(s.begin()), NonZeroSentinel());
+            return py::make_iterator(NonZeroIterator<std::pair<int, int>>(s.begin()), NonZeroSentinel());
         }, py::keep_alive<0, 1>())
         .def("nonzero_keys", [](const IntPairs& s) {
             return py::make_key_iterator(NonZeroIterator<std::pair<int, int>>(s.begin()), NonZeroSentinel());
         }, py::keep_alive<0, 1>())
+        .def("nonzero_values", [](const IntPairs& s) {
+            return py::make_value_iterator(NonZeroIterator<std::pair<int, int>>(s.begin()), NonZeroSentinel());
+        }, py::keep_alive<0, 1>())
+        // test iterator that returns values instead of references
+        .def("nonref", [](const IntPairs& s) {
+             return py::make_iterator(NonRefIterator<std::pair<int, int>>(s.begin()),
+                                      NonRefIterator<std::pair<int, int>>(s.end()));
+        }, py::keep_alive<0, 1>())
+        .def("nonref_keys", [](const IntPairs& s) {
+             return py::make_key_iterator(NonRefIterator<std::pair<int, int>>(s.begin()),
+                                          NonRefIterator<std::pair<int, int>>(s.end()));
+        }, py::keep_alive<0, 1>())
+        .def("nonref_values", [](const IntPairs& s) {
+             return py::make_value_iterator(NonRefIterator<std::pair<int, int>>(s.begin()),
+                                            NonRefIterator<std::pair<int, int>>(s.end()));
+        }, py::keep_alive<0, 1>())
+        // test single-argument make_iterator
+        .def("simple_iterator", [](IntPairs& self) {
+            return py::make_iterator(self);
+        }, py::keep_alive<0, 1>())
+        .def("simple_keys", [](IntPairs& self) {
+            return py::make_key_iterator(self);
+        }, py::keep_alive<0, 1>())
+        .def("simple_values", [](IntPairs& self) {
+            return py::make_value_iterator(self);
+        }, py::keep_alive<0, 1>())
+        // Test iterator with an Extra (doesn't do anything useful, so not used
+        // at runtime, but tests need to be able to compile with the correct
+        // overload. See PR #3293.
+        .def("_make_iterator_extras", [](IntPairs& self) {
+            return py::make_iterator(self, py::call_guard<int>());
+        }, py::keep_alive<0, 1>())
+        .def("_make_key_extras", [](IntPairs& self) {
+            return py::make_key_iterator(self, py::call_guard<int>());
+        }, py::keep_alive<0, 1>())
+        .def("_make_value_extras", [](IntPairs& self) {
+            return py::make_value_iterator(self, py::call_guard<int>());
+        }, py::keep_alive<0, 1>())
+    // test_iterater_referencing
+    py::class_<NonCopyableInt>(m, "NonCopyableInt")
+        .def(py::init<int>())
+        .def("set", &NonCopyableInt::set)
+        .def("__int__", &NonCopyableInt::get)
+        ;
+    py::class_<std::vector<NonCopyableInt>>(m, "VectorNonCopyableInt")
+        .def(py::init<>())
+        .def("append", [](std::vector<NonCopyableInt> &vec, int value) {
+            vec.emplace_back(value);
+        })
+        .def("__iter__", [](std::vector<NonCopyableInt> &vec) {
+            return py::make_iterator(vec.begin(), vec.end());
+        })
+        ;
+    py::class_<std::vector<NonCopyableIntPair>>(m, "VectorNonCopyableIntPair")
+        .def(py::init<>())
+        .def("append", [](std::vector<NonCopyableIntPair> &vec, const std::pair<int, int> &value) {
+            vec.emplace_back(NonCopyableInt(value.first), NonCopyableInt(value.second));
+        })
+        .def("keys", [](std::vector<NonCopyableIntPair> &vec) {
+            return py::make_key_iterator(vec.begin(), vec.end());
+        })
+        .def("values", [](std::vector<NonCopyableIntPair> &vec) {
+            return py::make_value_iterator(vec.begin(), vec.end());
+        })
+        ;
 #if 0
     // Obsolete: special data structure for exposing custom iterator types to python
@@ -304,7 +446,7 @@ TEST_SUBMODULE(sequences_and_iterators, m) {
     // test_python_iterator_in_cpp
-    m.def("object_to_list", [](py::object o) {
+    m.def("object_to_list", [](const py::object &o) {
         auto l = py::list();
         for (auto item : o) {
@@ -322,22 +464,22 @@ TEST_SUBMODULE(sequences_and_iterators, m) {
     // test_sequence_length: check that Python sequences can be converted to py::sequence.
-    m.def("sequence_length", [](py::sequence seq) { return seq.size(); });
+    m.def("sequence_length", [](const py::sequence &seq) { return seq.size(); });
     // Make sure that py::iterator works with std algorithms
-    m.def("count_none", [](py::object o) {
+    m.def("count_none", [](const py::object &o) {
         return std::count_if(o.begin(), o.end(), [](py::handle h) { return h.is_none(); });
-    m.def("find_none", [](py::object o) {
+    m.def("find_none", [](const py::object &o) {
         auto it = std::find_if(o.begin(), o.end(), [](py::handle h) { return h.is_none(); });
         return it->is_none();
-    m.def("count_nonzeros", [](py::dict d) {
-       return std::count_if(d.begin(), d.end(), [](std::pair<py::handle, py::handle> p) {
-           return p.second.cast<int>() != 0;
-       });
+    m.def("count_nonzeros", [](const py::dict &d) {
+        return std::count_if(d.begin(), d.end(), [](std::pair<py::handle, py::handle> p) {
+            return p.second.cast<int>() != 0;
+        });
     m.def("tuple_iterator", &test_random_access_iterator<py::tuple>);
diff --git a/wrap/pybind11/tests/test_sequences_and_iterators.py b/wrap/pybind11/tests/test_sequences_and_iterators.py
index 8f6c0c4bbd..6985918a11 100644
--- a/wrap/pybind11/tests/test_sequences_and_iterators.py
+++ b/wrap/pybind11/tests/test_sequences_and_iterators.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 import pytest
-from pybind11_tests import sequences_and_iterators as m
 from pybind11_tests import ConstructorStats
+from pybind11_tests import sequences_and_iterators as m
 def isclose(a, b, rel_tol=1e-05, abs_tol=0.0):
@@ -10,7 +11,20 @@ def isclose(a, b, rel_tol=1e-05, abs_tol=0.0):
 def allclose(a_list, b_list, rel_tol=1e-05, abs_tol=0.0):
-    return all(isclose(a, b, rel_tol=rel_tol, abs_tol=abs_tol) for a, b in zip(a_list, b_list))
+    return all(
+        isclose(a, b, rel_tol=rel_tol, abs_tol=abs_tol) for a, b in zip(a_list, b_list)
+    )
+def test_slice_constructors():
+    assert m.make_forward_slice_size_t() == slice(0, -1, 1)
+    assert m.make_reversed_slice_object() == slice(None, None, -1)
+@pytest.mark.skipif(not m.has_optional, reason="no <optional>")
+def test_slice_constructors_explicit_optional():
+    assert m.make_reversed_slice_size_t_optional() == slice(None, None, -1)
+    assert m.make_reversed_slice_size_t_optional_verbose() == slice(None, None, -1)
 def test_generalized_iterators():
@@ -22,6 +36,10 @@ def test_generalized_iterators():
     assert list(m.IntPairs([(1, 2), (2, 0), (0, 3), (4, 5)]).nonzero_keys()) == [1]
     assert list(m.IntPairs([(0, 3), (1, 2), (3, 4)]).nonzero_keys()) == []
+    assert list(m.IntPairs([(1, 2), (3, 4), (0, 5)]).nonzero_values()) == [2, 4]
+    assert list(m.IntPairs([(1, 2), (2, 0), (0, 3), (4, 5)]).nonzero_values()) == [2]
+    assert list(m.IntPairs([(0, 3), (1, 2), (3, 4)]).nonzero_values()) == []
     # __next__ must continue to raise StopIteration
     it = m.IntPairs([(0, 0)]).nonzero()
     for _ in range(3):
@@ -34,6 +52,47 @@ def test_generalized_iterators():
+def test_nonref_iterators():
+    pairs = m.IntPairs([(1, 2), (3, 4), (0, 5)])
+    assert list(pairs.nonref()) == [(1, 2), (3, 4), (0, 5)]
+    assert list(pairs.nonref_keys()) == [1, 3, 0]
+    assert list(pairs.nonref_values()) == [2, 4, 5]
+def test_generalized_iterators_simple():
+    assert list(m.IntPairs([(1, 2), (3, 4), (0, 5)]).simple_iterator()) == [
+        (1, 2),
+        (3, 4),
+        (0, 5),
+    ]
+    assert list(m.IntPairs([(1, 2), (3, 4), (0, 5)]).simple_keys()) == [1, 3, 0]
+    assert list(m.IntPairs([(1, 2), (3, 4), (0, 5)]).simple_values()) == [2, 4, 5]
+def test_iterator_referencing():
+    """Test that iterators reference rather than copy their referents."""
+    vec = m.VectorNonCopyableInt()
+    vec.append(3)
+    vec.append(5)
+    assert [int(x) for x in vec] == [3, 5]
+    # Increment everything to make sure the referents can be mutated
+    for x in vec:
+        x.set(int(x) + 1)
+    assert [int(x) for x in vec] == [4, 6]
+    vec = m.VectorNonCopyableIntPair()
+    vec.append([3, 4])
+    vec.append([5, 7])
+    assert [int(x) for x in vec.keys()] == [3, 5]
+    assert [int(x) for x in vec.values()] == [4, 7]
+    for x in vec.keys():
+        x.set(int(x) + 1)
+    for x in vec.values():
+        x.set(int(x) + 10)
+    assert [int(x) for x in vec.keys()] == [4, 6]
+    assert [int(x) for x in vec.values()] == [14, 17]
 def test_sliceable():
     sliceable = m.Sliceable(100)
     assert sliceable[::] == (0, 100, 1)
@@ -51,7 +110,7 @@ def test_sequence():
     cstats = ConstructorStats.get(m.Sequence)
     s = m.Sequence(5)
-    assert cstats.values() == ['of size', '5']
+    assert cstats.values() == ["of size", "5"]
     assert "Sequence" in repr(s)
     assert len(s) == 5
@@ -62,16 +121,16 @@ def test_sequence():
     assert isclose(s[0], 12.34) and isclose(s[3], 56.78)
     rev = reversed(s)
-    assert cstats.values() == ['of size', '5']
+    assert cstats.values() == ["of size", "5"]
     rev2 = s[::-1]
-    assert cstats.values() == ['of size', '5']
+    assert cstats.values() == ["of size", "5"]
     it = iter(m.Sequence(0))
     for _ in range(3):  # __next__ must continue to raise StopIteration
         with pytest.raises(StopIteration):
-    assert cstats.values() == ['of size', '0']
+    assert cstats.values() == ["of size", "0"]
     expected = [0, 56.78, 0, 0, 12.34]
     assert allclose(rev, expected)
@@ -79,7 +138,7 @@ def test_sequence():
     assert rev == rev2
     rev[0::2] = m.Sequence([2.0, 2.0, 2.0])
-    assert cstats.values() == ['of size', '3', 'from std::vector']
+    assert cstats.values() == ["of size", "3", "from std::vector"]
     assert allclose(rev, [2, 56.78, 2, 0, 2])
@@ -102,11 +161,12 @@ def test_sequence():
 def test_sequence_length():
-    """#2076: Exception raised by len(arg) should be propagated """
+    """#2076: Exception raised by len(arg) should be propagated"""
     class BadLen(RuntimeError):
-    class SequenceLike():
+    class SequenceLike:
         def __getitem__(self, i):
             return None
@@ -121,21 +181,22 @@ def __len__(self):
 def test_map_iterator():
-    sm = m.StringMap({'hi': 'bye', 'black': 'white'})
-    assert sm['hi'] == 'bye'
+    sm = m.StringMap({"hi": "bye", "black": "white"})
+    assert sm["hi"] == "bye"
     assert len(sm) == 2
-    assert sm['black'] == 'white'
+    assert sm["black"] == "white"
     with pytest.raises(KeyError):
-        assert sm['orange']
-    sm['orange'] = 'banana'
-    assert sm['orange'] == 'banana'
+        assert sm["orange"]
+    sm["orange"] = "banana"
+    assert sm["orange"] == "banana"
-    expected = {'hi': 'bye', 'black': 'white', 'orange': 'banana'}
+    expected = {"hi": "bye", "black": "white", "orange": "banana"}
     for k in sm:
         assert sm[k] == expected[k]
     for k, v in sm.items():
         assert v == expected[k]
+    assert list(sm.values()) == [expected[k] for k in sm]
     it = iter(m.StringMap({}))
     for _ in range(3):  # __next__ must continue to raise StopIteration
@@ -179,11 +240,12 @@ def test_iterator_passthrough():
     """#181: iterator passthrough did not compile"""
     from pybind11_tests.sequences_and_iterators import iterator_passthrough
-    assert list(iterator_passthrough(iter([3, 5, 7, 9, 11, 13, 15]))) == [3, 5, 7, 9, 11, 13, 15]
+    values = [3, 5, 7, 9, 11, 13, 15]
+    assert list(iterator_passthrough(iter(values))) == values
 def test_iterator_rvp():
-    """#388: Can't make iterators via make_iterator() with different r/v policies """
+    """#388: Can't make iterators via make_iterator() with different r/v policies"""
     import pybind11_tests.sequences_and_iterators as m
     assert list(m.make_iterator_1()) == [1, 2, 3]
diff --git a/wrap/pybind11/tests/test_smart_ptr.cpp b/wrap/pybind11/tests/test_smart_ptr.cpp
index 60c2e692e5..94f04330a2 100644
--- a/wrap/pybind11/tests/test_smart_ptr.cpp
+++ b/wrap/pybind11/tests/test_smart_ptr.cpp
@@ -8,30 +8,14 @@
     BSD-style license that can be found in the LICENSE file.
-#if defined(_MSC_VER) && _MSC_VER < 1910
-#  pragma warning(disable: 4702) // unreachable code in system header
+#if defined(_MSC_VER) && _MSC_VER < 1910  // VS 2015's MSVC
+#  pragma warning(disable: 4702) // unreachable code in system header (xatomic.h(382))
 #include "pybind11_tests.h"
 #include "object.h"
-// Make pybind aware of the ref-counted wrapper type (s):
-// ref<T> is a wrapper for 'Object' which uses intrusive reference counting
-// It is always possible to construct a ref<T> from an Object* pointer without
-// possible inconsistencies, hence the 'true' argument at the end.
-// Make pybind11 aware of the non-standard getter member function
-namespace pybind11 { namespace detail {
-    template <typename T>
-    struct holder_helper<ref<T>> {
-        static const T *get(const ref<T> &p) { return p.get_ptr(); }
-    };
-} // namespace detail
-} // namespace pybind11
-// The following is not required anymore for std::shared_ptr, but it should compile without error:
-PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>);
+namespace {
 // This is just a wrapper around unique_ptr, but with extra fields to deliberately bloat up the
 // holder size to trigger the non-simple-layout internal instance layout for single inheritance with
@@ -40,21 +24,19 @@ template <typename T> class huge_unique_ptr {
     std::unique_ptr<T> ptr;
     uint64_t padding[10];
-    huge_unique_ptr(T *p) : ptr(p) {};
+    explicit huge_unique_ptr(T *p) : ptr(p) {}
     T *get() { return ptr.get(); }
-PYBIND11_DECLARE_HOLDER_TYPE(T, huge_unique_ptr<T>);
 // Simple custom holder that works like unique_ptr
 template <typename T>
 class custom_unique_ptr {
     std::unique_ptr<T> impl;
-    custom_unique_ptr(T* p) : impl(p) { }
+    explicit custom_unique_ptr(T *p) : impl(p) {}
     T* get() const { return impl.get(); }
     T* release_ptr() { return impl.release(); }
-PYBIND11_DECLARE_HOLDER_TYPE(T, custom_unique_ptr<T>);
 // Simple custom holder that works like shared_ptr and has operator& overload
 // To obtain address of an instance of this holder pybind should use std::addressof
@@ -64,11 +46,10 @@ class shared_ptr_with_addressof_operator {
     std::shared_ptr<T> impl;
     shared_ptr_with_addressof_operator( ) = default;
-    shared_ptr_with_addressof_operator(T* p) : impl(p) { }
+    explicit shared_ptr_with_addressof_operator(T *p) : impl(p) {}
     T* get() const { return impl.get(); }
     T** operator&() { throw std::logic_error("Call of overloaded operator& is not expected"); }
-PYBIND11_DECLARE_HOLDER_TYPE(T, shared_ptr_with_addressof_operator<T>);
 // Simple custom holder that works like unique_ptr and has operator& overload
 // To obtain address of an instance of this holder pybind should use std::addressof
@@ -78,15 +59,226 @@ class unique_ptr_with_addressof_operator {
     std::unique_ptr<T> impl;
     unique_ptr_with_addressof_operator() = default;
-    unique_ptr_with_addressof_operator(T* p) : impl(p) { }
+    explicit unique_ptr_with_addressof_operator(T *p) : impl(p) {}
     T* get() const { return impl.get(); }
     T* release_ptr() { return impl.release(); }
     T** operator&() { throw std::logic_error("Call of overloaded operator& is not expected"); }
-PYBIND11_DECLARE_HOLDER_TYPE(T, unique_ptr_with_addressof_operator<T>);
+// Custom object with builtin reference counting (see 'object.h' for the implementation)
+class MyObject1 : public Object {
+    explicit MyObject1(int value) : value(value) { print_created(this, toString()); }
+    std::string toString() const override { return "MyObject1[" + std::to_string(value) + "]"; }
+    ~MyObject1() override { print_destroyed(this); }
+    int value;
+// Object managed by a std::shared_ptr<>
+class MyObject2 {
+    MyObject2(const MyObject2 &) = default;
+    explicit MyObject2(int value) : value(value) { print_created(this, toString()); }
+    std::string toString() const { return "MyObject2[" + std::to_string(value) + "]"; }
+    virtual ~MyObject2() { print_destroyed(this); }
+    int value;
+// Object managed by a std::shared_ptr<>, additionally derives from std::enable_shared_from_this<>
+class MyObject3 : public std::enable_shared_from_this<MyObject3> {
+    MyObject3(const MyObject3 &) = default;
+    explicit MyObject3(int value) : value(value) { print_created(this, toString()); }
+    std::string toString() const { return "MyObject3[" + std::to_string(value) + "]"; }
+    virtual ~MyObject3() { print_destroyed(this); }
+    int value;
+// test_unique_nodelete
+// Object with a private destructor
+class MyObject4;
+std::unordered_set<MyObject4 *> myobject4_instances;
+class MyObject4 {
+    explicit MyObject4(int value) : value{value} {
+        print_created(this);
+        myobject4_instances.insert(this);
+    }
+    int value;
+    static void cleanupAllInstances() {
+        auto tmp = std::move(myobject4_instances);
+        myobject4_instances.clear();
+        for (auto o : tmp)
+            delete o;
+    }
+    ~MyObject4() {
+        myobject4_instances.erase(this);
+        print_destroyed(this);
+    }
+// test_unique_deleter
+// Object with std::unique_ptr<T, D> where D is not matching the base class
+// Object with a protected destructor
+class MyObject4a;
+std::unordered_set<MyObject4a *> myobject4a_instances;
+class MyObject4a {
+    explicit MyObject4a(int i) {
+        value = i;
+        print_created(this);
+        myobject4a_instances.insert(this);
+    };
+    int value;
+    static void cleanupAllInstances() {
+        auto tmp = std::move(myobject4a_instances);
+        myobject4a_instances.clear();
+        for (auto o : tmp)
+            delete o;
+    }
+    virtual ~MyObject4a() {
+        myobject4a_instances.erase(this);
+        print_destroyed(this);
+    }
+// Object derived but with public destructor and no Deleter in default holder
+class MyObject4b : public MyObject4a {
+    explicit MyObject4b(int i) : MyObject4a(i) { print_created(this); }
+    ~MyObject4b() override { print_destroyed(this); }
+// test_large_holder
+class MyObject5 { // managed by huge_unique_ptr
+    explicit MyObject5(int value) : value{value} { print_created(this); }
+    ~MyObject5() { print_destroyed(this); }
+    int value;
+// test_shared_ptr_and_references
+struct SharedPtrRef {
+    struct A {
+        A() { print_created(this); }
+        A(const A &) { print_copy_created(this); }
+        A(A &&) noexcept { print_move_created(this); }
+        ~A() { print_destroyed(this); }
+    };
+    A value = {};
+    std::shared_ptr<A> shared = std::make_shared<A>();
+// test_shared_ptr_from_this_and_references
+struct SharedFromThisRef {
+    struct B : std::enable_shared_from_this<B> {
+        B() { print_created(this); }
+        // NOLINTNEXTLINE(bugprone-copy-constructor-init)
+        B(const B &) : std::enable_shared_from_this<B>() { print_copy_created(this); }
+        B(B &&) noexcept : std::enable_shared_from_this<B>() { print_move_created(this); }
+        ~B() { print_destroyed(this); }
+    };
+    B value = {};
+    std::shared_ptr<B> shared = std::make_shared<B>();
+// Issue #865: shared_from_this doesn't work with virtual inheritance
+struct SharedFromThisVBase : std::enable_shared_from_this<SharedFromThisVBase> {
+    SharedFromThisVBase() = default;
+    SharedFromThisVBase(const SharedFromThisVBase &) = default;
+    virtual ~SharedFromThisVBase() = default;
+struct SharedFromThisVirt : virtual SharedFromThisVBase {};
+// test_move_only_holder
+struct C {
+    C() { print_created(this); }
+    ~C() { print_destroyed(this); }
+// test_holder_with_addressof_operator
+struct TypeForHolderWithAddressOf {
+    TypeForHolderWithAddressOf() { print_created(this); }
+    TypeForHolderWithAddressOf(const TypeForHolderWithAddressOf &) { print_copy_created(this); }
+    TypeForHolderWithAddressOf(TypeForHolderWithAddressOf &&) noexcept {
+        print_move_created(this);
+    }
+    ~TypeForHolderWithAddressOf() { print_destroyed(this); }
+    std::string toString() const {
+        return "TypeForHolderWithAddressOf[" + std::to_string(value) + "]";
+    }
+    int value = 42;
+// test_move_only_holder_with_addressof_operator
+struct TypeForMoveOnlyHolderWithAddressOf {
+    explicit TypeForMoveOnlyHolderWithAddressOf(int value) : value{value} { print_created(this); }
+    ~TypeForMoveOnlyHolderWithAddressOf() { print_destroyed(this); }
+    std::string toString() const {
+        return "MoveOnlyHolderWithAddressOf[" + std::to_string(value) + "]";
+    }
+    int value;
+// test_smart_ptr_from_default
+struct HeldByDefaultHolder { };
+// test_shared_ptr_gc
+// #187: issue involving std::shared_ptr<> return value policy & garbage collection
+struct ElementBase {
+    virtual ~ElementBase() = default; /* Force creation of virtual table */
+    ElementBase() = default;
+    ElementBase(const ElementBase&) = delete;
+struct ElementA : ElementBase {
+    explicit ElementA(int v) : v(v) {}
+    int value() const { return v; }
+    int v;
+struct ElementList {
+    void add(const std::shared_ptr<ElementBase> &e) { l.push_back(e); }
+    std::vector<std::shared_ptr<ElementBase>> l;
+} // namespace
+// ref<T> is a wrapper for 'Object' which uses intrusive reference counting
+// It is always possible to construct a ref<T> from an Object* pointer without
+// possible inconsistencies, hence the 'true' argument at the end.
+// Make pybind11 aware of the non-standard getter member function
+namespace pybind11 { namespace detail {
+    template <typename T>
+    struct holder_helper<ref<T>> {
+        static const T *get(const ref<T> &p) { return p.get_ptr(); }
+    };
+} // namespace detail
+} // namespace pybind11
+// Make pybind aware of the ref-counted wrapper type (s):
+// The following is not required anymore for std::shared_ptr, but it should compile without error:
+PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr<T>);
+PYBIND11_DECLARE_HOLDER_TYPE(T, huge_unique_ptr<T>);
+PYBIND11_DECLARE_HOLDER_TYPE(T, custom_unique_ptr<T>);
+PYBIND11_DECLARE_HOLDER_TYPE(T, shared_ptr_with_addressof_operator<T>);
+PYBIND11_DECLARE_HOLDER_TYPE(T, unique_ptr_with_addressof_operator<T>);
 TEST_SUBMODULE(smart_ptr, m) {
+    // Please do not interleave `struct` and `class` definitions with bindings code,
+    // but implement `struct`s and `class`es in the anonymous namespace above.
+    // This helps keeping the smart_holder branch in sync with master.
     // test_smart_ptr
@@ -94,24 +286,14 @@ TEST_SUBMODULE(smart_ptr, m) {
     py::class_<Object, ref<Object>> obj(m, "Object");
     obj.def("getRefCount", &Object::getRefCount);
-    // Custom object with builtin reference counting (see 'object.h' for the implementation)
-    class MyObject1 : public Object {
-    public:
-        MyObject1(int value) : value(value) { print_created(this, toString()); }
-        std::string toString() const override { return "MyObject1[" + std::to_string(value) + "]"; }
-    protected:
-        ~MyObject1() override { print_destroyed(this); }
-    private:
-        int value;
-    };
     py::class_<MyObject1, ref<MyObject1>>(m, "MyObject1", obj)
     py::implicitly_convertible<py::int_, MyObject1>();
     m.def("make_object_1", []() -> Object * { return new MyObject1(1); });
-    m.def("make_object_2", []() -> ref<Object> { return new MyObject1(2); });
+    m.def("make_object_2", []() -> ref<Object> { return ref<Object>(new MyObject1(2)); });
     m.def("make_myobject1_1", []() -> MyObject1 * { return new MyObject1(4); });
-    m.def("make_myobject1_2", []() -> ref<MyObject1> { return new MyObject1(5); });
+    m.def("make_myobject1_2", []() -> ref<MyObject1> { return ref<MyObject1>(new MyObject1(5)); });
     m.def("print_object_1", [](const Object *obj) { py::print(obj->toString()); });
     m.def("print_object_2", [](ref<Object> obj) { py::print(obj->toString()); });
     m.def("print_object_3", [](const ref<Object> &obj) { py::print(obj->toString()); });
@@ -124,48 +306,29 @@ TEST_SUBMODULE(smart_ptr, m) {
     // Expose constructor stats for the ref type
     m.def("cstats_ref", &ConstructorStats::get<ref_tag>);
-    // Object managed by a std::shared_ptr<>
-    class MyObject2 {
-    public:
-        MyObject2(const MyObject2 &) = default;
-        MyObject2(int value) : value(value) { print_created(this, toString()); }
-        std::string toString() const { return "MyObject2[" + std::to_string(value) + "]"; }
-        virtual ~MyObject2() { print_destroyed(this); }
-    private:
-        int value;
-    };
     py::class_<MyObject2, std::shared_ptr<MyObject2>>(m, "MyObject2")
     m.def("make_myobject2_1", []() { return new MyObject2(6); });
     m.def("make_myobject2_2", []() { return std::make_shared<MyObject2>(7); });
     m.def("print_myobject2_1", [](const MyObject2 *obj) { py::print(obj->toString()); });
+    // NOLINTNEXTLINE(performance-unnecessary-value-param)
     m.def("print_myobject2_2", [](std::shared_ptr<MyObject2> obj) { py::print(obj->toString()); });
     m.def("print_myobject2_3", [](const std::shared_ptr<MyObject2> &obj) { py::print(obj->toString()); });
     m.def("print_myobject2_4", [](const std::shared_ptr<MyObject2> *obj) { py::print((*obj)->toString()); });
-    // Object managed by a std::shared_ptr<>, additionally derives from std::enable_shared_from_this<>
-    class MyObject3 : public std::enable_shared_from_this<MyObject3> {
-    public:
-        MyObject3(const MyObject3 &) = default;
-        MyObject3(int value) : value(value) { print_created(this, toString()); }
-        std::string toString() const { return "MyObject3[" + std::to_string(value) + "]"; }
-        virtual ~MyObject3() { print_destroyed(this); }
-    private:
-        int value;
-    };
     py::class_<MyObject3, std::shared_ptr<MyObject3>>(m, "MyObject3")
     m.def("make_myobject3_1", []() { return new MyObject3(8); });
     m.def("make_myobject3_2", []() { return std::make_shared<MyObject3>(9); });
     m.def("print_myobject3_1", [](const MyObject3 *obj) { py::print(obj->toString()); });
+    // NOLINTNEXTLINE(performance-unnecessary-value-param)
     m.def("print_myobject3_2", [](std::shared_ptr<MyObject3> obj) { py::print(obj->toString()); });
     m.def("print_myobject3_3", [](const std::shared_ptr<MyObject3> &obj) { py::print(obj->toString()); });
     m.def("print_myobject3_4", [](const std::shared_ptr<MyObject3> *obj) { py::print((*obj)->toString()); });
     // test_smart_ptr_refcounting
     m.def("test_object1_refcounting", []() {
-        ref<MyObject1> o = new MyObject1(0);
+        auto o = ref<MyObject1>(new MyObject1(0));
         bool good = o->getRefCount() == 1;
         py::object o2 = py::cast(o, py::return_value_policy::reference);
         // always request (partial) ownership for objects with intrusive
@@ -175,155 +338,88 @@ TEST_SUBMODULE(smart_ptr, m) {
     // test_unique_nodelete
-    // Object with a private destructor
-    class MyObject4 {
-    public:
-        MyObject4(int value) : value{value} { print_created(this); }
-        int value;
-    private:
-        ~MyObject4() { print_destroyed(this); }
-    };
     py::class_<MyObject4, std::unique_ptr<MyObject4, py::nodelete>>(m, "MyObject4")
-        .def_readwrite("value", &MyObject4::value);
+        .def_readwrite("value", &MyObject4::value)
+        .def_static("cleanup_all_instances", &MyObject4::cleanupAllInstances);
     // test_unique_deleter
-    // Object with std::unique_ptr<T, D> where D is not matching the base class
-    // Object with a protected destructor
-    class MyObject4a {
-    public:
-        MyObject4a(int i) {
-            value = i;
-            print_created(this);
-        };
-        int value;
-    protected:
-        virtual ~MyObject4a() { print_destroyed(this); }
-    };
     py::class_<MyObject4a, std::unique_ptr<MyObject4a, py::nodelete>>(m, "MyObject4a")
-        .def_readwrite("value", &MyObject4a::value);
+        .def_readwrite("value", &MyObject4a::value)
+        .def_static("cleanup_all_instances", &MyObject4a::cleanupAllInstances);
-    // Object derived but with public destructor and no Deleter in default holder
-    class MyObject4b : public MyObject4a {
-    public:
-        MyObject4b(int i) : MyObject4a(i) { print_created(this); }
-        ~MyObject4b() override { print_destroyed(this); }
-    };
-    py::class_<MyObject4b, MyObject4a>(m, "MyObject4b")
+    py::class_<MyObject4b, MyObject4a, std::unique_ptr<MyObject4b>>(m, "MyObject4b")
     // test_large_holder
-    class MyObject5 { // managed by huge_unique_ptr
-    public:
-        MyObject5(int value) : value{value} { print_created(this); }
-        ~MyObject5() { print_destroyed(this); }
-        int value;
-    };
     py::class_<MyObject5, huge_unique_ptr<MyObject5>>(m, "MyObject5")
         .def_readwrite("value", &MyObject5::value);
     // test_shared_ptr_and_references
-    struct SharedPtrRef {
-        struct A {
-            A() { print_created(this); }
-            A(const A &) { print_copy_created(this); }
-            A(A &&) { print_move_created(this); }
-            ~A() { print_destroyed(this); }
-        };
-        A value = {};
-        std::shared_ptr<A> shared = std::make_shared<A>();
-    };
     using A = SharedPtrRef::A;
     py::class_<A, std::shared_ptr<A>>(m, "A");
-    py::class_<SharedPtrRef>(m, "SharedPtrRef")
+    py::class_<SharedPtrRef, std::unique_ptr<SharedPtrRef>>(m, "SharedPtrRef")
         .def_readonly("ref", &SharedPtrRef::value)
-        .def_property_readonly("copy", [](const SharedPtrRef &s) { return s.value; },
-                               py::return_value_policy::copy)
+        .def_property_readonly(
+            "copy", [](const SharedPtrRef &s) { return s.value; }, py::return_value_policy::copy)
         .def_readonly("holder_ref", &SharedPtrRef::shared)
-        .def_property_readonly("holder_copy", [](const SharedPtrRef &s) { return s.shared; },
-                               py::return_value_policy::copy)
+        .def_property_readonly(
+            "holder_copy",
+            [](const SharedPtrRef &s) { return s.shared; },
+            py::return_value_policy::copy)
         .def("set_ref", [](SharedPtrRef &, const A &) { return true; })
+        // NOLINTNEXTLINE(performance-unnecessary-value-param)
         .def("set_holder", [](SharedPtrRef &, std::shared_ptr<A>) { return true; });
     // test_shared_ptr_from_this_and_references
-    struct SharedFromThisRef {
-        struct B : std::enable_shared_from_this<B> {
-            B() { print_created(this); }
-            B(const B &) : std::enable_shared_from_this<B>() { print_copy_created(this); }
-            B(B &&) : std::enable_shared_from_this<B>() { print_move_created(this); }
-            ~B() { print_destroyed(this); }
-        };
-        B value = {};
-        std::shared_ptr<B> shared = std::make_shared<B>();
-    };
     using B = SharedFromThisRef::B;
     py::class_<B, std::shared_ptr<B>>(m, "B");
-    py::class_<SharedFromThisRef>(m, "SharedFromThisRef")
+    py::class_<SharedFromThisRef, std::unique_ptr<SharedFromThisRef>>(m, "SharedFromThisRef")
         .def_readonly("bad_wp", &SharedFromThisRef::value)
-        .def_property_readonly("ref", [](const SharedFromThisRef &s) -> const B & { return *s.shared; })
-        .def_property_readonly("copy", [](const SharedFromThisRef &s) { return s.value; },
-                               py::return_value_policy::copy)
+        .def_property_readonly("ref",
+                               [](const SharedFromThisRef &s) -> const B & { return *s.shared; })
+        .def_property_readonly(
+            "copy",
+            [](const SharedFromThisRef &s) { return s.value; },
+            py::return_value_policy::copy)
         .def_readonly("holder_ref", &SharedFromThisRef::shared)
-        .def_property_readonly("holder_copy", [](const SharedFromThisRef &s) { return s.shared; },
-                               py::return_value_policy::copy)
+        .def_property_readonly(
+            "holder_copy",
+            [](const SharedFromThisRef &s) { return s.shared; },
+            py::return_value_policy::copy)
         .def("set_ref", [](SharedFromThisRef &, const B &) { return true; })
+        // NOLINTNEXTLINE(performance-unnecessary-value-param)
         .def("set_holder", [](SharedFromThisRef &, std::shared_ptr<B>) { return true; });
     // Issue #865: shared_from_this doesn't work with virtual inheritance
-    struct SharedFromThisVBase : std::enable_shared_from_this<SharedFromThisVBase> {
-        SharedFromThisVBase() = default;
-        SharedFromThisVBase(const SharedFromThisVBase &) = default;
-        virtual ~SharedFromThisVBase() = default;
-    };
-    struct SharedFromThisVirt : virtual SharedFromThisVBase {};
     static std::shared_ptr<SharedFromThisVirt> sft(new SharedFromThisVirt());
     py::class_<SharedFromThisVirt, std::shared_ptr<SharedFromThisVirt>>(m, "SharedFromThisVirt")
         .def_static("get", []() { return sft.get(); });
     // test_move_only_holder
-    struct C {
-        C() { print_created(this); }
-        ~C() { print_destroyed(this); }
-    };
     py::class_<C, custom_unique_ptr<C>>(m, "TypeWithMoveOnlyHolder")
         .def_static("make", []() { return custom_unique_ptr<C>(new C); })
         .def_static("make_as_object", []() { return py::cast(custom_unique_ptr<C>(new C)); });
     // test_holder_with_addressof_operator
-    struct TypeForHolderWithAddressOf {
-        TypeForHolderWithAddressOf() { print_created(this); }
-        TypeForHolderWithAddressOf(const TypeForHolderWithAddressOf &) { print_copy_created(this); }
-        TypeForHolderWithAddressOf(TypeForHolderWithAddressOf &&) { print_move_created(this); }
-        ~TypeForHolderWithAddressOf() { print_destroyed(this); }
-        std::string toString() const {
-            return "TypeForHolderWithAddressOf[" + std::to_string(value) + "]";
-        }
-        int value = 42;
-    };
     using HolderWithAddressOf = shared_ptr_with_addressof_operator<TypeForHolderWithAddressOf>;
     py::class_<TypeForHolderWithAddressOf, HolderWithAddressOf>(m, "TypeForHolderWithAddressOf")
         .def_static("make", []() { return HolderWithAddressOf(new TypeForHolderWithAddressOf); })
         .def("get", [](const HolderWithAddressOf &self) { return self.get(); })
-        .def("print_object_1", [](const TypeForHolderWithAddressOf *obj) { py::print(obj->toString()); })
+        .def("print_object_1",
+             [](const TypeForHolderWithAddressOf *obj) { py::print(obj->toString()); })
+        // NOLINTNEXTLINE(performance-unnecessary-value-param)
         .def("print_object_2", [](HolderWithAddressOf obj) { py::print(obj.get()->toString()); })
-        .def("print_object_3", [](const HolderWithAddressOf &obj) { py::print(obj.get()->toString()); })
-        .def("print_object_4", [](const HolderWithAddressOf *obj) { py::print((*obj).get()->toString()); });
+        .def("print_object_3",
+             [](const HolderWithAddressOf &obj) { py::print(obj.get()->toString()); })
+        .def("print_object_4",
+             [](const HolderWithAddressOf *obj) { py::print((*obj).get()->toString()); });
     // test_move_only_holder_with_addressof_operator
-    struct TypeForMoveOnlyHolderWithAddressOf {
-        TypeForMoveOnlyHolderWithAddressOf(int value) : value{value} { print_created(this); }
-        ~TypeForMoveOnlyHolderWithAddressOf() { print_destroyed(this); }
-        std::string toString() const {
-            return "MoveOnlyHolderWithAddressOf[" + std::to_string(value) + "]";
-        }
-        int value;
-    };
     using MoveOnlyHolderWithAddressOf = unique_ptr_with_addressof_operator<TypeForMoveOnlyHolderWithAddressOf>;
     py::class_<TypeForMoveOnlyHolderWithAddressOf, MoveOnlyHolderWithAddressOf>(m, "TypeForMoveOnlyHolderWithAddressOf")
         .def_static("make", []() { return MoveOnlyHolderWithAddressOf(new TypeForMoveOnlyHolderWithAddressOf(0)); })
@@ -331,33 +427,19 @@ TEST_SUBMODULE(smart_ptr, m) {
         .def("print_object", [](const TypeForMoveOnlyHolderWithAddressOf *obj) { py::print(obj->toString()); });
     // test_smart_ptr_from_default
-    struct HeldByDefaultHolder { };
-    py::class_<HeldByDefaultHolder>(m, "HeldByDefaultHolder")
+    py::class_<HeldByDefaultHolder, std::unique_ptr<HeldByDefaultHolder>>(m, "HeldByDefaultHolder")
+        // NOLINTNEXTLINE(performance-unnecessary-value-param)
         .def_static("load_shared_ptr", [](std::shared_ptr<HeldByDefaultHolder>) {});
     // test_shared_ptr_gc
     // #187: issue involving std::shared_ptr<> return value policy & garbage collection
-    struct ElementBase {
-        virtual ~ElementBase() = default; /* Force creation of virtual table */
-        ElementBase() = default;
-        ElementBase(const ElementBase&) = delete;
-    };
     py::class_<ElementBase, std::shared_ptr<ElementBase>>(m, "ElementBase");
-    struct ElementA : ElementBase {
-        ElementA(int v) : v(v) { }
-        int value() { return v; }
-        int v;
-    };
     py::class_<ElementA, ElementBase, std::shared_ptr<ElementA>>(m, "ElementA")
         .def("value", &ElementA::value);
-    struct ElementList {
-        void add(std::shared_ptr<ElementBase> e) { l.push_back(e); }
-        std::vector<std::shared_ptr<ElementBase>> l;
-    };
     py::class_<ElementList, std::shared_ptr<ElementList>>(m, "ElementList")
         .def("add", &ElementList::add)
diff --git a/wrap/pybind11/tests/test_smart_ptr.py b/wrap/pybind11/tests/test_smart_ptr.py
index 0b1ca45b5a..85f61a3223 100644
--- a/wrap/pybind11/tests/test_smart_ptr.py
+++ b/wrap/pybind11/tests/test_smart_ptr.py
@@ -7,7 +7,9 @@
 def test_smart_ptr(capture):
     # Object1
-    for i, o in enumerate([m.make_object_1(), m.make_object_2(), m.MyObject1(3)], start=1):
+    for i, o in enumerate(
+        [m.make_object_1(), m.make_object_2(), m.MyObject1(3)], start=1
+    ):
         assert o.getRefCount() == 1
         with capture:
@@ -16,8 +18,9 @@ def test_smart_ptr(capture):
         assert capture == "MyObject1[{i}]\n".format(i=i) * 4
-    for i, o in enumerate([m.make_myobject1_1(), m.make_myobject1_2(), m.MyObject1(6), 7],
-                          start=4):
+    for i, o in enumerate(
+        [m.make_myobject1_1(), m.make_myobject1_2(), m.MyObject1(6), 7], start=4
+    ):
         with capture:
             if not isinstance(o, int):
@@ -29,11 +32,15 @@ def test_smart_ptr(capture):
-        assert capture == "MyObject1[{i}]\n".format(i=i) * (4 if isinstance(o, int) else 8)
+        times = 4 if isinstance(o, int) else 8
+        assert capture == "MyObject1[{i}]\n".format(i=i) * times
     cstats = ConstructorStats.get(m.MyObject1)
     assert cstats.alive() == 0
-    expected_values = ['MyObject1[{}]'.format(i) for i in range(1, 7)] + ['MyObject1[7]'] * 4
+    expected_values = ["MyObject1[{}]".format(i) for i in range(1, 7)] + [
+        "MyObject1[7]"
+    ] * 4
     assert cstats.values() == expected_values
     assert cstats.default_constructions == 0
     assert cstats.copy_constructions == 0
@@ -42,7 +49,9 @@ def test_smart_ptr(capture):
     assert cstats.move_assignments == 0
     # Object2
-    for i, o in zip([8, 6, 7], [m.MyObject2(8), m.make_myobject2_1(), m.make_myobject2_2()]):
+    for i, o in zip(
+        [8, 6, 7], [m.MyObject2(8), m.make_myobject2_1(), m.make_myobject2_2()]
+    ):
         with capture:
@@ -55,7 +64,7 @@ def test_smart_ptr(capture):
     assert cstats.alive() == 1
     o = None
     assert cstats.alive() == 0
-    assert cstats.values() == ['MyObject2[8]', 'MyObject2[6]', 'MyObject2[7]']
+    assert cstats.values() == ["MyObject2[8]", "MyObject2[6]", "MyObject2[7]"]
     assert cstats.default_constructions == 0
     assert cstats.copy_constructions == 0
     # assert cstats.move_constructions >= 0 # Doesn't invoke any
@@ -63,7 +72,9 @@ def test_smart_ptr(capture):
     assert cstats.move_assignments == 0
     # Object3
-    for i, o in zip([9, 8, 9], [m.MyObject3(9), m.make_myobject3_1(), m.make_myobject3_2()]):
+    for i, o in zip(
+        [9, 8, 9], [m.MyObject3(9), m.make_myobject3_1(), m.make_myobject3_2()]
+    ):
         with capture:
@@ -76,7 +87,7 @@ def test_smart_ptr(capture):
     assert cstats.alive() == 1
     o = None
     assert cstats.alive() == 0
-    assert cstats.values() == ['MyObject3[9]', 'MyObject3[8]', 'MyObject3[9]']
+    assert cstats.values() == ["MyObject3[9]", "MyObject3[8]", "MyObject3[9]"]
     assert cstats.default_constructions == 0
     assert cstats.copy_constructions == 0
     # assert cstats.move_constructions >= 0 # Doesn't invoke any
@@ -96,7 +107,7 @@ def test_smart_ptr(capture):
     # ref<>
     cstats = m.cstats_ref()
     assert cstats.alive() == 0
-    assert cstats.values() == ['from pointer'] * 10
+    assert cstats.values() == ["from pointer"] * 10
     assert cstats.default_constructions == 30
     assert cstats.copy_constructions == 12
     # assert cstats.move_constructions >= 0 # Doesn't invoke any
@@ -114,7 +125,9 @@ def test_unique_nodelete():
     cstats = ConstructorStats.get(m.MyObject4)
     assert cstats.alive() == 1
     del o
-    assert cstats.alive() == 1  # Leak, but that's intentional
+    assert cstats.alive() == 1
+    m.MyObject4.cleanup_all_instances()
+    assert cstats.alive() == 0
 def test_unique_nodelete4a():
@@ -123,19 +136,25 @@ def test_unique_nodelete4a():
     cstats = ConstructorStats.get(m.MyObject4a)
     assert cstats.alive() == 1
     del o
-    assert cstats.alive() == 1  # Leak, but that's intentional
+    assert cstats.alive() == 1
+    m.MyObject4a.cleanup_all_instances()
+    assert cstats.alive() == 0
 def test_unique_deleter():
+    m.MyObject4a(0)
     o = m.MyObject4b(23)
     assert o.value == 23
     cstats4a = ConstructorStats.get(m.MyObject4a)
-    assert cstats4a.alive() == 2  # Two because of previous test
+    assert cstats4a.alive() == 2
     cstats4b = ConstructorStats.get(m.MyObject4b)
     assert cstats4b.alive() == 1
     del o
-    assert cstats4a.alive() == 1  # Should now only be one leftover from previous test
+    assert cstats4a.alive() == 1  # Should now only be one leftover
     assert cstats4b.alive() == 0  # Should be deleted
+    m.MyObject4a.cleanup_all_instances()
+    assert cstats4a.alive() == 0
+    assert cstats4b.alive() == 0
 def test_large_holder():
@@ -186,7 +205,9 @@ def test_shared_ptr_from_this_and_references():
     ref = s.ref  # init_holder_helper(holder_ptr=false, owned=false, bad_wp=false)
     assert stats.alive() == 2
     assert s.set_ref(ref)
-    assert s.set_holder(ref)  # std::enable_shared_from_this can create a holder from a reference
+    assert s.set_holder(
+        ref
+    )  # std::enable_shared_from_this can create a holder from a reference
     bad_wp = s.bad_wp  # init_holder_helper(holder_ptr=false, owned=false, bad_wp=true)
     assert stats.alive() == 2
@@ -200,12 +221,16 @@ def test_shared_ptr_from_this_and_references():
     assert s.set_ref(copy)
     assert s.set_holder(copy)
-    holder_ref = s.holder_ref  # init_holder_helper(holder_ptr=true, owned=false, bad_wp=false)
+    holder_ref = (
+        s.holder_ref
+    )  # init_holder_helper(holder_ptr=true, owned=false, bad_wp=false)
     assert stats.alive() == 3
     assert s.set_ref(holder_ref)
     assert s.set_holder(holder_ref)
-    holder_copy = s.holder_copy  # init_holder_helper(holder_ptr=true, owned=true, bad_wp=false)
+    holder_copy = (
+        s.holder_copy
+    )  # init_holder_helper(holder_ptr=true, owned=true, bad_wp=false)
     assert stats.alive() == 3
     assert s.set_ref(holder_copy)
     assert s.set_holder(holder_copy)
@@ -277,8 +302,10 @@ def test_smart_ptr_from_default():
     instance = m.HeldByDefaultHolder()
     with pytest.raises(RuntimeError) as excinfo:
-    assert "Unable to load a custom holder type from a " \
-           "default-holder instance" in str(excinfo.value)
+    assert (
+        "Unable to load a custom holder type from a "
+        "default-holder instance" in str(excinfo.value)
+    )
 def test_shared_ptr_gc():
diff --git a/wrap/pybind11/tests/test_stl.cpp b/wrap/pybind11/tests/test_stl.cpp
index 0590162770..bc5c6553a2 100644
--- a/wrap/pybind11/tests/test_stl.cpp
+++ b/wrap/pybind11/tests/test_stl.cpp
@@ -11,9 +11,26 @@
 #include "constructor_stats.h"
 #include <pybind11/stl.h>
+#include <pybind11/stl/filesystem.h>
 #include <vector>
 #include <string>
+#if defined(PYBIND11_TEST_BOOST)
+#include <boost/optional.hpp>
+namespace pybind11 { namespace detail {
+template <typename T>
+struct type_caster<boost::optional<T>> : optional_caster<boost::optional<T>> {};
+template <>
+struct type_caster<boost::none_t> : void_caster<boost::none_t> {};
+}} // namespace pybind11::detail
 // Test with `std::variant` in C++17 mode, or with `boost::variant` in C++11/14
 #if defined(PYBIND11_HAS_VARIANT)
 using std::variant;
@@ -40,7 +57,8 @@ PYBIND11_MAKE_OPAQUE(std::vector<std::string, std::allocator<std::string>>);
 /// Issue #528: templated constructor
 struct TplCtorClass {
-    template <typename T> TplCtorClass(const T &) { }
+    template <typename T>
+    explicit TplCtorClass(const T &) {}
     bool operator==(const TplCtorClass &) const { return true; }
@@ -53,7 +71,8 @@ namespace std {
 template <template <typename> class OptionalImpl, typename T>
 struct OptionalHolder
-    OptionalHolder() = default;
+    // NOLINTNEXTLINE(modernize-use-equals-default): breaks GCC 4.8
+    OptionalHolder() {};
     bool member_initialized() const {
         return member && member->initialized;
@@ -61,6 +80,95 @@ struct OptionalHolder
+enum class EnumType {
+  kSet = 42,
+  kUnset = 85,
+// This is used to test that return-by-ref and return-by-copy policies are
+// handled properly for optional types. This is a regression test for a dangling
+// reference issue. The issue seemed to require the enum value type to
+// reproduce - it didn't seem to happen if the value type is just an integer.
+template <template <typename> class OptionalImpl>
+class OptionalProperties {
+    using OptionalEnumValue = OptionalImpl<EnumType>;
+    OptionalProperties() : value(EnumType::kSet) {}
+    ~OptionalProperties() {
+        // Reset value to detect use-after-destruction.
+        // This is set to a specific value rather than nullopt to ensure that
+        // the memory that contains the value gets re-written.
+        value = EnumType::kUnset;
+    }
+    OptionalEnumValue& access_by_ref() { return value; }
+    OptionalEnumValue access_by_copy() { return value; }
+    OptionalEnumValue value;
+// This type mimics aspects of boost::optional from old versions of Boost,
+// which exposed a dangling reference bug in Pybind11. Recent versions of
+// boost::optional, as well as libstdc++'s std::optional, don't seem to be
+// affected by the same issue. This is meant to be a minimal implementation
+// required to reproduce the issue, not fully standard-compliant.
+// See issue #3330 for more details.
+template <typename T>
+class ReferenceSensitiveOptional {
+    using value_type = T;
+    ReferenceSensitiveOptional() = default;
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    ReferenceSensitiveOptional(const T& value) : storage{value} {}
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    ReferenceSensitiveOptional(T&& value) : storage{std::move(value)} {}
+    ReferenceSensitiveOptional& operator=(const T& value) {
+        storage = {value};
+        return *this;
+    }
+    ReferenceSensitiveOptional& operator=(T&& value) {
+        storage = {std::move(value)};
+        return *this;
+    }
+    template <typename... Args>
+    T& emplace(Args&&... args) {
+        storage.clear();
+        storage.emplace_back(std::forward<Args>(args)...);
+        return storage.back();
+    }
+    const T& value() const noexcept {
+        assert(!storage.empty());
+        return storage[0];
+    }
+    const T& operator*() const noexcept {
+        return value();
+    }
+    const T* operator->() const noexcept {
+        return &value();
+    }
+    explicit operator bool() const noexcept {
+        return !storage.empty();
+    }
+    std::vector<T> storage;
+namespace pybind11 { namespace detail {
+template <typename T>
+struct type_caster<ReferenceSensitiveOptional<T>> : optional_caster<ReferenceSensitiveOptional<T>> {};
+} // namespace detail
+} // namespace pybind11
     // test_vector
     m.def("cast_vector", []() { return std::vector<int>{1}; });
@@ -97,7 +205,7 @@ TEST_SUBMODULE(stl, m) {
     // test_set
     m.def("cast_set", []() { return std::set<std::string>{"key1", "key2"}; });
     m.def("load_set", [](const std::set<std::string> &set) {
-        return set.count("key1") && set.count("key2") && set.count("key3");
+        return (set.count("key1") != 0u) && (set.count("key2") != 0u) && (set.count("key3") != 0u);
     // test_recursive_casting
@@ -139,6 +247,10 @@ TEST_SUBMODULE(stl, m) {
         return v;
+    pybind11::enum_<EnumType>(m, "EnumType")
+        .value("kSet", EnumType::kSet)
+        .value("kUnset", EnumType::kUnset);
     // test_move_out_container
     struct MoveOutContainer {
         struct Value { int value; };
@@ -191,9 +303,7 @@ TEST_SUBMODULE(stl, m) {
     m.def("double_or_zero", [](const opt_int& x) -> int {
         return x.value_or(0) * 2;
-    m.def("half_or_none", [](int x) -> opt_int {
-        return x ? opt_int(x / 2) : opt_int();
-    });
+    m.def("half_or_none", [](int x) -> opt_int { return x != 0 ? opt_int(x / 2) : opt_int(); });
     m.def("test_nullopt", [](opt_int x) {
         return x.value_or(42);
     }, py::arg_v("x", std::nullopt, "None"));
@@ -202,13 +312,19 @@ TEST_SUBMODULE(stl, m) {
     }, py::arg_v("x", std::nullopt, "None"));
     m.def("nodefer_none_optional", [](std::optional<int>) { return true; });
-    m.def("nodefer_none_optional", [](py::none) { return false; });
+    m.def("nodefer_none_optional", [](const py::none &) { return false; });
     using opt_holder = OptionalHolder<std::optional, MoveOutDetector>;
     py::class_<opt_holder>(m, "OptionalHolder", "Class with optional member")
         .def_readonly("member", &opt_holder::member)
         .def("member_initialized", &opt_holder::member_initialized);
+    using opt_props = OptionalProperties<std::optional>;
+    pybind11::class_<opt_props>(m, "OptionalProperties")
+        .def(pybind11::init<>())
+        .def_property_readonly("access_by_ref", &opt_props::access_by_ref)
+        .def_property_readonly("access_by_copy", &opt_props::access_by_copy);
@@ -235,6 +351,79 @@ TEST_SUBMODULE(stl, m) {
         .def_readonly("member", &opt_exp_holder::member)
         .def("member_initialized", &opt_exp_holder::member_initialized);
+    using opt_exp_props = OptionalProperties<std::experimental::optional>;
+    pybind11::class_<opt_exp_props>(m, "OptionalExpProperties")
+        .def(pybind11::init<>())
+        .def_property_readonly("access_by_ref", &opt_exp_props::access_by_ref)
+        .def_property_readonly("access_by_copy", &opt_exp_props::access_by_copy);
+#if defined(PYBIND11_TEST_BOOST)
+    // test_boost_optional
+    m.attr("has_boost_optional") = true;
+    using boost_opt_int = boost::optional<int>;
+    using boost_opt_no_assign = boost::optional<NoAssign>;
+    m.def("double_or_zero_boost", [](const boost_opt_int& x) -> int {
+        return x.value_or(0) * 2;
+    });
+    m.def("half_or_none_boost", [](int x) -> boost_opt_int {
+        return x != 0 ? boost_opt_int(x / 2) : boost_opt_int();
+    });
+    m.def("test_nullopt_boost", [](boost_opt_int x) {
+        return x.value_or(42);
+    }, py::arg_v("x", boost::none, "None"));
+    m.def("test_no_assign_boost", [](const boost_opt_no_assign &x) {
+        return x ? x->value : 42;
+    }, py::arg_v("x", boost::none, "None"));
+    using opt_boost_holder = OptionalHolder<boost::optional, MoveOutDetector>;
+    py::class_<opt_boost_holder>(m, "OptionalBoostHolder", "Class with optional member")
+        .def(py::init<>())
+        .def_readonly("member", &opt_boost_holder::member)
+        .def("member_initialized", &opt_boost_holder::member_initialized);
+    using opt_boost_props = OptionalProperties<boost::optional>;
+    pybind11::class_<opt_boost_props>(m, "OptionalBoostProperties")
+        .def(pybind11::init<>())
+        .def_property_readonly("access_by_ref", &opt_boost_props::access_by_ref)
+        .def_property_readonly("access_by_copy", &opt_boost_props::access_by_copy);
+    // test_refsensitive_optional
+    using refsensitive_opt_int = ReferenceSensitiveOptional<int>;
+    using refsensitive_opt_no_assign = ReferenceSensitiveOptional<NoAssign>;
+    m.def("double_or_zero_refsensitive", [](const refsensitive_opt_int& x) -> int {
+        return (x ? x.value() : 0) * 2;
+    });
+    m.def("half_or_none_refsensitive", [](int x) -> refsensitive_opt_int {
+        return x != 0 ? refsensitive_opt_int(x / 2) : refsensitive_opt_int();
+    });
+    // NOLINTNEXTLINE(performance-unnecessary-value-param)
+    m.def("test_nullopt_refsensitive", [](refsensitive_opt_int x) {
+        return x ? x.value() : 42;
+    }, py::arg_v("x", refsensitive_opt_int(), "None"));
+    m.def("test_no_assign_refsensitive", [](const refsensitive_opt_no_assign &x) {
+        return x ? x->value : 42;
+    }, py::arg_v("x", refsensitive_opt_no_assign(), "None"));
+    using opt_refsensitive_holder = OptionalHolder<ReferenceSensitiveOptional, MoveOutDetector>;
+    py::class_<opt_refsensitive_holder>(m, "OptionalRefSensitiveHolder", "Class with optional member")
+        .def(py::init<>())
+        .def_readonly("member", &opt_refsensitive_holder::member)
+        .def("member_initialized", &opt_refsensitive_holder::member_initialized);
+    using opt_refsensitive_props = OptionalProperties<ReferenceSensitiveOptional>;
+    pybind11::class_<opt_refsensitive_props>(m, "OptionalRefSensitiveProperties")
+        .def(pybind11::init<>())
+        .def_property_readonly("access_by_ref", &opt_refsensitive_props::access_by_ref)
+        .def_property_readonly("access_by_copy", &opt_refsensitive_props::access_by_copy);
+    // test_fs_path
+    m.attr("has_filesystem") = true;
+    m.def("parent_path", [](const std::filesystem::path& p) { return p.parent_path(); });
@@ -245,13 +434,13 @@ TEST_SUBMODULE(stl, m) {
         using result_type = const char *;
         result_type operator()(int) { return "int"; }
-        result_type operator()(std::string) { return "std::string"; }
+        result_type operator()(const std::string &) { return "std::string"; }
         result_type operator()(double) { return "double"; }
         result_type operator()(std::nullptr_t) { return "std::nullptr_t"; }
     // test_variant
-    m.def("load_variant", [](variant<int, std::string, double, std::nullptr_t> v) {
+    m.def("load_variant", [](const variant<int, std::string, double, std::nullptr_t> &v) {
         return py::detail::visit_helper<variant>::call(visitor(), v);
     m.def("load_variant_2pass", [](variant<double, int> v) {
@@ -270,8 +459,12 @@ TEST_SUBMODULE(stl, m) {
     m.def("tpl_ctor_set", [](std::unordered_set<TplCtorClass> &) {});
 #if defined(PYBIND11_HAS_OPTIONAL)
     m.def("tpl_constr_optional", [](std::optional<TplCtorClass> &) {});
-#elif defined(PYBIND11_HAS_EXP_OPTIONAL)
-    m.def("tpl_constr_optional", [](std::experimental::optional<TplCtorClass> &) {});
+    m.def("tpl_constr_optional_exp", [](std::experimental::optional<TplCtorClass> &) {});
+#if defined(PYBIND11_TEST_BOOST)
+    m.def("tpl_constr_optional_boost", [](boost::optional<TplCtorClass> &) {});
     // test_vec_of_reference_wrapper
@@ -287,9 +480,11 @@ TEST_SUBMODULE(stl, m) {
     m.def("stl_pass_by_pointer", [](std::vector<int>* v) { return *v; }, "v"_a=nullptr);
     // #1258: pybind11/stl.h converts string to vector<string>
-    m.def("func_with_string_or_vector_string_arg_overload", [](std::vector<std::string>) { return 1; });
-    m.def("func_with_string_or_vector_string_arg_overload", [](std::list<std::string>) { return 2; });
-    m.def("func_with_string_or_vector_string_arg_overload", [](std::string) { return 3; });
+    m.def("func_with_string_or_vector_string_arg_overload",
+          [](const std::vector<std::string> &) { return 1; });
+    m.def("func_with_string_or_vector_string_arg_overload",
+          [](const std::list<std::string> &) { return 2; });
+    m.def("func_with_string_or_vector_string_arg_overload", [](const std::string &) { return 3; });
     class Placeholder {
@@ -321,4 +516,10 @@ TEST_SUBMODULE(stl, m) {
     py::class_<Issue1561Outer>(m, "Issue1561Outer")
         .def_readwrite("list", &Issue1561Outer::list);
+    m.def(
+        "return_vector_bool_raw_ptr",
+        []() { return new std::vector<bool>(4513); },
+        // Without explicitly specifying `take_ownership`, this function leaks.
+        py::return_value_policy::take_ownership);
diff --git a/wrap/pybind11/tests/test_stl.py b/wrap/pybind11/tests/test_stl.py
index 141b3e8492..e217975944 100644
--- a/wrap/pybind11/tests/test_stl.py
+++ b/wrap/pybind11/tests/test_stl.py
@@ -1,9 +1,8 @@
 # -*- coding: utf-8 -*-
 import pytest
+from pybind11_tests import ConstructorStats, UserType
 from pybind11_tests import stl as m
-from pybind11_tests import UserType
-from pybind11_tests import ConstructorStats
 def test_vector(doc):
@@ -88,7 +87,7 @@ def test_recursive_casting():
     assert m.cast_rv_nested() == [[[{"b": "rvalue", "c": "rvalue"}], [{"a": "rvalue"}]]]
     assert m.cast_lv_nested() == {
         "a": [[["lvalue", "lvalue"]], [["lvalue", "lvalue"]]],
-        "b": [[["lvalue", "lvalue"], ["lvalue", "lvalue"]]]
+        "b": [[["lvalue", "lvalue"], ["lvalue", "lvalue"]]],
     # Issue #853 test case:
@@ -106,15 +105,15 @@ def test_move_out_container():
     assert [x.value for x in moved_out_list] == [0, 1, 2]
-@pytest.mark.skipif(not hasattr(m, "has_optional"), reason='no <optional>')
+@pytest.mark.skipif(not hasattr(m, "has_optional"), reason="no <optional>")
 def test_optional():
     assert m.double_or_zero(None) == 0
     assert m.double_or_zero(42) == 84
-    pytest.raises(TypeError, m.double_or_zero, 'foo')
+    pytest.raises(TypeError, m.double_or_zero, "foo")
     assert m.half_or_none(0) is None
     assert m.half_or_none(42) == 21
-    pytest.raises(TypeError, m.half_or_none, 'foo')
+    pytest.raises(TypeError, m.half_or_none, "foo")
     assert m.test_nullopt() == 42
     assert m.test_nullopt(None) == 42
@@ -133,16 +132,22 @@ def test_optional():
     assert mvalue.initialized
     assert holder.member_initialized()
+    props = m.OptionalProperties()
+    assert int(props.access_by_ref) == 42
+    assert int(props.access_by_copy) == 42
-@pytest.mark.skipif(not hasattr(m, "has_exp_optional"), reason='no <experimental/optional>')
+    not hasattr(m, "has_exp_optional"), reason="no <experimental/optional>"
 def test_exp_optional():
     assert m.double_or_zero_exp(None) == 0
     assert m.double_or_zero_exp(42) == 84
-    pytest.raises(TypeError, m.double_or_zero_exp, 'foo')
+    pytest.raises(TypeError, m.double_or_zero_exp, "foo")
     assert m.half_or_none_exp(0) is None
     assert m.half_or_none_exp(42) == 21
-    pytest.raises(TypeError, m.half_or_none_exp, 'foo')
+    pytest.raises(TypeError, m.half_or_none_exp, "foo")
     assert m.test_nullopt_exp() == 42
     assert m.test_nullopt_exp(None) == 42
@@ -159,8 +164,90 @@ def test_exp_optional():
     assert mvalue.initialized
     assert holder.member_initialized()
+    props = m.OptionalExpProperties()
+    assert int(props.access_by_ref) == 42
+    assert int(props.access_by_copy) == 42
+@pytest.mark.skipif(not hasattr(m, "has_boost_optional"), reason="no <boost/optional>")
+def test_boost_optional():
+    assert m.double_or_zero_boost(None) == 0
+    assert m.double_or_zero_boost(42) == 84
+    pytest.raises(TypeError, m.double_or_zero_boost, "foo")
+    assert m.half_or_none_boost(0) is None
+    assert m.half_or_none_boost(42) == 21
+    pytest.raises(TypeError, m.half_or_none_boost, "foo")
+    assert m.test_nullopt_boost() == 42
+    assert m.test_nullopt_boost(None) == 42
+    assert m.test_nullopt_boost(42) == 42
+    assert m.test_nullopt_boost(43) == 43
+    assert m.test_no_assign_boost() == 42
+    assert m.test_no_assign_boost(None) == 42
+    assert m.test_no_assign_boost(m.NoAssign(43)) == 43
+    pytest.raises(TypeError, m.test_no_assign_boost, 43)
+    holder = m.OptionalBoostHolder()
+    mvalue = holder.member
+    assert mvalue.initialized
+    assert holder.member_initialized()
+    props = m.OptionalBoostProperties()
+    assert int(props.access_by_ref) == 42
+    assert int(props.access_by_copy) == 42
+def test_reference_sensitive_optional():
+    assert m.double_or_zero_refsensitive(None) == 0
+    assert m.double_or_zero_refsensitive(42) == 84
+    pytest.raises(TypeError, m.double_or_zero_refsensitive, "foo")
+    assert m.half_or_none_refsensitive(0) is None
+    assert m.half_or_none_refsensitive(42) == 21
+    pytest.raises(TypeError, m.half_or_none_refsensitive, "foo")
+    assert m.test_nullopt_refsensitive() == 42
+    assert m.test_nullopt_refsensitive(None) == 42
+    assert m.test_nullopt_refsensitive(42) == 42
+    assert m.test_nullopt_refsensitive(43) == 43
-@pytest.mark.skipif(not hasattr(m, "load_variant"), reason='no <variant>')
+    assert m.test_no_assign_refsensitive() == 42
+    assert m.test_no_assign_refsensitive(None) == 42
+    assert m.test_no_assign_refsensitive(m.NoAssign(43)) == 43
+    pytest.raises(TypeError, m.test_no_assign_refsensitive, 43)
+    holder = m.OptionalRefSensitiveHolder()
+    mvalue = holder.member
+    assert mvalue.initialized
+    assert holder.member_initialized()
+    props = m.OptionalRefSensitiveProperties()
+    assert int(props.access_by_ref) == 42
+    assert int(props.access_by_copy) == 42
+@pytest.mark.skipif(not hasattr(m, "has_filesystem"), reason="no <filesystem>")
+def test_fs_path():
+    from pathlib import Path
+    class PseudoStrPath:
+        def __fspath__(self):
+            return "foo/bar"
+    class PseudoBytesPath:
+        def __fspath__(self):
+            return b"foo/bar"
+    assert m.parent_path(Path("foo/bar")) == Path("foo")
+    assert m.parent_path("foo/bar") == Path("foo")
+    assert m.parent_path(b"foo/bar") == Path("foo")
+    assert m.parent_path(PseudoStrPath()) == Path("foo")
+    assert m.parent_path(PseudoBytesPath()) == Path("foo")
+@pytest.mark.skipif(not hasattr(m, "load_variant"), reason="no <variant>")
 def test_variant(doc):
     assert m.load_variant(1) == "int"
     assert m.load_variant("1") == "std::string"
@@ -172,34 +259,44 @@ def test_variant(doc):
     assert m.cast_variant() == (5, "Hello")
-    assert doc(m.load_variant) == "load_variant(arg0: Union[int, str, float, None]) -> str"
+    assert (
+        doc(m.load_variant) == "load_variant(arg0: Union[int, str, float, None]) -> str"
+    )
 def test_vec_of_reference_wrapper():
     """#171: Can't return reference wrappers (or STL structures containing them)"""
-    assert str(m.return_vec_of_reference_wrapper(UserType(4))) == \
-        "[UserType(1), UserType(2), UserType(3), UserType(4)]"
+    assert (
+        str(m.return_vec_of_reference_wrapper(UserType(4)))
+        == "[UserType(1), UserType(2), UserType(3), UserType(4)]"
+    )
 def test_stl_pass_by_pointer(msg):
     """Passing nullptr or None to an STL container pointer is not expected to work"""
     with pytest.raises(TypeError) as excinfo:
         m.stl_pass_by_pointer()  # default value is `nullptr`
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         stl_pass_by_pointer(): incompatible function arguments. The following argument types are supported:
             1. (v: List[int] = None) -> List[int]
         Invoked with:
     """  # noqa: E501 line too long
+    )
     with pytest.raises(TypeError) as excinfo:
-    assert msg(excinfo.value) == """
+    assert (
+        msg(excinfo.value)
+        == """
         stl_pass_by_pointer(): incompatible function arguments. The following argument types are supported:
             1. (v: List[int] = None) -> List[int]
         Invoked with: None
     """  # noqa: E501 line too long
+    )
     assert m.stl_pass_by_pointer([1, 2, 3]) == [1, 2, 3]
@@ -209,10 +306,12 @@ def test_missing_header_message():
     <pybind11/stl.h> should result in a helpful suggestion in the error message"""
     import pybind11_cross_module_tests as cm
-    expected_message = ("Did you forget to `#include <pybind11/stl.h>`? Or <pybind11/complex.h>,\n"
-                        "<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some automatic\n"
-                        "conversions are optional and require extra headers to be included\n"
-                        "when compiling your pybind11 module.")
+    expected_message = (
+        "Did you forget to `#include <pybind11/stl.h>`? Or <pybind11/complex.h>,\n"
+        "<pybind11/functional.h>, <pybind11/chrono.h>, etc. Some automatic\n"
+        "conversions are optional and require extra headers to be included\n"
+        "when compiling your pybind11 module."
+    )
     with pytest.raises(TypeError) as excinfo:
         cm.missing_header_arg([1.0, 2.0, 3.0])
@@ -226,9 +325,9 @@ def test_missing_header_message():
 def test_function_with_string_and_vector_string_arg():
     """Check if a string is NOT implicitly converted to a list, which was the
     behavior before fix of issue #1258"""
-    assert m.func_with_string_or_vector_string_arg_overload(('A', 'B', )) == 2
-    assert m.func_with_string_or_vector_string_arg_overload(['A', 'B']) == 2
-    assert m.func_with_string_or_vector_string_arg_overload('A') == 3
+    assert m.func_with_string_or_vector_string_arg_overload(("A", "B")) == 2
+    assert m.func_with_string_or_vector_string_arg_overload(["A", "B"]) == 2
+    assert m.func_with_string_or_vector_string_arg_overload("A") == 3
 def test_stl_ownership():
@@ -245,8 +344,15 @@ def test_array_cast_sequence():
 def test_issue_1561():
-    """ check fix for issue #1561 """
+    """check fix for issue #1561"""
     bar = m.Issue1561Outer()
-    bar.list = [m.Issue1561Inner('bar')]
+    bar.list = [m.Issue1561Inner("bar")]
-    assert bar.list[0].data == 'bar'
+    assert bar.list[0].data == "bar"
+def test_return_vector_bool_raw_ptr():
+    # Add `while True:` for manual leak checking.
+    v = m.return_vector_bool_raw_ptr()
+    assert isinstance(v, list)
+    assert len(v) == 4513
diff --git a/wrap/pybind11/tests/test_stl_binders.cpp b/wrap/pybind11/tests/test_stl_binders.cpp
index 8688874091..6b23e3529f 100644
--- a/wrap/pybind11/tests/test_stl_binders.cpp
+++ b/wrap/pybind11/tests/test_stl_binders.cpp
@@ -18,7 +18,7 @@
 class El {
     El() = delete;
-    El(int v) : a(v) { }
+    explicit El(int v) : a(v) {}
     int a;
@@ -86,13 +86,13 @@ TEST_SUBMODULE(stl_binders, m) {
     // test_noncopyable_containers
     py::bind_vector<std::vector<E_nc>>(m, "VectorENC");
-    m.def("get_vnc", &one_to_n<std::vector<E_nc>>, py::return_value_policy::reference);
+    m.def("get_vnc", &one_to_n<std::vector<E_nc>>);
     py::bind_vector<std::deque<E_nc>>(m, "DequeENC");
-    m.def("get_dnc", &one_to_n<std::deque<E_nc>>, py::return_value_policy::reference);
+    m.def("get_dnc", &one_to_n<std::deque<E_nc>>);
     py::bind_map<std::map<int, E_nc>>(m, "MapENC");
-    m.def("get_mnc", &times_ten<std::map<int, E_nc>>, py::return_value_policy::reference);
+    m.def("get_mnc", &times_ten<std::map<int, E_nc>>);
     py::bind_map<std::unordered_map<int, E_nc>>(m, "UmapENC");
-    m.def("get_umnc", &times_ten<std::unordered_map<int, E_nc>>, py::return_value_policy::reference);
+    m.def("get_umnc", &times_ten<std::unordered_map<int, E_nc>>);
     // Issue #1885: binding nested std::map<X, Container<E>> with E non-copyable
     py::bind_map<std::map<int, std::vector<E_nc>>>(m, "MapVecENC");
     m.def("get_nvnc", [](int n)
@@ -102,11 +102,11 @@ TEST_SUBMODULE(stl_binders, m) {
                 for (int j = 1; j <= n; j++)
             return m;
-        }, py::return_value_policy::reference);
+        });
     py::bind_map<std::map<int, std::map<int, E_nc>>>(m, "MapMapENC");
-    m.def("get_nmnc", &times_hundred<std::map<int, std::map<int, E_nc>>>, py::return_value_policy::reference);
+    m.def("get_nmnc", &times_hundred<std::map<int, std::map<int, E_nc>>>);
     py::bind_map<std::unordered_map<int, std::unordered_map<int, E_nc>>>(m, "UmapUmapENC");
-    m.def("get_numnc", &times_hundred<std::unordered_map<int, std::unordered_map<int, E_nc>>>, py::return_value_policy::reference);
+    m.def("get_numnc", &times_hundred<std::unordered_map<int, std::unordered_map<int, E_nc>>>);
     // test_vector_buffer
     py::bind_vector<std::vector<unsigned char>>(m, "VectorUChar", py::buffer_protocol());
@@ -117,7 +117,7 @@ TEST_SUBMODULE(stl_binders, m) {
     // The rest depends on numpy:
-    try { py::module::import("numpy"); }
+    try { py::module_::import("numpy"); }
     catch (...) { return; }
     // test_vector_buffer_numpy
@@ -125,5 +125,7 @@ TEST_SUBMODULE(stl_binders, m) {
     PYBIND11_NUMPY_DTYPE(VStruct, w, x, y, z);
     py::class_<VStruct>(m, "VStruct").def_readwrite("x", &VStruct::x);
     py::bind_vector<std::vector<VStruct>>(m, "VectorStruct", py::buffer_protocol());
-    m.def("get_vectorstruct", [] {return std::vector<VStruct> {{0, 5, 3.0, 1}, {1, 30, -1e4, 0}};});
+    m.def("get_vectorstruct", [] {
+        return std::vector<VStruct>{{false, 5, 3.0, true}, {true, 30, -1e4, false}};
+    });
diff --git a/wrap/pybind11/tests/test_stl_binders.py b/wrap/pybind11/tests/test_stl_binders.py
index f9b8ea4af2..59c5ab6b5d 100644
--- a/wrap/pybind11/tests/test_stl_binders.py
+++ b/wrap/pybind11/tests/test_stl_binders.py
@@ -1,8 +1,7 @@
 # -*- coding: utf-8 -*-
 import pytest
-import env  # noqa: F401
+import env
 from pybind11_tests import stl_binders as m
@@ -45,7 +44,7 @@ def test_vector_int():
     # test error handling, and that the vector is unchanged
     with pytest.raises(RuntimeError):
-        v_int2.extend([8, 'a'])
+        v_int2.extend([8, "a"])
     assert v_int2 == m.VectorInt([0, 99, 2, 3, 4, 5, 6, 7])
@@ -79,8 +78,8 @@ def test_vector_buffer():
         assert mv[2] == 5
         mv[2] = 6
-        assert mv[2] == '\x05'
-        mv[2] = '\x06'
+        assert mv[2] == "\x05"
+        mv[2] = "\x06"
     assert v[2] == 6
     if not env.PY2:
@@ -114,11 +113,17 @@ def test_vector_buffer_numpy():
     v = m.get_vectorstruct()
     assert v[0].x == 5
     ma = np.asarray(v)
-    ma[1]['x'] = 99
+    ma[1]["x"] = 99
     assert v[1].x == 99
-    v = m.VectorStruct(np.zeros(3, dtype=np.dtype([('w', 'bool'), ('x', 'I'),
-                                                   ('y', 'float64'), ('z', 'bool')], align=True)))
+    v = m.VectorStruct(
+        np.zeros(
+            3,
+            dtype=np.dtype(
+                [("w", "bool"), ("x", "I"), ("y", "float64"), ("z", "bool")], align=True
+            ),
+        )
+    )
     assert len(v) == 3
     b = np.array([1, 2, 3, 4], dtype=np.uint8)
@@ -151,31 +156,59 @@ def test_vector_custom():
 def test_map_string_double():
     mm = m.MapStringDouble()
-    mm['a'] = 1
-    mm['b'] = 2.5
+    mm["a"] = 1
+    mm["b"] = 2.5
-    assert list(mm) == ['a', 'b']
-    assert list(mm.items()) == [('a', 1), ('b', 2.5)]
+    assert list(mm) == ["a", "b"]
     assert str(mm) == "MapStringDouble{a: 1, b: 2.5}"
+    assert "b" in mm
+    assert "c" not in mm
+    assert 123 not in mm
+    # Check that keys, values, items are views, not merely iterable
+    keys = mm.keys()
+    values = mm.values()
+    items = mm.items()
+    assert list(keys) == ["a", "b"]
+    assert len(keys) == 2
+    assert "a" in keys
+    assert "c" not in keys
+    assert 123 not in keys
+    assert list(items) == [("a", 1), ("b", 2.5)]
+    assert len(items) == 2
+    assert ("b", 2.5) in items
+    assert "hello" not in items
+    assert ("b", 2.5, None) not in items
+    assert list(values) == [1, 2.5]
+    assert len(values) == 2
+    assert 1 in values
+    assert 2 not in values
+    # Check that views update when the map is updated
+    mm["c"] = -1
+    assert list(keys) == ["a", "b", "c"]
+    assert list(values) == [1, 2.5, -1]
+    assert list(items) == [("a", 1), ("b", 2.5), ("c", -1)]
     um = m.UnorderedMapStringDouble()
-    um['ua'] = 1.1
-    um['ub'] = 2.6
+    um["ua"] = 1.1
+    um["ub"] = 2.6
-    assert sorted(list(um)) == ['ua', 'ub']
-    assert sorted(list(um.items())) == [('ua', 1.1), ('ub', 2.6)]
+    assert sorted(list(um)) == ["ua", "ub"]
+    assert list(um.keys()) == list(um)
+    assert sorted(list(um.items())) == [("ua", 1.1), ("ub", 2.6)]
+    assert list(zip(um.keys(), um.values())) == list(um.items())
     assert "UnorderedMapStringDouble" in str(um)
 def test_map_string_double_const():
     mc = m.MapStringDoubleConst()
-    mc['a'] = 10
-    mc['b'] = 20.5
+    mc["a"] = 10
+    mc["b"] = 20.5
     assert str(mc) == "MapStringDoubleConst{a: 10, b: 20.5}"
     umc = m.UnorderedMapStringDoubleConst()
-    umc['a'] = 11
-    umc['b'] = 21.5
+    umc["a"] = 11
+    umc["b"] = 21.5
@@ -196,7 +229,7 @@ def test_noncopyable_containers():
     i = 1
     for j in dnc:
-        assert(j.value == i)
+        assert j.value == i
         i += 1
     # std::map
@@ -265,21 +298,21 @@ def test_noncopyable_containers():
 def test_map_delitem():
     mm = m.MapStringDouble()
-    mm['a'] = 1
-    mm['b'] = 2.5
+    mm["a"] = 1
+    mm["b"] = 2.5
-    assert list(mm) == ['a', 'b']
-    assert list(mm.items()) == [('a', 1), ('b', 2.5)]
-    del mm['a']
-    assert list(mm) == ['b']
-    assert list(mm.items()) == [('b', 2.5)]
+    assert list(mm) == ["a", "b"]
+    assert list(mm.items()) == [("a", 1), ("b", 2.5)]
+    del mm["a"]
+    assert list(mm) == ["b"]
+    assert list(mm.items()) == [("b", 2.5)]
     um = m.UnorderedMapStringDouble()
-    um['ua'] = 1.1
-    um['ub'] = 2.6
-    assert sorted(list(um)) == ['ua', 'ub']
-    assert sorted(list(um.items())) == [('ua', 1.1), ('ub', 2.6)]
-    del um['ua']
-    assert sorted(list(um)) == ['ub']
-    assert sorted(list(um.items())) == [('ub', 2.6)]
+    um["ua"] = 1.1
+    um["ub"] = 2.6
+    assert sorted(list(um)) == ["ua", "ub"]
+    assert sorted(list(um.items())) == [("ua", 1.1), ("ub", 2.6)]
+    del um["ua"]
+    assert sorted(list(um)) == ["ub"]
+    assert sorted(list(um.items())) == [("ub", 2.6)]
diff --git a/wrap/pybind11/tests/test_tagbased_polymorphic.cpp b/wrap/pybind11/tests/test_tagbased_polymorphic.cpp
index 838a168d2b..2c7bad8bbc 100644
--- a/wrap/pybind11/tests/test_tagbased_polymorphic.cpp
+++ b/wrap/pybind11/tests/test_tagbased_polymorphic.cpp
@@ -37,33 +37,35 @@ struct Animal
 struct Dog : Animal
-    Dog(const std::string& _name, Kind _kind = Kind::Dog) : Animal(_name, _kind) {}
+    explicit Dog(const std::string &_name, Kind _kind = Kind::Dog) : Animal(_name, _kind) {}
     std::string bark() const { return name_of_kind(kind) + " " + name + " goes " + sound; }
     std::string sound = "WOOF!";
 struct Labrador : Dog
-    Labrador(const std::string& _name, int _excitement = 9001)
+    explicit Labrador(const std::string &_name, int _excitement = 9001)
         : Dog(_name, Kind::Labrador), excitement(_excitement) {}
     int excitement;
 struct Chihuahua : Dog
-    Chihuahua(const std::string& _name) : Dog(_name, Kind::Chihuahua) { sound = "iyiyiyiyiyi"; }
+    explicit Chihuahua(const std::string &_name) : Dog(_name, Kind::Chihuahua) {
+        sound = "iyiyiyiyiyi";
+    }
     std::string bark() const { return Dog::bark() + " and runs in circles"; }
 struct Cat : Animal
-    Cat(const std::string& _name, Kind _kind = Kind::Cat) : Animal(_name, _kind) {}
+    explicit Cat(const std::string &_name, Kind _kind = Kind::Cat) : Animal(_name, _kind) {}
     std::string purr() const { return "mrowr"; }
 struct Panther : Cat
-    Panther(const std::string& _name) : Cat(_name, Kind::Panther) {}
+    explicit Panther(const std::string &_name) : Cat(_name, Kind::Panther) {}
     std::string purr() const { return "mrrrRRRRRR"; }
@@ -86,13 +88,13 @@ std::vector<std::unique_ptr<Animal>> create_zoo()
 const std::type_info* Animal::type_of_kind(Kind kind)
     switch (kind) {
-        case Kind::Unknown: break;
+        case Kind::Unknown:
         case Kind::Dog: break;
         case Kind::Labrador: return &typeid(Labrador);
         case Kind::Chihuahua: return &typeid(Chihuahua);
-        case Kind::LastDog: break;
+        case Kind::LastDog:
         case Kind::Cat: break;
         case Kind::Panther: return &typeid(Panther);
         case Kind::LastCat: break;
diff --git a/wrap/pybind11/tests/test_tagbased_polymorphic.py b/wrap/pybind11/tests/test_tagbased_polymorphic.py
index 94f374da90..64eb8a3c1b 100644
--- a/wrap/pybind11/tests/test_tagbased_polymorphic.py
+++ b/wrap/pybind11/tests/test_tagbased_polymorphic.py
@@ -5,16 +5,24 @@
 def test_downcast():
     zoo = m.create_zoo()
     assert [type(animal) for animal in zoo] == [
-        m.Labrador, m.Dog, m.Chihuahua, m.Cat, m.Panther
+        m.Labrador,
+        m.Dog,
+        m.Chihuahua,
+        m.Cat,
+        m.Panther,
     assert [animal.name for animal in zoo] == [
-        "Fido", "Ginger", "Hertzl", "Tiger", "Leo"
+        "Fido",
+        "Ginger",
+        "Hertzl",
+        "Tiger",
+        "Leo",
     zoo[1].sound = "woooooo"
     assert [dog.bark() for dog in zoo[:3]] == [
         "Labrador Fido goes WOOF!",
         "Dog Ginger goes woooooo",
-        "Chihuahua Hertzl goes iyiyiyiyiyi and runs in circles"
+        "Chihuahua Hertzl goes iyiyiyiyiyi and runs in circles",
     assert [cat.purr() for cat in zoo[3:]] == ["mrowr", "mrrrRRRRRR"]
     zoo[0].excitement -= 1000
diff --git a/wrap/pybind11/tests/test_thread.cpp b/wrap/pybind11/tests/test_thread.cpp
new file mode 100644
index 0000000000..19d91768b3
--- /dev/null
+++ b/wrap/pybind11/tests/test_thread.cpp
@@ -0,0 +1,66 @@
+    tests/test_thread.cpp -- call pybind11 bound methods in threads
+    Copyright (c) 2021 Laramie Leavitt (Google LLC) <lar@google.com>
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+#include <pybind11/cast.h>
+#include <pybind11/pybind11.h>
+#include <chrono>
+#include <thread>
+#include "pybind11_tests.h"
+namespace py = pybind11;
+namespace {
+struct IntStruct {
+    explicit IntStruct(int v) : value(v) {};
+    ~IntStruct() { value = -value; }
+    IntStruct(const IntStruct&) = default;
+    IntStruct& operator=(const IntStruct&) = default;
+    int value;
+} // namespace
+TEST_SUBMODULE(thread, m) {
+    py::class_<IntStruct>(m, "IntStruct").def(py::init([](const int i) { return IntStruct(i); }));
+    // implicitly_convertible uses loader_life_support when an implicit
+    // conversion is required in order to lifetime extend the reference.
+    //
+    // This test should be run with ASAN for better effectiveness.
+    py::implicitly_convertible<int, IntStruct>();
+    m.def("test", [](int expected, const IntStruct &in) {
+        {
+            py::gil_scoped_release release;
+            std::this_thread::sleep_for(std::chrono::milliseconds(5));
+        }
+        if (in.value != expected) {
+            throw std::runtime_error("Value changed!!");
+        }
+    });
+    m.def(
+        "test_no_gil",
+        [](int expected, const IntStruct &in) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(5));
+            if (in.value != expected) {
+                throw std::runtime_error("Value changed!!");
+            }
+        },
+        py::call_guard<py::gil_scoped_release>());
+    // NOTE: std::string_view also uses loader_life_support to ensure that
+    // the string contents remain alive, but that's a C++ 17 feature.
diff --git a/wrap/pybind11/tests/test_thread.py b/wrap/pybind11/tests/test_thread.py
new file mode 100644
index 0000000000..f9db1babaf
--- /dev/null
+++ b/wrap/pybind11/tests/test_thread.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+import threading
+from pybind11_tests import thread as m
+class Thread(threading.Thread):
+    def __init__(self, fn):
+        super(Thread, self).__init__()
+        self.fn = fn
+        self.e = None
+    def run(self):
+        try:
+            for i in range(10):
+                self.fn(i, i)
+        except Exception as e:
+            self.e = e
+    def join(self):
+        super(Thread, self).join()
+        if self.e:
+            raise self.e
+def test_implicit_conversion():
+    a = Thread(m.test)
+    b = Thread(m.test)
+    c = Thread(m.test)
+    for x in [a, b, c]:
+        x.start()
+    for x in [c, b, a]:
+        x.join()
+def test_implicit_conversion_no_gil():
+    a = Thread(m.test_no_gil)
+    b = Thread(m.test_no_gil)
+    c = Thread(m.test_no_gil)
+    for x in [a, b, c]:
+        x.start()
+    for x in [c, b, a]:
+        x.join()
diff --git a/wrap/pybind11/tests/test_virtual_functions.cpp b/wrap/pybind11/tests/test_virtual_functions.cpp
index 4fc04acf45..d98b57f174 100644
--- a/wrap/pybind11/tests/test_virtual_functions.cpp
+++ b/wrap/pybind11/tests/test_virtual_functions.cpp
@@ -15,9 +15,12 @@
 /* This is an example class that we'll want to be able to extend from Python */
 class ExampleVirt  {
-    ExampleVirt(int state) : state(state) { print_created(this, state); }
+    explicit ExampleVirt(int state) : state(state) { print_created(this, state); }
     ExampleVirt(const ExampleVirt &e) : state(e.state) { print_copy_created(this); }
-    ExampleVirt(ExampleVirt &&e) : state(e.state) { print_move_created(this); e.state = 0; }
+    ExampleVirt(ExampleVirt &&e) noexcept : state(e.state) {
+        print_move_created(this);
+        e.state = 0;
+    }
     virtual ~ExampleVirt() { print_destroyed(this); }
     virtual int run(int value) {
@@ -100,13 +103,18 @@ class PyExampleVirt : public ExampleVirt {
 class NonCopyable {
     NonCopyable(int a, int b) : value{new int(a*b)} { print_created(this, a, b); }
-    NonCopyable(NonCopyable &&o) { value = std::move(o.value); print_move_created(this); }
+    NonCopyable(NonCopyable &&o) noexcept {
+        value = std::move(o.value);
+        print_move_created(this);
+    }
     NonCopyable(const NonCopyable &) = delete;
     NonCopyable() = delete;
     void operator=(const NonCopyable &) = delete;
     void operator=(NonCopyable &&) = delete;
     std::string get_value() const {
-        if (value) return std::to_string(*value); else return "(null)";
+        if (value)
+            return std::to_string(*value);
+        return "(null)";
     ~NonCopyable() { print_destroyed(this); }
@@ -120,7 +128,10 @@ class Movable {
     Movable(int a, int b) : value{a+b} { print_created(this, a, b); }
     Movable(const Movable &m) { value = m.value; print_copy_created(this); }
-    Movable(Movable &&m) { value = std::move(m.value); print_move_created(this); }
+    Movable(Movable &&m) noexcept {
+        value = m.value;
+        print_move_created(this);
+    }
     std::string get_value() const { return std::to_string(value); }
     ~Movable() { print_destroyed(this); }
@@ -163,6 +174,25 @@ struct DispatchIssue : Base {
+// An abstract adder class that uses visitor pattern to add two data
+// objects and send the result to the visitor functor
+struct AdderBase {
+    struct Data {};
+    using DataVisitor = std::function<void (const Data&)>;
+    virtual void operator()(const Data& first, const Data& second, const DataVisitor& visitor) const = 0;
+    virtual ~AdderBase() = default;
+    AdderBase() = default;
+    AdderBase(const AdderBase&) = delete;
+struct Adder : AdderBase {
+    void operator()(const Data& first, const Data& second, const DataVisitor& visitor) const override {
+        PYBIND11_OVERRIDE_PURE_NAME(void, AdderBase, "__call__", operator(), first, second, visitor);
+    }
 static void test_gil() {
         py::gil_scoped_acquire lock;
@@ -184,10 +214,29 @@ static void test_gil_from_thread() {
+class test_override_cache_helper {
+    virtual int func() { return 0; }
+    test_override_cache_helper() = default;
+    virtual ~test_override_cache_helper() = default;
+    // Non-copyable
+    test_override_cache_helper &operator=(test_override_cache_helper const &Right) = delete;
+    test_override_cache_helper(test_override_cache_helper const &Copy) = delete;
+class test_override_cache_helper_trampoline : public test_override_cache_helper {
+    int func() override { PYBIND11_OVERRIDE(int, test_override_cache_helper, func); }
+inline int test_override_cache(std::shared_ptr<test_override_cache_helper> const &instance) { return instance->func(); }
 // Forward declaration (so that we can put the main tests here; the inherited virtual approaches are
 // rather long).
-void initialize_inherited_virtuals(py::module &m);
+void initialize_inherited_virtuals(py::module_ &m);
 TEST_SUBMODULE(virtual_functions, m) {
     // test_override
@@ -284,6 +333,27 @@ TEST_SUBMODULE(virtual_functions, m) {
     m.def("dispatch_issue_go", [](const Base * b) { return b->dispatch(); });
+    // test_recursive_dispatch_issue
+    // #3357: Recursive dispatch fails to find python function override
+    pybind11::class_<AdderBase, Adder>(m, "Adder")
+        .def(pybind11::init<>())
+        .def("__call__", &AdderBase::operator());
+    pybind11::class_<AdderBase::Data>(m, "Data")
+        .def(pybind11::init<>());
+    m.def("add2", [](const AdderBase::Data& first, const AdderBase::Data& second,
+                     const AdderBase& adder, const AdderBase::DataVisitor& visitor) {
+        adder(first, second, visitor);
+    });
+    m.def("add3", [](const AdderBase::Data& first, const AdderBase::Data& second, const AdderBase::Data& third,
+                     const AdderBase& adder, const AdderBase::DataVisitor& visitor) {
+        adder(first, second, [&] (const AdderBase::Data& first_plus_second) {
+            adder(first_plus_second, third, visitor); // NOLINT(readability-suspicious-call-argument)
+        });
+    });
     // test_override_ref
     // #392/397: overriding reference-returning functions
     class OverrideTest {
@@ -327,6 +397,12 @@ TEST_SUBMODULE(virtual_functions, m) {
 //      .def("str_ref", &OverrideTest::str_ref)
         .def("A_value", &OverrideTest::A_value)
         .def("A_ref", &OverrideTest::A_ref);
+    py::class_<test_override_cache_helper, test_override_cache_helper_trampoline, std::shared_ptr<test_override_cache_helper>>(m, "test_override_cache_helper")
+        .def(py::init_alias<>())
+        .def("func", &test_override_cache_helper::func);
+    m.def("test_override_cache", test_override_cache);
@@ -443,6 +519,7 @@ template <class Base = B_Tpl>
 class PyB_Tpl : public PyA_Tpl<Base> {
     using PyA_Tpl<Base>::PyA_Tpl; // Inherit constructors (via PyA_Tpl's inherited constructors)
+    // NOLINTNEXTLINE(bugprone-parent-virtual-call)
     int unlucky_number() override { PYBIND11_OVERRIDE(int, Base, unlucky_number, ); }
     double lucky_number() override { PYBIND11_OVERRIDE(double, Base, lucky_number, ); }
@@ -459,7 +536,7 @@ template <class Base = D_Tpl> class PyD_Tpl : public PyC_Tpl<Base> {
-void initialize_inherited_virtuals(py::module &m) {
+void initialize_inherited_virtuals(py::module_ &m) {
     // test_inherited_virtuals
     // Method 1: repeat
diff --git a/wrap/pybind11/tests/test_virtual_functions.py b/wrap/pybind11/tests/test_virtual_functions.py
index 66a353ae7f..4f25cac4a3 100644
--- a/wrap/pybind11/tests/test_virtual_functions.py
+++ b/wrap/pybind11/tests/test_virtual_functions.py
@@ -14,18 +14,18 @@ def __init__(self, state):
             self.data = "Hello world"
         def run(self, value):
-            print('ExtendedExampleVirt::run(%i), calling parent..' % value)
+            print("ExtendedExampleVirt::run(%i), calling parent.." % value)
             return super(ExtendedExampleVirt, self).run(value + 1)
         def run_bool(self):
-            print('ExtendedExampleVirt::run_bool()')
+            print("ExtendedExampleVirt::run_bool()")
             return False
         def get_string1(self):
             return "override1"
         def pure_virtual(self):
-            print('ExtendedExampleVirt::pure_virtual(): %s' % self.data)
+            print("ExtendedExampleVirt::pure_virtual(): %s" % self.data)
     class ExtendedExampleVirt2(ExtendedExampleVirt):
         def __init__(self, state):
@@ -37,21 +37,30 @@ def get_string2(self):
     ex12 = m.ExampleVirt(10)
     with capture:
         assert m.runExampleVirt(ex12, 20) == 30
-    assert capture == """
+    assert (
+        capture
+        == """
         Original implementation of ExampleVirt::run(state=10, value=20, str1=default1, str2=default2)
     """  # noqa: E501 line too long
+    )
     with pytest.raises(RuntimeError) as excinfo:
-    assert msg(excinfo.value) == 'Tried to call pure virtual function "ExampleVirt::pure_virtual"'
+    assert (
+        msg(excinfo.value)
+        == 'Tried to call pure virtual function "ExampleVirt::pure_virtual"'
+    )
     ex12p = ExtendedExampleVirt(10)
     with capture:
         assert m.runExampleVirt(ex12p, 20) == 32
-    assert capture == """
+    assert (
+        capture
+        == """
         ExtendedExampleVirt::run(20), calling parent..
         Original implementation of ExampleVirt::run(state=11, value=21, str1=override1, str2=default2)
     """  # noqa: E501 line too long
+    )
     with capture:
         assert m.runExampleVirtBool(ex12p) is False
     assert capture == "ExtendedExampleVirt::run_bool()"
@@ -62,16 +71,19 @@ def get_string2(self):
     ex12p2 = ExtendedExampleVirt2(15)
     with capture:
         assert m.runExampleVirt(ex12p2, 50) == 68
-    assert capture == """
+    assert (
+        capture
+        == """
         ExtendedExampleVirt::run(50), calling parent..
         Original implementation of ExampleVirt::run(state=17, value=51, str1=override1, str2=override2)
     """  # noqa: E501 line too long
+    )
     cstats = ConstructorStats.get(m.ExampleVirt)
     assert cstats.alive() == 3
     del ex12, ex12p, ex12p2
     assert cstats.alive() == 0
-    assert cstats.values() == ['10', '11', '17']
+    assert cstats.values() == ["10", "11", "17"]
     assert cstats.copy_constructions == 0
     assert cstats.move_constructions >= 0
@@ -82,6 +94,7 @@ def test_alias_delay_initialization1(capture):
     If we just create and use an A instance directly, the trampoline initialization is
     bypassed and we only initialize an A() instead (for performance reasons).
     class B(m.A):
         def __init__(self):
             super(B, self).__init__()
@@ -103,12 +116,15 @@ def f(self):
         del b
-    assert capture == """
+    assert (
+        capture
+        == """
         In python f()
+    )
 def test_alias_delay_initialization2(capture):
@@ -118,6 +134,7 @@ def test_alias_delay_initialization2(capture):
     performance penalty, it also allows us to do more things with the trampoline
     class such as defining local variables and performing construction/destruction.
     class B2(m.A2):
         def __init__(self):
             super(B2, self).__init__()
@@ -135,7 +152,9 @@ def f(self):
         del a3
-    assert capture == """
+    assert (
+        capture
+        == """
@@ -145,6 +164,7 @@ def f(self):
+    )
     # Python subclass version
     with capture:
@@ -152,18 +172,23 @@ def f(self):
         del b2
-    assert capture == """
+    assert (
+        capture
+        == """
         In python B2.f()
+    )
 # PyPy: Reference count > 1 causes call with noncopyable instance
 # to fail in ncv1.print_nc()
-@pytest.mark.skipif(not hasattr(m, "NCVirt"), reason="NCVirt test broken on ICPC")
+    not hasattr(m, "NCVirt"), reason="NCVirt does not work on Intel/PGI/NVCC compilers"
 def test_move_support():
     class NCVirtExt(m.NCVirt):
         def get_noncopyable(self, a, b):
@@ -202,8 +227,8 @@ def get_movable(self, a, b):
     del ncv1, ncv2
     assert nc_stats.alive() == 0
     assert mv_stats.alive() == 0
-    assert nc_stats.values() == ['4', '9', '9', '9']
-    assert mv_stats.values() == ['4', '5', '7', '7']
+    assert nc_stats.values() == ["4", "9", "9", "9"]
+    assert mv_stats.values() == ["4", "5", "7", "7"]
     assert nc_stats.copy_constructions == 0
     assert mv_stats.copy_constructions == 1
     assert nc_stats.move_constructions >= 0
@@ -212,6 +237,7 @@ def get_movable(self, a, b):
 def test_dispatch_issue(msg):
     """#159: virtual function dispatch has problems with similar-named functions"""
     class PyClass1(m.DispatchIssue):
         def dispatch(self):
             return "Yay.."
@@ -220,15 +246,50 @@ class PyClass2(m.DispatchIssue):
         def dispatch(self):
             with pytest.raises(RuntimeError) as excinfo:
                 super(PyClass2, self).dispatch()
-            assert msg(excinfo.value) == 'Tried to call pure virtual function "Base::dispatch"'
+            assert (
+                msg(excinfo.value)
+                == 'Tried to call pure virtual function "Base::dispatch"'
+            )
-            p = PyClass1()
-            return m.dispatch_issue_go(p)
+            return m.dispatch_issue_go(PyClass1())
     b = PyClass2()
     assert m.dispatch_issue_go(b) == "Yay.."
+def test_recursive_dispatch_issue(msg):
+    """#3357: Recursive dispatch fails to find python function override"""
+    class Data(m.Data):
+        def __init__(self, value):
+            super(Data, self).__init__()
+            self.value = value
+    class Adder(m.Adder):
+        def __call__(self, first, second, visitor):
+            # lambda is a workaround, which adds extra frame to the
+            # current CPython thread. Removing lambda reveals the bug
+            # [https://github.com/pybind/pybind11/issues/3357]
+            (lambda: visitor(Data(first.value + second.value)))()
+    class StoreResultVisitor:
+        def __init__(self):
+            self.result = None
+        def __call__(self, data):
+            self.result = data.value
+    store = StoreResultVisitor()
+    m.add2(Data(1), Data(2), Adder(), store)
+    assert store.result == 3
+    # without lambda in Adder class, this function fails with
+    # RuntimeError: Tried to call pure virtual function "AdderBase::__call__"
+    m.add3(Data(1), Data(2), Data(3), Adder(), store)
+    assert store.result == 6
 def test_override_ref():
     """#392/397: overriding reference-returning functions"""
     o = m.OverrideTest("asdf")
@@ -336,7 +397,7 @@ def lucky_number(self):
     class DT(m.D_Tpl):
         def say_something(self, times):
-            return "DT says:" + (' quack' * times)
+            return "DT says:" + (" quack" * times)
         def unlucky_number(self):
             return 1234
@@ -352,7 +413,7 @@ def lucky_number(self):
     class DT2(DT):
         def say_something(self, times):
-            return "DT2: " + ('QUACK' * times)
+            return "DT2: " + ("QUACK" * times)
         def unlucky_number(self):
             return -3
@@ -378,3 +439,22 @@ def test_issue_1454():
     # Fix issue #1454 (crash when acquiring/releasing GIL on another thread in Python 2.7)
+def test_python_override():
+    def func():
+        class Test(m.test_override_cache_helper):
+            def func(self):
+                return 42
+        return Test()
+    def func2():
+        class Test(m.test_override_cache_helper):
+            pass
+        return Test()
+    for _ in range(1500):
+        assert m.test_override_cache(func()) == 42
+        assert m.test_override_cache(func2()) == 0
diff --git a/wrap/pybind11/tests/valgrind-numpy-scipy.supp b/wrap/pybind11/tests/valgrind-numpy-scipy.supp
new file mode 100644
index 0000000000..16db302c17
--- /dev/null
+++ b/wrap/pybind11/tests/valgrind-numpy-scipy.supp
@@ -0,0 +1,140 @@
+# Valgrind suppression file for NumPy & SciPy errors and leaks in pybind11 tests
+# On updating a dependency, to get a list of "default" leaks in e.g. NumPy, run
+# `PYTHONMALLOC=malloc valgrind --leak-check=full --show-leak-kinds=definite,indirect python3.9-dbg -c "import numpy"`
+# To use these suppression files, add e.g. `--suppressions=valgrind-numpy-scipy.supp`
+   Leaks when importing NumPy
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyObject_Malloc
+   fun:_PyObject_GC_Alloc
+   fun:_PyObject_GC_Malloc
+   fun:_PyObject_GC_NewVar
+   fun:tuple_alloc
+   fun:PyTuple_Pack
+   ...
+   fun:__pyx_pymod_exec_*
+   Leaks when importing NumPy (bis)
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyObject_Malloc
+   fun:_PyObject_New
+   fun:PyCode_NewWithPosOnlyArgs
+   fun:PyCode_New
+   ...
+   fun:__pyx_pymod_exec_*
+   Leaks when importing NumPy (ter)
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyObject_Malloc
+   fun:_PyObject_GC_Alloc
+   fun:_PyObject_GC_Malloc
+   fun:_PyObject_GC_NewVar
+   fun:tuple_alloc
+   fun:_PyTuple_FromArray
+   fun:_PyObject_MakeTpCall
+   fun:_PyObject_VectorcallTstate
+   fun:PyObject_Vectorcall
+   fun:call_function
+   fun:_PyEval_EvalFrameDefault
+   fun:_PyEval_EvalFrame
+   fun:function_code_fastcall
+   fun:_PyFunction_Vectorcall
+   Leaks when importing NumPy (quater)
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyObject_Malloc
+   fun:_PyObject_GC_Alloc
+   fun:_PyObject_GC_Malloc
+   fun:_PyObject_GC_NewVar
+   fun:tuple_alloc
+   fun:_PyTuple_FromArray
+   fun:_PyObject_MakeTpCall
+   fun:_PyObject_VectorcallTstate
+   fun:_PyObject_CallFunctionVa
+   fun:PyObject_CallFunction
+   fun:PyImport_Import
+   Leaks when importing NumPy (quinquies)
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyObject_Malloc
+   fun:_PyObject_GC_Alloc
+   fun:_PyObject_GC_Malloc
+   fun:_PyObject_GC_NewVar
+   fun:tuple_alloc
+   fun:PyTuple_New
+   fun:r_object
+   fun:r_object
+   fun:r_object
+   fun:r_object
+   Leaks when importing NumPy (sexies)
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyObject_Malloc
+   fun:_PyObject_GC_Alloc
+   fun:_PyObject_GC_Malloc
+   fun:_PyObject_GC_NewVar
+   fun:tuple_alloc
+   fun:PyTuple_New
+   fun:dictiter_iternextitem
+   fun:list_extend
+   fun:_PyList_Extend
+   fun:PySequence_List
+   Leak when importing scipy.fft
+   Memcheck:Leak
+   fun:_Znwm
+   fun:PyInit_pypocketfft
+   fun:_PyImport_LoadDynamicModuleWithSpec
+   fun:_imp_create_dynamic_impl*
+   fun:_imp_create_dynamic
+   fun:cfunction_vectorcall_FASTCALL
+   fun:PyVectorcall_Call
+   fun:_PyObject_Call
+   fun:PyObject_Call
+   fun:do_call_core
+   fun:_PyEval_EvalFrameDefault
+   fun:_PyEval_EvalFrame
+   fun:_PyEval_EvalCode
+   NumPy leaks when spawning a subprocess
+   Memcheck:Leak
+   fun:malloc
+   ...
+   fun:_buffer_get_info
+   fun:array_getbuffer
+   fun:PyObject_GetBuffer
+   fun:__Pyx__GetBufferAndValidate*
+   fun:__pyx_f_5numpy_6random_13bit_generator_12SeedSequence_mix_entropy
+   fun:__pyx_pw_5numpy_6random_13bit_generator_12SeedSequence_1__init__
+   fun:type_call
+   fun:__Pyx__PyObject_CallOneArg
+   fun:__pyx_pw_5numpy_6random_13bit_generator_12BitGenerator_1__init__
diff --git a/wrap/pybind11/tests/valgrind-python.supp b/wrap/pybind11/tests/valgrind-python.supp
new file mode 100644
index 0000000000..d77d5e5c5c
--- /dev/null
+++ b/wrap/pybind11/tests/valgrind-python.supp
@@ -0,0 +1,117 @@
+# Valgrind suppression file for CPython errors and leaks in pybind11 tests
+# Taken verbatim from https://github.com/python/cpython/blob/3.9/Misc/valgrind-python.supp#L266-L272
+   Uninitialised byte(s) false alarm, see bpo-35561
+   Memcheck:Param
+   epoll_ctl(event)
+   fun:epoll_ctl
+   fun:pyepoll_internal_ctl
+   Python leaks when spawning a subprocess
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyMem_RawMalloc
+   fun:PyThread_allocate_lock
+   fun:_PyEval_InitState
+   fun:PyInterpreterState_New
+   ...
+   fun:pyinit_core*
+   fun:Py_InitializeFromConfig
+   fun:pymain_init
+   fun:pymain_main
+   Python leaks when spawning a subprocess
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:_PyMem_DebugRawAlloc
+   fun:_PyMem_DebugRawMalloc
+   fun:PyMem_RawMalloc
+   fun:PyThread_allocate_lock
+   fun:_PyRuntimeState_Init_impl
+   fun:_PyRuntimeState_Init
+   fun:_PyRuntime_Initialize
+   fun:pymain_init
+   fun:pymain_main
+   fun:Py_BytesMain
+   Python leaks when spawning a subprocess
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyMem_RawMalloc
+   fun:PyThread_allocate_lock
+   fun:_PyImport_AcquireLock
+   fun:_imp_acquire_lock_impl*
+   fun:_imp_acquire_lock
+   fun:cfunction_vectorcall_NOARGS
+   fun:_PyObject_VectorcallTstate
+   fun:PyObject_Vectorcall
+   fun:call_function
+   fun:_PyEval_EvalFrameDefault
+   fun:_PyEval_EvalFrame
+   fun:function_code_fastcall
+   Python leaks when spawning a subprocess
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyMem_RawMalloc
+   fun:PyThread_allocate_lock
+   fun:newlockobject
+   ...
+   fun:cfunction_vectorcall_NOARGS
+   fun:_PyObject_VectorcallTstate
+   fun:PyObject_Vectorcall
+   fun:call_function
+   fun:_PyEval_EvalFrameDefault
+   fun:_PyEval_EvalFrame
+   fun:function_code_fastcall
+   fun:_PyFunction_Vectorcall
+   Python leaks when spawning a subprocess
+   Memcheck:Leak
+   fun:malloc
+   fun:_PyMem_RawMalloc
+   fun:PyMem_RawMalloc
+   fun:PyThread_allocate_lock
+   fun:rlock_new
+   fun:type_call
+   fun:_PyObject_Call
+   fun:PyObject_Call
+   fun:do_call_core
+   fun:_PyEval_EvalFrameDefault
+   fun:_PyEval_EvalFrame
+   fun:_PyEval_EvalCode
+   fun:_PyFunction_Vectorcall
+# Not really CPython-specific, see link
+   dlopen leak (https://stackoverflow.com/questions/1542457/memory-leak-reported-by-valgrind-in-dlopen)
+   Memcheck:Leak
+   fun:malloc
+   ...
+   fun:dl_open_worker
+   fun:_dl_catch_exception
+   fun:_dl_open
+   fun:dlopen_doit
+   fun:_dl_catch_exception
+   fun:_dl_catch_error
+   fun:_dlerror_run
+   fun:dlopen@@GLIBC_2.2.5
+   fun:_PyImport_FindSharedFuncptr
+   fun:_PyImport_LoadDynamicModuleWithSpec
diff --git a/wrap/pybind11/tools/FindEigen3.cmake b/wrap/pybind11/tools/FindEigen3.cmake
index 98ab43d9e6..83625d92e0 100644
--- a/wrap/pybind11/tools/FindEigen3.cmake
+++ b/wrap/pybind11/tools/FindEigen3.cmake
@@ -64,6 +64,9 @@ if(EIGEN3_INCLUDE_DIR)
+    set(KDE4_INCLUDE_DIR "")
+  endif()
diff --git a/wrap/pybind11/tools/FindPythonLibsNew.cmake b/wrap/pybind11/tools/FindPythonLibsNew.cmake
index c1c72c763c..3605aebcf3 100644
--- a/wrap/pybind11/tools/FindPythonLibsNew.cmake
+++ b/wrap/pybind11/tools/FindPythonLibsNew.cmake
@@ -57,6 +57,8 @@ endif()
   set(_pythonlibs_quiet QUIET)
+  set(_pythonlibs_quiet "")
@@ -115,7 +117,7 @@ print('.'.join(str(v) for v in sys.version_info));
+print(s.get_config_var('EXT_SUFFIX') or s.get_config_var('SO'));
 print(hasattr(sys, 'gettotalrefcount')+0);
 print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
diff --git a/wrap/pybind11/tools/check-style.sh b/wrap/pybind11/tools/check-style.sh
index f7af2a4169..6d832523a7 100755
--- a/wrap/pybind11/tools/check-style.sh
+++ b/wrap/pybind11/tools/check-style.sh
@@ -16,11 +16,11 @@ check_style_errors=0
-found="$(grep '\<\(if\|for\|while\|catch\)(\|){' $@ -rn --color=always)"
+found="$(grep '\<\(if\|for\|while\|catch\)(\|){' "$@" -rn --color=always)"
 if [ -n "$found" ]; then
     echo -e '\033[31;01mError: found the following coding style problems:\033[0m'
-    echo "$found" | sed -e 's/^/    /'
+    echo "${found//^/    /}"
 found="$(awk '
@@ -34,7 +34,7 @@ last && /^\s*{/ {
 { last = /(if|for|while|catch|switch)\s*\(.*\)\s*$/ ? $0 : "" }
-' $(find include -type f) $@)"
+' "$(find include -type f)" "$@")"
 if [ -n "$found" ]; then
     echo -e '\033[31;01mError: braces should occur on the same line as the if/while/.. statement. Found issues in the following files:\033[0m'
diff --git a/wrap/pybind11/tools/libsize.py b/wrap/pybind11/tools/libsize.py
index 50f88bdb3d..1551477e66 100644
--- a/wrap/pybind11/tools/libsize.py
+++ b/wrap/pybind11/tools/libsize.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
-from __future__ import print_function, division
+from __future__ import division, print_function
 import os
 import sys
@@ -19,7 +20,7 @@
 libsize = os.path.getsize(lib)
-print("------", os.path.basename(lib), "file size:", libsize, end='')
+print("------", os.path.basename(lib), "file size:", libsize, end="")
 if os.path.exists(save):
     with open(save) as sf:
@@ -34,5 +35,5 @@
-with open(save, 'w') as sf:
+with open(save, "w") as sf:
diff --git a/wrap/pybind11/tools/make_changelog.py b/wrap/pybind11/tools/make_changelog.py
new file mode 100755
index 0000000000..629c284d39
--- /dev/null
+++ b/wrap/pybind11/tools/make_changelog.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import re
+import ghapi.all
+from rich import print
+from rich.syntax import Syntax
+ENTRY = re.compile(
+    r"""
+    Suggested \s changelog \s entry:
+    .*
+    ```rst
+    \s*
+    (.*?)
+    \s*
+    ```
+    re.DOTALL | re.VERBOSE,
+api = ghapi.all.GhApi(owner="pybind", repo="pybind11")
+issues_pages = ghapi.page.paged(
+    api.issues.list_for_repo, labels="needs changelog", state="closed"
+issues = (issue for page in issues_pages for issue in page)
+missing = []
+for issue in issues:
+    changelog = ENTRY.findall(issue.body)
+    if changelog:
+        (msg,) = changelog
+        if not msg.startswith("* "):
+            msg = "* " + msg
+        if not msg.endswith("."):
+            msg += "."
+        msg += f"\n  `#{issue.number} <{issue.html_url}>`_"
+        print(Syntax(msg, "rst", theme="ansi_light", word_wrap=True))
+        print()
+    else:
+        missing.append(issue)
+if missing:
+    print()
+    print("[blue]" + "-" * 30)
+    print()
+    for issue in missing:
+        print(f"[red bold]Missing:[/red bold][red] {issue.title}")
+        print(f"[red]  {issue.html_url}\n")
+    print("[bold]Template:\n")
+    msg = "## Suggested changelog entry:\n\n```rst\n\n```"
+    print(Syntax(msg, "md", theme="ansi_light"))
diff --git a/wrap/pybind11/tools/pybind11Common.cmake b/wrap/pybind11/tools/pybind11Common.cmake
index 26a1e04892..df24781213 100644
--- a/wrap/pybind11/tools/pybind11Common.cmake
+++ b/wrap/pybind11/tools/pybind11Common.cmake
@@ -15,11 +15,12 @@ Adds the following targets::
 Adds the following functions::
     pybind11_strip(target) - strip target after building on linux/macOS
+    pybind11_find_import(module) - See if a module is installed.
 # CMake 3.10 has an include_guard command, but we can't use that yet
+# include_guard(global) (pre-CMake 3.10)
 if(TARGET pybind11::lto)
@@ -36,6 +37,12 @@ if(NOT is_config)
   set(optional_global GLOBAL)
+# If not run in Python mode, we still would like this to at least
+# include pybind11's include directory:
+    "${pybind11_INCLUDE_DIR}"
+    CACHE INTERNAL "Include directory for pybind11 (Python not requested)")
 # --------------------- Shared targets ----------------------------
 # Build an interface library target:
@@ -109,28 +116,32 @@ endif()
 add_library(pybind11::windows_extras IMPORTED INTERFACE ${optional_global})
-  # /MP enables multithreaded builds (relevant when there are many files), /bigobj is
-  # needed for bigger binding projects due to the limit to 64k addressable sections
+if(MSVC) # That's also clang-cl
+  # /bigobj is needed for bigger binding projects due to the limit to 64k
+  # addressable sections
     TARGET pybind11::windows_extras
-    set_property(
-      TARGET pybind11::windows_extras
-      APPEND
-  else()
-    # Only set these options for C++ files.  This is important so that, for
-    # instance, projects that include other types of source files like CUDA
-    # .cu files don't get these options propagated to nvcc since that would
-    # cause the build to fail.
-    set_property(
-      TARGET pybind11::windows_extras
-      APPEND
+  # /MP enables multithreaded builds (relevant when there are many files) for MSVC
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") # no Clang no Intel
+      set_property(
+        TARGET pybind11::windows_extras
+        APPEND
+    else()
+      # Only set these options for C++ files.  This is important so that, for
+      # instance, projects that include other types of source files like CUDA
+      # .cu files don't get these options propagated to nvcc since that would
+      # cause the build to fail.
+      set_property(
+        TARGET pybind11::windows_extras
+        APPEND
+                 $<$<NOT:$<CONFIG:Debug>>:$<$<COMPILE_LANGUAGE:CXX>:/MP>>)
+    endif()
@@ -196,6 +207,77 @@ else()
+# --------------------- pybind11_find_import -------------------------------
+if(NOT _pybind11_nopython)
+  # Check to see if modules are importable. Use REQUIRED to force an error if
+  # one of the modules is not found. <package_name>_FOUND will be set if the
+  # package was found (underscores replace dashes if present). QUIET will hide
+  # the found message, and VERSION will require a minimum version. A successful
+  # find will cache the result.
+  function(pybind11_find_import PYPI_NAME)
+    # CMake variables need underscores (PyPI doesn't care)
+    string(REPLACE "-" "_" NORM_PYPI_NAME "${PYPI_NAME}")
+    # Return if found previously
+      return()
+    endif()
+    set(options "REQUIRED;QUIET")
+    set(oneValueArgs "VERSION")
+    cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "" ${ARGN})
+      set(status_level FATAL_ERROR)
+    else()
+      set(status_level WARNING)
+    endif()
+    execute_process(
+      COMMAND
+        ${${_Python}_EXECUTABLE} -c
+        "from pkg_resources import get_distribution; print(get_distribution('${PYPI_NAME}').version)"
+    # If a result is present, this failed
+      set(${NORM_PYPI_NAME}_FOUND
+          CACHE INTERNAL "")
+      # Always warn or error
+      message(
+        ${status_level}
+        "Missing: ${PYPI_NAME} ${ARG_VERSION}\nTry: ${${_Python}_EXECUTABLE} -m pip install ${PYPI_NAME}"
+      )
+    else()
+        message(
+          ${status_level}
+          "Version incorrect: ${PYPI_NAME} ${PKG_VERSION} found, ${ARG_VERSION} required - try upgrading"
+        )
+      else()
+        set(${NORM_PYPI_NAME}_FOUND
+            YES
+            CACHE INTERNAL "")
+        set(${NORM_PYPI_NAME}_VERSION
+            ${PKG_VERSION}
+            CACHE INTERNAL "")
+      endif()
+      if(NOT ARG_QUIET)
+        message(STATUS "Found ${PYPI_NAME} ${PKG_VERSION}")
+      endif()
+    endif()
+      # We have successfully found a good version, cache to avoid calling again.
+    endif()
+  endfunction()
 # --------------------- LTO -------------------------------
@@ -221,23 +303,36 @@ function(_pybind11_return_if_cxx_and_linker_flags_work result cxxflags linkerfla
 function(_pybind11_generate_lto target prefer_thin_lto)
+  if(MINGW)
+    message(STATUS "${target} disabled (problems with undefined symbols for MinGW for now)")
+    return()
+  endif()
     set(cxx_append "")
     set(linker_append "")
       # Clang Gold plugin does not support -Os; append -O3 to MinSizeRel builds to override it
       set(linker_append ";$<$<CONFIG:MinSizeRel>:-O3>")
       set(cxx_append ";-fno-fat-lto-objects")
-    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND prefer_thin_lto)
+      set(NO_FLTO_ARCH TRUE)
+    else()
+      set(NO_FLTO_ARCH FALSE)
+    endif()
+       AND prefer_thin_lto
         HAS_FLTO_THIN "-flto=thin${cxx_append}" "-flto=thin${linker_append}"
         HAS_FLTO "-flto${cxx_append}" "-flto${linker_append}" PYBIND11_LTO_CXX_FLAGS
@@ -256,7 +351,9 @@ function(_pybind11_generate_lto target prefer_thin_lto)
   # Enable LTO flags if found, except for Debug builds
-    set(not_debug "$<NOT:$<CONFIG:Debug>>")
+    # CONFIG takes multiple values in CMake 3.19+, until then we have to use OR
+    set(is_debug "$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>")
+    set(not_debug "$<NOT:${is_debug}>")
     set(cxx_lang "$<COMPILE_LANGUAGE:CXX>")
       set(genex "${not_debug}")
diff --git a/wrap/pybind11/tools/pybind11Config.cmake.in b/wrap/pybind11/tools/pybind11Config.cmake.in
index 3f11172963..262020d14a 100644
--- a/wrap/pybind11/tools/pybind11Config.cmake.in
+++ b/wrap/pybind11/tools/pybind11Config.cmake.in
@@ -1,57 +1,80 @@
-PYBIND11 cmake module.
-This module sets the following variables in your project::
-  pybind11_FOUND - true if pybind11 and all required components found on the system
-  pybind11_VERSION - pybind11 version in format Major.Minor.Release
-  pybind11_VERSION_TYPE - pybind11 version type (dev, release)
-  pybind11_INCLUDE_DIRS - Directories where pybind11 and python headers are located.
-  pybind11_INCLUDE_DIR - Directory where pybind11 headers are located.
-  pybind11_DEFINITIONS - Definitions necessary to use pybind11, namely USING_pybind11.
-  pybind11_LIBRARIES - compile flags and python libraries (as needed) to link against.
-  pybind11_LIBRARY - empty.
+Exported variables
+This module sets the following variables in your project:
+  true if pybind11 and all required components found on the system
+  pybind11 version in format Major.Minor.Release
+  pybind11 version type (``dev*`` or empty for a release)
+  Directories where pybind11 and python headers are located.
+  Directory where pybind11 headers are located.
+  Definitions necessary to use pybind11, namely USING_pybind11.
+  Compile flags and python libraries (as needed) to link against.
+  Empty.
 Available components: None
-Exported targets::
+Exported targets
-If pybind11 is found, this module defines the following :prop_tgt:`IMPORTED`
-interface library targets::
+If pybind11 is found, this module defines the following ``IMPORTED``
+interface library targets:
-  pybind11::module - for extension modules
-  pybind11::embed - for embedding the Python interpreter
+  for extension modules.
+  for embedding the Python interpreter.
 Python headers, libraries (as needed by platform), and the C++ standard
 are attached to the target.
 Advanced targets are also supplied - these are primary for users building
-complex applications, and they are available in all modes::
-  pybind11::headers - Just the pybind11 headers and minimum compile requirements
-  pybind11::pybind11 - Python headers too
-  pybind11::python_link_helper - Just the "linking" part of pybind11:module, for CMake < 3.15
-  pybind11::python2_no_register - Quiets the warning/error when mixing C++14+ and Python 2, also included in pybind11::module
-  pybind11::thin_lto - An alternative to INTERPROCEDURAL_OPTIMIZATION
-  pybind11::lto - An alternative to INTERPROCEDURAL_OPTIMIZATION (also avoids thin LTO on clang)
-  pybind11::windows_extras - Adds bigobj and mp for MSVC
+complex applications, and they are available in all modes:
+  Just the pybind11 headers and minimum compile requirements.
+  Python headers too.
+  Just the "linking" part of ``pybind11:module``, for CMake < 3.15.
+  Quiets the warning/error when mixing C++14+ and Python 2, also included in ``pybind11::module``.
+  An alternative to ``INTERPROCEDURAL_OPTIMIZATION`` (also avoids thin LTO on clang).
+  Adds bigobj and mp for MSVC.
 There are two modes provided; classic, which is built on the old Python
 discovery packages in CMake, or the new FindPython mode, which uses FindPython
 from 3.12+ forward (3.15+ _highly_ recommended).
-New FindPython mode::
+New FindPython mode
 To activate this mode, either call ``find_package(Python COMPONENTS Interpreter Development)``
 before finding this package, or set the ``PYBIND11_FINDPYTHON`` variable to ON. In this mode,
-you can either use the basic targets, or use the FindPython tools::
+you can either use the basic targets, or use the FindPython tools:
+.. code-block:: cmake
   find_package(Python COMPONENTS Interpreter Development)
   find_package(pybind11 CONFIG)
@@ -64,16 +87,19 @@ you can either use the basic targets, or use the FindPython tools::
   target_link_libraries(MyModule2 pybind11::headers)
   set_target_properties(MyModule2 PROPERTIES
                                   INTERPROCEDURAL_OPTIMIZATION ON
-                                  CXX__VISIBILITY_PRESET ON
+                                  CXX_VISIBILITY_PRESET ON
                                   VISIBLITY_INLINES_HIDDEN ON)
 If you build targets yourself, you may be interested in stripping the output
 for reduced size; this is the one other feature that the helper function gives you.
-Classic mode::
+Classic mode
 Set PythonLibsNew variables to influence python detection and
-CMAKE_CXX_STANDARD to influence standard setting. ::
+CMAKE_CXX_STANDARD to influence standard setting.
+.. code-block:: cmake
   find_package(pybind11 CONFIG REQUIRED)
@@ -85,36 +111,98 @@ CMAKE_CXX_STANDARD to influence standard setting. ::
   add_executable(myexe main.cpp)
   target_link_libraries(myexe PUBLIC pybind11::embed)
-Suggested usage::
-find_package with version info is not recommended except for release versions. ::
-  find_package(pybind11 CONFIG)
-  find_package(pybind11 2.0 EXACT CONFIG REQUIRED)
+The following variables can be set to guide the search for this package:
+  CMake variable, set to directory containing this Config file.
+  CMake variable, set to root directory of this package.
+  Environment variable, set to bin directory of this package.
+  CMake variable, disables ``find_package(pybind11)`` when not ``REQUIRED``,
+  perhaps to force internal build.
+This module defines the following commands to assist with creating Python modules:
+.. code-block:: cmake
+  pybind11_add_module(<target>
+    <files>...
+    )
+Add a module and setup all helpers. You can select the type of the library; the
+default is ``MODULE``. There are several options:
-The following variables can be set to guide the search for this package::
+  Optimize for size, even if the ``CMAKE_BUILD_TYPE`` is not ``MinSizeRel``.
+  Use thin TLO instead of regular if there's a choice (pybind11's selection
+  is disabled if ``CMAKE_INTERPROCEDURAL_OPTIMIZATIONS`` is set).
+  Disable the SOABI component (``PYBIND11_NEWPYTHON`` mode only).
+  Disable all extras, exit immediately after making the module.
-  pybind11_DIR - CMake variable, set to directory containing this Config file
-  CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
-  PATH - environment variable, set to bin directory of this package
-  CMAKE_DISABLE_FIND_PACKAGE_pybind11 - CMake variable, disables
-    find_package(pybind11) when not REQUIRED, perhaps to force internal build
-Helper functions::
+.. code-block:: cmake
-  pybind11_add_module(...) - Add a library and setup all helpers
-  pybind11_strip(target) - Strip a target after building it (linux/macOS)
-  pybind11_extension(target) - Injects the Python extension name
+  pybind11_strip(<target>)
-See ``pybind11Tools.cmake`` or ``pybind11NewTools.cmake`` for details on
+Strip a target after building it (linux/macOS), called by ``pybind11_add_module``.
+.. code-block:: cmake
+    pybind11_extension(<target>)
+Sets the Python extension name correctly for Python on your platform, called by
+.. code-block:: cmake
+    pybind11_find_import(<module> [VERSION <number>] [REQUIRED] [QUIET])
+See if a module is installed. Use the registered name (the one on PyPI). You
+can specify a ``VERSION``, and you can specify ``REQUIRED`` or ``QUIET``. Only available if
+``NOPYTHON`` mode is not active.  Sets ``module_VERSION`` and ``module_FOUND``. Caches the
+result once a valid install is found.
+Suggested usage
+Using ``find_package`` with version info is not recommended except for release versions.
+.. code-block:: cmake
+  find_package(pybind11 CONFIG)
+  find_package(pybind11 2.0 EXACT CONFIG REQUIRED)
 # Location of pybind11/pybind11.h
+# This will be relative unless explicitly set as absolute
+set(pybind11_INCLUDE_DIR "@pybind11_INCLUDEDIR@")
 set(pybind11_LIBRARY "")
 set(pybind11_DEFINITIONS USING_pybind11)
@@ -140,6 +228,6 @@ include("${CMAKE_CURRENT_LIST_DIR}/pybind11Common.cmake")
 if(NOT pybind11_FIND_QUIETLY)
-      "Found pybind11: ${pybind11_INCLUDE_DIR} (found version \"${pybind11_VERSION}\" ${pybind11_VERSION_TYPE})"
+      "Found pybind11: ${pybind11_INCLUDE_DIR} (found version \"${pybind11_VERSION}${pybind11_VERSION_TYPE}\")"
diff --git a/wrap/pybind11/tools/pybind11NewTools.cmake b/wrap/pybind11/tools/pybind11NewTools.cmake
index 27eb4d9205..0b4e21ccef 100644
--- a/wrap/pybind11/tools/pybind11NewTools.cmake
+++ b/wrap/pybind11/tools/pybind11NewTools.cmake
@@ -5,6 +5,12 @@
 # All rights reserved. Use of this source code is governed by a
 # BSD-style license that can be found in the LICENSE file.
+  message(FATAL_ERROR "You cannot use the new FindPython module with CMake < 3.12")
   TARGET pybind11::headers
@@ -12,10 +18,8 @@ get_property(
   set(_pybind11_quiet QUIET)
-  message(FATAL_ERROR "You cannot use the new FindPython module with CMake < 3.12")
+  set(_pybind11_quiet "")
 if(NOT Python_FOUND
@@ -70,23 +74,58 @@ if(PYBIND11_MASTER_PROJECT)
-# Debug check - see https://stackoverflow.com/questions/646518/python-how-to-detect-debug-Interpreter
-  COMMAND "${${_Python}_EXECUTABLE}" "-c" "import sys; sys.exit(hasattr(sys, 'gettotalrefcount'))"
+# If a user finds Python, they may forget to include the Interpreter component
+# and the following two steps require it. It is highly recommended by CMake
+# when finding development libraries anyway, so we will require it.
+  message(
+      "${_Python} was found without the Interpreter component. Pybind11 requires this component.")
+  # Detect changes to the Python version/binary in subsequent CMake runs, and refresh config if needed
+      "${${_Python}_EXECUTABLE}"
+      CACHE INTERNAL "Python executable during the last CMake run")
+  # Debug check - see https://stackoverflow.com/questions/646518/python-how-to-detect-debug-Interpreter
+  execute_process(
+    COMMAND "${${_Python}_EXECUTABLE}" "-c"
+            "import sys; sys.exit(hasattr(sys, 'gettotalrefcount'))"
+      "${_PYTHON_IS_DEBUG}"
+      CACHE INTERNAL "Python debug status")
 # Get the suffix - SO is deprecated, should use EXT_SUFFIX, but this is
 # required for PyPy3 (as of 7.3.1)
-  COMMAND "${${_Python}_EXECUTABLE}" "-c"
-          "from distutils import sysconfig; print(sysconfig.get_config_var('SO'))"
+  execute_process(
+      "${${_Python}_EXECUTABLE}" "-c"
+      "import sys, importlib; s = importlib.import_module('distutils.sysconfig' if sys.version_info < (3, 10) else 'sysconfig'); print(s.get_config_var('EXT_SUFFIX') or s.get_config_var('SO'))"
+    message(
+      FATAL_ERROR "pybind11 could not query the module file extension, likely the 'distutils'"
+                  "package is not installed. Full error message:\n${_PYTHON_MODULE_EXTENSION_ERR}")
+  endif()
-# This needs to be available for the pybind11_extension function
+  # This needs to be available for the pybind11_extension function
 # Python debug libraries expose slightly different objects before 3.8
 # https://docs.python.org/3.6/c-api/intro.html#debugging-builds
@@ -101,10 +140,23 @@ endif()
 # Check on every access - since Python2 and Python3 could have been used - do nothing in that case.
+  # Only add Python for build - must be added during the import for config
+  # since it has to be re-discovered.
+  #
+  # This needs to be a target to be included after the local pybind11
+  # directory, just in case there there is an installed pybind11 sitting
+  # next to Python's includes. It also ensures Python is a SYSTEM library.
+  add_library(pybind11::python_headers INTERFACE IMPORTED)
+  set_property(
+                                             "$<BUILD_INTERFACE:${${_Python}_INCLUDE_DIRS}>")
     TARGET pybind11::pybind11
+    PROPERTY INTERFACE_LINK_LIBRARIES pybind11::python_headers)
+  set(pybind11_INCLUDE_DIRS
+      "${pybind11_INCLUDE_DIR}" "${${_Python}_INCLUDE_DIRS}"
+      CACHE INTERNAL "Directories where pybind11 and possibly Python headers are located")
@@ -115,11 +167,11 @@ if(DEFINED ${_Python}_VERSION AND ${_Python}_VERSION VERSION_LESS 3)
 # In CMake 3.18+, you can find these separately, so include an if
-if(TARGET ${_Python}::${_Python})
+if(TARGET ${_Python}::Python)
     TARGET pybind11::embed
 # CMake 3.15+ has this
@@ -141,27 +193,27 @@ function(pybind11_add_module target_name)
   cmake_parse_arguments(PARSE_ARGV 1 ARG
-    set(type STATIC)
-    set(type SHARED)
+    set(lib_type STATIC)
+  elseif(ARG_SHARED)
+    set(lib_type SHARED)
-    set(type MODULE)
+    set(lib_type MODULE)
   if("${_Python}" STREQUAL "Python")
-    python_add_library(${target_name} ${type} ${ARG_UNPARSED_ARGUMENTS})
+    python_add_library(${target_name} ${lib_type} ${ARG_UNPARSED_ARGUMENTS})
   elseif("${_Python}" STREQUAL "Python3")
-    python3_add_library(${target_name} ${type} ${ARG_UNPARSED_ARGUMENTS})
+    python3_add_library(${target_name} ${lib_type} ${ARG_UNPARSED_ARGUMENTS})
   elseif("${_Python}" STREQUAL "Python2")
-    python2_add_library(${target_name} ${type} ${ARG_UNPARSED_ARGUMENTS})
+    python2_add_library(${target_name} ${lib_type} ${ARG_UNPARSED_ARGUMENTS})
     message(FATAL_ERROR "Cannot detect FindPython version: ${_Python}")
   target_link_libraries(${target_name} PRIVATE pybind11::headers)
-  if(type STREQUAL "MODULE")
+  if(lib_type STREQUAL "MODULE")
     target_link_libraries(${target_name} PRIVATE pybind11::module)
     target_link_libraries(${target_name} PRIVATE pybind11::embed)
@@ -175,12 +227,21 @@ function(pybind11_add_module target_name)
     target_link_libraries(${target_name} PRIVATE pybind11::python2_no_register)
-  set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden"
-                                                  CUDA_VISIBILITY_PRESET "hidden")
+  # -fvisibility=hidden is required to allow multiple modules compiled against
+  # different pybind versions to work properly, and for some features (e.g.
+  # py::module_local).  We force it on everything inside the `pybind11`
+  # namespace; also turning it on for a pybind module compilation here avoids
+  # potential warnings or issues from having mixed hidden/non-hidden types.
+    set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden")
+  endif()
+    set_target_properties(${target_name} PROPERTIES CUDA_VISIBILITY_PRESET "hidden")
+  endif()
   # If we don't pass a WITH_SOABI or WITHOUT_SOABI, use our own default handling of extensions
-                                                               ARG_UNPARSED_ARGUMENTS))
diff --git a/wrap/pybind11/tools/pybind11Tools.cmake b/wrap/pybind11/tools/pybind11Tools.cmake
index a0a3b60eb1..c255e5cfd8 100644
--- a/wrap/pybind11/tools/pybind11Tools.cmake
+++ b/wrap/pybind11/tools/pybind11Tools.cmake
@@ -5,11 +5,18 @@
 # All rights reserved. Use of this source code is governed by a
 # BSD-style license that can be found in the LICENSE file.
+# include_guard(global) (pre-CMake 3.10)
+if(TARGET pybind11::python_headers)
+  return()
 # Built-in in CMake 3.5+
   set(_pybind11_quiet QUIET)
+  set(_pybind11_quiet "")
 # If this is the first run, PYTHON_VERSION can stand in for PYBIND11_PYTHON_VERSION
       CACHE STRING "Python version to use for compiling modules")
-  # If this is set as a normal variable, promote it, otherwise, make an empty cache variable.
+  # If this is set as a normal variable, promote it
       CACHE STRING "Python version to use for compiling modules")
+  # Make an empty cache variable.
+      ""
+      CACHE STRING "Python version to use for compiling modules")
 # A user can set versions manually too
-    "3.9;3.8;3.7;3.6;3.5;3.4"
+    "3.11;3.10;3.9;3.8;3.7;3.6;3.5;3.4"
 find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} MODULE REQUIRED ${_pybind11_quiet})
+# Makes a normal variable a cached variable
+  set(_tmp_ptc "${${NAME}}")
+  # CMake 3.21 complains if a cached variable is shadowed by a normal one
+  unset(${NAME})
+  set(${NAME}
+      "${_tmp_ptc}"
 # Cache variables so pybind11_add_module can be used in parent projects
-                [=[import sys; print(".".join(map(str, sys.pypy_version_info[:3])))]=]
+                [=[import sys; sys.stdout.write(".".join(map(str, sys.pypy_version_info[:3])))]=]
         OUTPUT_VARIABLE pypy_version)
@@ -81,11 +87,23 @@ if(PYBIND11_MASTER_PROJECT)
-# Only add Python for build - must be added during the import for config since it has to be re-discovered.
+# Only add Python for build - must be added during the import for config since
+# it has to be re-discovered.
+# This needs to be an target to it is included after the local pybind11
+# directory, just in case there are multiple versions of pybind11, we want the
+# one we expect.
+add_library(pybind11::python_headers INTERFACE IMPORTED)
+set_property(TARGET pybind11::python_headers PROPERTY INTERFACE_INCLUDE_DIRECTORIES
+                                                      "$<BUILD_INTERFACE:${PYTHON_INCLUDE_DIRS}>")
   TARGET pybind11::pybind11
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::python_headers)
+    "${pybind11_INCLUDE_DIR}" "${PYTHON_INCLUDE_DIRS}"
+    CACHE INTERNAL "Directories where pybind11 and possibly Python headers are located")
 # Python debug libraries expose slightly different objects before 3.8
 # https://docs.python.org/3.6/c-api/intro.html#debugging-builds
@@ -162,8 +180,13 @@ function(pybind11_add_module target_name)
   # py::module_local).  We force it on everything inside the `pybind11`
   # namespace; also turning it on for a pybind module compilation here avoids
   # potential warnings or issues from having mixed hidden/non-hidden types.
-  set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden"
-                                                  CUDA_VISIBILITY_PRESET "hidden")
+    set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden")
+  endif()
+    set_target_properties(${target_name} PROPERTIES CUDA_VISIBILITY_PRESET "hidden")
+  endif()
@@ -189,3 +212,8 @@ function(pybind11_add_module target_name)
     target_link_libraries(${target_name} PRIVATE pybind11::opt_size)
+# Provide general way to call common Python commands in "common" file.
diff --git a/wrap/pybind11/tools/pyproject.toml b/wrap/pybind11/tools/pyproject.toml
index 9787c3bdf0..8fe2f47af9 100644
--- a/wrap/pybind11/tools/pyproject.toml
+++ b/wrap/pybind11/tools/pyproject.toml
@@ -1,3 +1,3 @@
-requires = ["setuptools", "wheel"]
+requires = ["setuptools>=42", "wheel"]
 build-backend = "setuptools.build_meta"
diff --git a/wrap/pybind11/tools/setup_global.py.in b/wrap/pybind11/tools/setup_global.py.in
index 3325cd0ead..8b7e538714 100644
--- a/wrap/pybind11/tools/setup_global.py.in
+++ b/wrap/pybind11/tools/setup_global.py.in
@@ -33,21 +33,33 @@ class InstallHeadersNested(install_headers):
 main_headers = glob.glob("pybind11/include/pybind11/*.h")
 detail_headers = glob.glob("pybind11/include/pybind11/detail/*.h")
+stl_headers = glob.glob("pybind11/include/pybind11/stl/*.h")
 cmake_files = glob.glob("pybind11/share/cmake/pybind11/*.cmake")
-headers = main_headers + detail_headers
+headers = main_headers + detail_headers + stl_headers
 cmdclass = {"install_headers": InstallHeadersNested}
+# This will _not_ affect installing from wheels,
+# only building wheels or installing from SDist.
+# Primarily intended on Windows, where this is sometimes
+# customized (for example, conda-forge uses Library/)
+base = os.environ.get("PYBIND11_GLOBAL_PREFIX", "")
+# Must have a separator
+if base and not base.endswith("/"):
+    base += "/"
-        ("share/cmake/pybind11", cmake_files),
-        ("include/pybind11", main_headers),
-        ("include/pybind11/detail", detail_headers),
+        (base + "share/cmake/pybind11", cmake_files),
+        (base + "include/pybind11", main_headers),
+        (base + "include/pybind11/detail", detail_headers),
+        (base + "include/pybind11/stl", stl_headers),
diff --git a/wrap/pybind11/tools/setup_main.py.in b/wrap/pybind11/tools/setup_main.py.in
index c859c1f755..533a75ae71 100644
--- a/wrap/pybind11/tools/setup_main.py.in
+++ b/wrap/pybind11/tools/setup_main.py.in
@@ -16,11 +16,14 @@ setup(
+        "pybind11.include.pybind11.stl",
+        "pybind11": ["py.typed", "*.pyi"],
         "pybind11.include.pybind11": ["*.h"],
         "pybind11.include.pybind11.detail": ["*.h"],
+        "pybind11.include.pybind11.stl": ["*.h"],
         "pybind11.share.cmake.pybind11": ["*.cmake"],
@@ -29,6 +32,9 @@ setup(
         "console_scripts": [
              "pybind11-config = pybind11.__main__:main",
+        ],
+        "pipx.run": [
+             "pybind11 = pybind11.__main__:main",