diff --git a/.travis.yml b/.travis.yml index 2563b54dc6741..d79723c969458 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,7 +38,7 @@ matrix: # This environment tests the newest supported Anaconda release (4.4.0) # It also runs tests requiring Pandas. - env: DISTRIB="conda" PYTHON_VERSION="3.6.1" INSTALL_MKL="true" - NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" PANDAS_VERSION="0.20.1" + NUMPY_VERSION="1.13" SCIPY_VERSION="0.19.0" PANDAS_VERSION="0.20.2" CYTHON_VERSION="0.25.2" COVERAGE=true # This environment use pytest to run the tests. It uses the newest # supported Anaconda release (4.4.0). It also runs tests requiring Pandas. @@ -49,7 +49,7 @@ matrix: # flake8 linting on diff wrt common ancestor with upstream/master - env: RUN_FLAKE8="true" SKIP_TESTS="true" DISTRIB="conda" PYTHON_VERSION="3.5" INSTALL_MKL="true" - NUMPY_VERSION="1.12.1" SCIPY_VERSION="0.19.0" CYTHON_VERSION="0.23.5" + NUMPY_VERSION="1.13" SCIPY_VERSION="0.19.0" CYTHON_VERSION="0.23.5" # This environment tests scikit-learn against numpy and scipy master # installed from their CI wheels in a virtualenv with the Python # interpreter provided by travis. diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 63c8da5aafeac..b3f785254c2ae 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -109,7 +109,7 @@ conda update --yes --quiet conda conda create -n $CONDA_ENV_NAME --yes --quiet python numpy scipy \ cython nose coverage matplotlib sphinx=1.6.2 pillow source activate testenv -pip install numpydoc +pip install sphinx-gallery numpydoc # Build and install scikit-learn in dev mode python setup.py develop diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 8cd774d649338..1b0832b19ab9c 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -39,22 +39,30 @@ if [[ "$DISTRIB" == "conda" ]]; then # Configure the conda environment and put it in the path using the # provided versions + if [[ "$USE_PYTEST" == "true" ]]; then + TEST_RUNNER_PACKAGE=pytest + else + TEST_RUNNER_PACKAGE=nose + fi + if [[ "$INSTALL_MKL" == "true" ]]; then - conda create -n testenv --yes python=$PYTHON_VERSION pip nose pytest \ - numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ + conda create -n testenv --yes python=$PYTHON_VERSION pip \ + $TEST_RUNNER_PACKAGE numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ mkl cython=$CYTHON_VERSION \ ${PANDAS_VERSION+pandas=$PANDAS_VERSION} else - conda create -n testenv --yes python=$PYTHON_VERSION pip nose pytest \ - numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ + conda create -n testenv --yes python=$PYTHON_VERSION pip \ + $TEST_RUNNER_PACKAGE numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ nomkl cython=$CYTHON_VERSION \ ${PANDAS_VERSION+pandas=$PANDAS_VERSION} fi source activate testenv - # Install nose-timer via pip - pip install nose-timer + if [[ $USE_PYTEST != "true" ]]; then + # Install nose-timer via pip + pip install nose-timer + fi elif [[ "$DISTRIB" == "ubuntu" ]]; then # At the time of writing numpy 1.9.1 is included in the travis diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index cdcfbe01b3b8b..f7d3ab2a32e0e 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -43,10 +43,13 @@ run_tests() { fi $TEST_CMD sklearn - # Test doc (only with nose until we switch completely to pytest) - if [[ "$USE_PYTEST" != "true" ]]; then - # Going back to git checkout folder needed for make test-doc - cd $OLDPWD + # Going back to git checkout folder 
needed to test documentation + cd $OLDPWD + + if [[ "$USE_PYTEST" == "true" ]]; then + pytest $(find doc -name '*.rst' | sort) + else + # Makefile is using nose make test-doc fi } diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/doc/README.md b/doc/README.md index 141db3d7a8da5..82240fb701aa3 100644 --- a/doc/README.md +++ b/doc/README.md @@ -1,8 +1,13 @@ # Documentation for scikit-learn This section contains the full manual and web page as displayed in -http://scikit-learn.org. To generate the full web page, including -the example gallery (this might take a while): +http://scikit-learn.org. +Building the website requires the sphinx and sphinx-gallery packages: + + pip install sphinx sphinx-gallery + +To generate the full web page, including the example gallery (this might take a +while): make html @@ -16,7 +21,6 @@ To build the PDF manual, run make latexpdf - The website is hosted at github and can be updated manually (for releases) by pushing to the https://github.com/scikit-learn/scikit-learn.github.io repository. diff --git a/doc/datasets/conftest.py b/doc/datasets/conftest.py new file mode 100644 index 0000000000000..0ccc0bced9ee7 --- /dev/null +++ b/doc/datasets/conftest.py @@ -0,0 +1,75 @@ +from os.path import exists +from os.path import join + +import numpy as np + +from sklearn.utils.testing import SkipTest +from sklearn.utils.testing import check_skip_network +from sklearn.datasets import get_data_home +from sklearn.utils.testing import install_mldata_mock +from sklearn.utils.testing import uninstall_mldata_mock + + +def setup_labeled_faces(): + data_home = get_data_home() + if not exists(join(data_home, 'lfw_home')): + raise SkipTest("Skipping dataset loading doctests") + + +def setup_mldata(): + # setup mock urllib2 module to avoid downloading from mldata.org + install_mldata_mock({ + 'mnist-original': { + 'data': np.empty((70000, 784)), + 'label': np.repeat(np.arange(10, dtype='d'), 7000), + }, + 'iris': { + 'data': np.empty((150, 4)), + }, + 'datasets-uci-iris': { + 'double0': np.empty((150, 4)), + 'class': np.empty((150,)), + }, + }) + + +def teardown_mldata(): + uninstall_mldata_mock() + + +def setup_rcv1(): + check_skip_network() + # skip the test in rcv1.rst if the dataset is not already loaded + rcv1_dir = join(get_data_home(), "RCV1") + if not exists(rcv1_dir): + raise SkipTest("Download RCV1 dataset to run this test.") + + +def setup_twenty_newsgroups(): + data_home = get_data_home() + if not exists(join(data_home, '20news_home')): + raise SkipTest("Skipping dataset loading doctests") + + +def setup_working_with_text_data(): + check_skip_network() + + +def pytest_runtest_setup(item): + fname = item.fspath.strpath + if fname.endswith('datasets/labeled_faces.rst'): + setup_labeled_faces() + elif fname.endswith('datasets/mldata.rst'): + setup_mldata() + elif fname.endswith('datasets/rcv1.rst'): + setup_rcv1() + elif fname.endswith('datasets/twenty_newsgroups.rst'): + setup_twenty_newsgroups() + elif fname.endswith('datasets/working_with_text_data.rst'): + setup_working_with_text_data() + + +def pytest_runtest_teardown(item): + fname = item.fspath.strpath + if fname.endswith('datasets/mldata.rst'): + teardown_mldata() diff --git a/doc/datasets/mldata.rst b/doc/datasets/mldata.rst index 5083317cffc53..b94dfd7620a24 100644 --- a/doc/datasets/mldata.rst +++ b/doc/datasets/mldata.rst @@ -3,6 +3,11 @@ >>> import numpy as np >>> import os + >>> import tempfile + >>> # Create a temporary folder for the data fetcher + >>> 
custom_data_home = tempfile.mkdtemp() + >>> os.makedirs(os.path.join(custom_data_home, 'mldata')) + .. _mldata: @@ -70,3 +75,8 @@ defaults to individual datasets: ... data_home=custom_data_home) >>> iris3 = fetch_mldata('datasets-UCI iris', target_name='class', ... data_name='double0', data_home=custom_data_home) + + +.. + >>> import shutil + >>> shutil.rmtree(custom_data_home) diff --git a/doc/datasets/mldata_fixture.py b/doc/datasets/mldata_fixture.py index 37d9f9af05dc3..0ee5cccaa0f5e 100644 --- a/doc/datasets/mldata_fixture.py +++ b/doc/datasets/mldata_fixture.py @@ -3,26 +3,12 @@ Mock urllib2 access to mldata.org and create a temporary data folder. """ -from os import makedirs -from os.path import join import numpy as np -import tempfile -import shutil -from sklearn import datasets from sklearn.utils.testing import install_mldata_mock from sklearn.utils.testing import uninstall_mldata_mock -def globs(globs): - # Create a temporary folder for the data fetcher - global custom_data_home - custom_data_home = tempfile.mkdtemp() - makedirs(join(custom_data_home, 'mldata')) - globs['custom_data_home'] = custom_data_home - return globs - - def setup_module(): # setup mock urllib2 module to avoid downloading from mldata.org install_mldata_mock({ @@ -42,4 +28,3 @@ def setup_module(): def teardown_module(): uninstall_mldata_mock() - shutil.rmtree(custom_data_home) diff --git a/doc/index.rst b/doc/index.rst index e835de46a660e..ecea32e3229b9 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -207,13 +207,13 @@
  [Hunk body rendered from HTML: the front-page news list in doc/index.rst. Each removed/added line pair differs only in HTML markup that was lost when the page was extracted to text, so the pairs read identically here. Entries touched: "On-going development: What's new (Changelog)"; "September 2016. scikit-learn 0.18.0 is available for download (Changelog)."; "November 2015. scikit-learn 0.17.0 is available for download (Changelog)."; "March 2015. scikit-learn 0.16.0 is available for download (Changelog)."; "July 2014. scikit-learn 0.15.0 is available for download (Changelog)."; the "July 14-20th, 2014: international sprint" note, kept as context in a second hunk (@@ -227,7 +227,7 @@, ending "Inria, and tinyclues."); and "August 2013. scikit-learn 0.14 is available for download (Changelog)."]
  • diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index b47726979351f..c68bb7ef275b0 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -273,7 +273,7 @@ validation strategies. .. _iid_cv: Cross-validation iterators for i.i.d. data -========================================== +------------------------------------------ Assuming that some data is Independent and Identically Distributed (i.i.d.) is making the assumption that all samples stem from the same generative process @@ -294,7 +294,7 @@ devices) it safer to use :ref:`group-wise cross-validation `. K-fold ------- +^^^^^^ :class:`KFold` divides all the samples in :math:`k` groups of samples, called folds (if :math:`k = n`, this is equivalent to the *Leave One @@ -323,7 +323,7 @@ Thus, one can create the training/test sets using numpy indexing:: Repeated K-Fold ---------------- +^^^^^^^^^^^^^^^ :class:`RepeatedKFold` repeats K-Fold n times. It can be used when one requires to run :class:`KFold` n times, producing different splits in @@ -350,7 +350,7 @@ with different randomization in each repetition. Leave One Out (LOO) -------------------- +^^^^^^^^^^^^^^^^^^^ :class:`LeaveOneOut` (or LOO) is a simple cross-validation. Each learning set is created by taking all the samples except one, the test set being @@ -408,7 +408,7 @@ fold cross validation should be preferred to LOO. Leave P Out (LPO) ------------------ +^^^^^^^^^^^^^^^^^ :class:`LeavePOut` is very similar to :class:`LeaveOneOut` as it creates all the possible training/test sets by removing :math:`p` samples from the complete @@ -435,7 +435,7 @@ Example of Leave-2-Out on a dataset with 4 samples:: .. _ShuffleSplit: Random permutations cross-validation a.k.a. Shuffle & Split ------------------------------------------------------------ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`ShuffleSplit` @@ -465,7 +465,7 @@ validation that allows a finer control on the number of iterations and the proportion of samples on each side of the train / test split. Cross-validation iterators with stratification based on class labels. -===================================================================== +--------------------------------------------------------------------- Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance there could be several times more negative @@ -475,7 +475,7 @@ stratified sampling as implemented in :class:`StratifiedKFold` and approximately preserved in each train and validation fold. Stratified k-fold ------------------ +^^^^^^^^^^^^^^^^^ :class:`StratifiedKFold` is a variation of *k-fold* which returns *stratified* folds: each set contains approximately the same percentage of samples of each @@ -500,7 +500,7 @@ with different randomization in each repetition. Stratified Shuffle Split ------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^ :class:`StratifiedShuffleSplit` is a variation of *ShuffleSplit*, which returns stratified splits, *i.e* which creates splits by preserving the same @@ -509,7 +509,7 @@ percentage for each target class as in the complete set. .. _group_cv: Cross-validation iterators for grouped data. -============================================ +-------------------------------------------- The i.i.d. assumption is broken if the underlying generative process yield groups of dependent samples. @@ -530,7 +530,7 @@ parameter. 
Group k-fold ------------- +^^^^^^^^^^^^ :class:`GroupKFold` is a variation of k-fold which ensures that the same group is not represented in both testing and training sets. For example if the data is @@ -560,7 +560,7 @@ size due to the imbalance in the data. Leave One Group Out -------------------- +^^^^^^^^^^^^^^^^^^^ :class:`LeaveOneGroupOut` is a cross-validation scheme which holds out the samples according to a third-party provided array of integer groups. This @@ -591,7 +591,7 @@ groups could be the year of collection of the samples and thus allow for cross-validation against time-based splits. Leave P Groups Out ------------------- +^^^^^^^^^^^^^^^^^^ :class:`LeavePGroupsOut` is similar as :class:`LeaveOneGroupOut`, but removes samples related to :math:`P` groups for each training/test set. @@ -611,7 +611,7 @@ Example of Leave-2-Group Out:: [0 1] [2 3 4 5] Group Shuffle Split -------------------- +^^^^^^^^^^^^^^^^^^^ The :class:`GroupShuffleSplit` iterator behaves as a combination of :class:`ShuffleSplit` and :class:`LeavePGroupsOut`, and generates a @@ -643,7 +643,7 @@ generated by :class:`LeavePGroupsOut`. Predefined Fold-Splits / Validation-Sets -======================================== +---------------------------------------- For some datasets, a pre-defined split of the data into training- and validation fold or into several cross-validation folds already @@ -656,7 +656,7 @@ samples that are part of the validation set, and to -1 for all other samples. .. _timeseries_cv: Cross validation of time series data -==================================== +------------------------------------ Time series data is characterised by the correlation between observations that are near in time (*autocorrelation*). However, classical @@ -671,7 +671,7 @@ solution is provided by :class:`TimeSeriesSplit`. Time Series Split ------------------ +^^^^^^^^^^^^^^^^^ :class:`TimeSeriesSplit` is a variation of *k-fold* which returns first :math:`k` folds as train set and the :math:`(k+1)` th diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 386865d3d0a8a..62d566fe150ba 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -212,13 +212,12 @@ Then ``dual_coef_`` looks like this: Scores and probabilities ------------------------ -The :class:`SVC` method ``decision_function`` gives per-class scores -for each sample (or a single score per sample in the binary case). -When the constructor option ``probability`` is set to ``True``, -class membership probability estimates -(from the methods ``predict_proba`` and ``predict_log_proba``) are enabled. -In the binary case, the probabilities are calibrated using Platt scaling: -logistic regression on the SVM's scores, +The ``decision_function`` method of :class:`SVC` and :class:`NuSVC` gives +per-class scores for each sample (or a single score per sample in the binary +case). When the constructor option ``probability`` is set to ``True``, +class membership probability estimates (from the methods ``predict_proba`` and +``predict_log_proba``) are enabled. In the binary case, the probabilities are +calibrated using Platt scaling: logistic regression on the SVM's scores, fit by an additional cross-validation on the training data. In the multiclass case, this is extended as per Wu et al. (2004). @@ -245,7 +244,7 @@ and use ``decision_function`` instead of ``predict_proba``. * Platt `"Probabilistic outputs for SVMs and comparisons to regularized likelihood methods" - `. + `_. 
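[Editorial aside, not part of the patch: the rewritten "Scores and probabilities" passage of doc/modules/svm.rst above is easy to sanity-check with a short, self-contained sketch. It uses only the public scikit-learn API named in that passage (SVC, decision_function, probability, predict_proba, random_state); the toy dataset is made up for illustration.]

# Minimal sketch of the behaviour described above (illustrative data only).
import numpy as np
from sklearn.svm import SVC

# Two well-separated blobs, 20 samples per class.
rng = np.random.RandomState(0)
X = np.r_[rng.randn(20, 2) - [2, 2], rng.randn(20, 2) + [2, 2]]
y = np.array([0] * 20 + [1] * 20)

# decision_function is always available: one signed score per sample
# in the binary case.
clf = SVC(kernel='linear').fit(X, y)
print(clf.decision_function(X[:3]))

# probability=True enables predict_proba / predict_log_proba; in the binary
# case the scores are calibrated with Platt scaling, fit by an internal
# cross-validation that shuffles the data, hence random_state matters here.
clf_proba = SVC(kernel='linear', probability=True, random_state=0).fit(X, y)
print(clf_proba.predict_proba(X[:3]))

[As the "Randomness of the underlying implementations" tip added later in this same file notes, random_state affects SVC only in this probability=True configuration.]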
Unbalanced problems -------------------- @@ -399,7 +398,7 @@ Tips on Practical Use function can be configured to be almost the same as the :class:`LinearSVC` model. - * **Kernel cache size**: For :class:`SVC`, :class:`SVR`, :class:`nuSVC` and + * **Kernel cache size**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC` and :class:`NuSVR`, the size of the kernel cache has a strong impact on run times for larger problems. If you have enough RAM available, it is recommended to set ``cache_size`` to a higher value than the default of @@ -423,10 +422,24 @@ Tips on Practical Use positive and few negative), set ``class_weight='balanced'`` and/or try different penalty parameters ``C``. - * The underlying :class:`LinearSVC` implementation uses a random - number generator to select features when fitting the model. It is - thus not uncommon, to have slightly different results for the same - input data. If that happens, try with a smaller tol parameter. + * **Randomness of the underlying implementations**: The underlying + implementations of :class:`SVC` and :class:`NuSVC` use a random number + generator only to shuffle the data for probability estimation (when + ``probability`` is set to ``True``). This randomness can be controlled + with the ``random_state`` parameter. If ``probability`` is set to ``False`` + these estimators are not random and ``random_state`` has no effect on the + results. The underlying :class:`OneClassSVM` implementation is similar to + the ones of :class:`SVC` and :class:`NuSVC`. As no probability estimation + is provided for :class:`OneClassSVM`, it is not random. + + The underlying :class:`LinearSVC` implementation uses a random number + generator to select features when fitting the model with a dual coordinate + descent (i.e when ``dual`` is set to ``True``). It is thus not uncommon, + to have slightly different results for the same input data. If that + happens, try with a smaller tol parameter. This randomness can also be + controlled with the ``random_state`` parameter. When ``dual`` is + set to ``False`` the underlying implementation of :class:`LinearSVC` is + not random and ``random_state`` has no effect on the results. * Using L1 penalization as provided by ``LinearSVC(loss='l2', penalty='l1', dual=False)`` yields a sparse solution, i.e. 
only a subset of feature diff --git a/doc/sphinxext/sphinx_gallery/__init__.py b/doc/sphinxext/sphinx_gallery/__init__.py deleted file mode 100644 index e113f97d2a2c7..0000000000000 --- a/doc/sphinxext/sphinx_gallery/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -""" -Sphinx Gallery -============== - -""" -import os -__version__ = '0.1.11' - - -def glr_path_static(): - """Returns path to packaged static files""" - return os.path.abspath(os.path.join(os.path.dirname(__file__), '_static')) diff --git a/doc/sphinxext/sphinx_gallery/_static/broken_example.png b/doc/sphinxext/sphinx_gallery/_static/broken_example.png deleted file mode 100644 index 4fea24e7df478..0000000000000 Binary files a/doc/sphinxext/sphinx_gallery/_static/broken_example.png and /dev/null differ diff --git a/doc/sphinxext/sphinx_gallery/_static/gallery.css b/doc/sphinxext/sphinx_gallery/_static/gallery.css deleted file mode 100644 index 37047a9b91175..0000000000000 --- a/doc/sphinxext/sphinx_gallery/_static/gallery.css +++ /dev/null @@ -1,192 +0,0 @@ -/* -Sphinx-Gallery has compatible CSS to fix default sphinx themes -Tested for Sphinx 1.3.1 for all themes: default, alabaster, sphinxdoc, -scrolls, agogo, traditional, nature, haiku, pyramid -Tested for Read the Docs theme 0.1.7 */ -.sphx-glr-thumbcontainer { - background: #fff; - border: solid #fff 1px; - -moz-border-radius: 5px; - -webkit-border-radius: 5px; - border-radius: 5px; - box-shadow: none; - float: left; - margin: 5px; - min-height: 230px; - padding-top: 5px; - position: relative; -} -.sphx-glr-thumbcontainer:hover { - border: solid #b4ddfc 1px; - box-shadow: 0 0 15px rgba(142, 176, 202, 0.5); -} -.sphx-glr-thumbcontainer a.internal { - bottom: 0; - display: block; - left: 0; - padding: 150px 10px 0; - position: absolute; - right: 0; - top: 0; -} -/* Next one is to avoid Sphinx traditional theme to cover all the -thumbnail with its default link Background color */ -.sphx-glr-thumbcontainer a.internal:hover { - background-color: transparent; -} - -.sphx-glr-thumbcontainer p { - margin: 0 0 .1em 0; -} -.sphx-glr-thumbcontainer .figure { - margin: 10px; - width: 160px; -} -.sphx-glr-thumbcontainer img { - display: inline; - max-height: 160px; - width: 160px; -} -.sphx-glr-thumbcontainer[tooltip]:hover:after { - background: rgba(0, 0, 0, 0.8); - -webkit-border-radius: 5px; - -moz-border-radius: 5px; - border-radius: 5px; - color: #fff; - content: attr(tooltip); - left: 95%; - padding: 5px 15px; - position: absolute; - z-index: 98; - width: 220px; - bottom: 52%; -} -.sphx-glr-thumbcontainer[tooltip]:hover:before { - border: solid; - border-color: #333 transparent; - border-width: 18px 0 0 20px; - bottom: 58%; - content: ''; - left: 85%; - position: absolute; - z-index: 99; -} - -.highlight-pytb pre { - background-color: #ffe4e4; - border: 1px solid #f66; - margin-top: 10px; - padding: 7px; -} - -.sphx-glr-script-out { - color: #888; - margin: 0; -} -.sphx-glr-script-out .highlight { - background-color: transparent; - margin-left: 2.5em; - margin-top: -1.4em; -} -.sphx-glr-script-out .highlight pre { - background-color: #fafae2; - border: 0; - max-height: 30em; - overflow: auto; - padding-left: 1ex; - margin: 0px; - word-break: break-word; -} -.sphx-glr-script-out + p { - margin-top: 1.8em; -} -blockquote.sphx-glr-script-out { - margin-left: 0pt; -} - -div.sphx-glr-footer { - text-align: center; -} - -div.sphx-glr-download { - display: inline-block; - margin: 1em auto 1ex 2ex; - vertical-align: middle; -} - -div.sphx-glr-download a { - background-color: #ffc; - 
background-image: linear-gradient(to bottom, #FFC, #d5d57e); - border-radius: 4px; - border: 1px solid #c2c22d; - color: #000; - display: inline-block; - /* Not valid in old browser, hence we keep the line above to override */ - display: table-caption; - font-weight: bold; - padding: 1ex; - text-align: center; -} - -/* The last child of a download button is the file name */ -div.sphx-glr-download a span:last-child { - font-size: smaller; -} - -@media (min-width: 20em) { - div.sphx-glr-download a { - min-width: 10em; - } -} - -@media (min-width: 30em) { - div.sphx-glr-download a { - min-width: 13em; - } -} - -@media (min-width: 40em) { - div.sphx-glr-download a { - min-width: 16em; - } -} - - -div.sphx-glr-download code.download { - display: inline-block; - white-space: normal; - word-break: normal; - overflow-wrap: break-word; - /* border and background are given by the enclosing 'a' */ - border: none; - background: none; -} - -div.sphx-glr-download a:hover { - box-shadow: inset 0 1px 0 rgba(255,255,255,.1), 0 1px 5px rgba(0,0,0,.25); - text-decoration: none; - background-image: none; - background-color: #d5d57e; -} - -ul.sphx-glr-horizontal { - list-style: none; - padding: 0; -} -ul.sphx-glr-horizontal li { - display: inline; -} -ul.sphx-glr-horizontal img { - height: auto !important; -} - -p.sphx-glr-signature a.reference.external { - -moz-border-radius: 5px; - -webkit-border-radius: 5px; - border-radius: 5px; - padding: 3px; - font-size: 75%; - text-align: right; - margin-left: auto; - display: table; -} diff --git a/doc/sphinxext/sphinx_gallery/_static/no_image.png b/doc/sphinxext/sphinx_gallery/_static/no_image.png deleted file mode 100644 index 8c2d48d5d3f00..0000000000000 Binary files a/doc/sphinxext/sphinx_gallery/_static/no_image.png and /dev/null differ diff --git a/doc/sphinxext/sphinx_gallery/backreferences.py b/doc/sphinxext/sphinx_gallery/backreferences.py deleted file mode 100644 index 32e4dd913f901..0000000000000 --- a/doc/sphinxext/sphinx_gallery/backreferences.py +++ /dev/null @@ -1,197 +0,0 @@ -# -*- coding: utf-8 -*- -# Author: Óscar Nájera -# License: 3-clause BSD -""" -Backreferences Generator -======================== - -Parses example file code in order to keep track of used functions -""" - -from __future__ import print_function -import ast -import os - - -# Try Python 2 first, otherwise load from Python 3 -try: - import cPickle as pickle -except ImportError: - import pickle - - -class NameFinder(ast.NodeVisitor): - """Finds the longest form of variable names and their imports in code - - Only retains names from imported modules. - """ - - def __init__(self): - super(NameFinder, self).__init__() - self.imported_names = {} - self.accessed_names = set() - - def visit_Import(self, node, prefix=''): - for alias in node.names: - local_name = alias.asname or alias.name - self.imported_names[local_name] = prefix + alias.name - - def visit_ImportFrom(self, node): - self.visit_Import(node, node.module + '.') - - def visit_Name(self, node): - self.accessed_names.add(node.id) - - def visit_Attribute(self, node): - attrs = [] - while isinstance(node, ast.Attribute): - attrs.append(node.attr) - node = node.value - - if isinstance(node, ast.Name): - # This is a.b, not e.g. 
a().b - attrs.append(node.id) - self.accessed_names.add('.'.join(reversed(attrs))) - else: - # need to get a in a().b - self.visit(node) - - def get_mapping(self): - for name in self.accessed_names: - local_name = name.split('.', 1)[0] - remainder = name[len(local_name):] - if local_name in self.imported_names: - # Join import path to relative path - full_name = self.imported_names[local_name] + remainder - yield name, full_name - - -def get_short_module_name(module_name, obj_name): - """ Get the shortest possible module name """ - parts = module_name.split('.') - short_name = module_name - for i in range(len(parts) - 1, 0, -1): - short_name = '.'.join(parts[:i]) - try: - exec('from %s import %s' % (short_name, obj_name)) - except Exception: # libraries can throw all sorts of exceptions... - # get the last working module name - short_name = '.'.join(parts[:(i + 1)]) - break - return short_name - - -def identify_names(code): - """Builds a codeobj summary by identifying and resolving used names - - >>> code = ''' - ... from a.b import c - ... import d as e - ... print(c) - ... e.HelloWorld().f.g - ... ''' - >>> for name, o in sorted(identify_names(code).items()): - ... print(name, o['name'], o['module'], o['module_short']) - c c a.b a.b - e.HelloWorld HelloWorld d d - """ - finder = NameFinder() - try: - finder.visit(ast.parse(code)) - except SyntaxError: - return {} - - example_code_obj = {} - for name, full_name in finder.get_mapping(): - # name is as written in file (e.g. np.asarray) - # full_name includes resolved import path (e.g. numpy.asarray) - splitted = full_name.rsplit('.', 1) - if len(splitted) == 1: - # module without attribute. This is not useful for - # backreferences - continue - - module, attribute = splitted - # get shortened module name - module_short = get_short_module_name(module, attribute) - cobj = {'name': attribute, 'module': module, - 'module_short': module_short} - example_code_obj[name] = cobj - return example_code_obj - - -def scan_used_functions(example_file, gallery_conf): - """save variables so we can later add links to the documentation""" - example_code_obj = identify_names(open(example_file).read()) - if example_code_obj: - codeobj_fname = example_file[:-3] + '_codeobj.pickle' - with open(codeobj_fname, 'wb') as fid: - pickle.dump(example_code_obj, fid, pickle.HIGHEST_PROTOCOL) - - backrefs = set('{module_short}.{name}'.format(**entry) - for entry in example_code_obj.values() - if entry['module'].startswith(gallery_conf['doc_module'])) - - return backrefs - - -THUMBNAIL_TEMPLATE = """ -.. raw:: html - -
-    <div class="sphx-glr-thumbcontainer" tooltip="{snippet}"> - -.. only:: html - - .. figure:: /{thumbnail} - - :ref:`sphx_glr_{ref_name}` - -.. raw:: html - -    </div>
    -""" - -BACKREF_THUMBNAIL_TEMPLATE = THUMBNAIL_TEMPLATE + """ -.. only:: not html - - * :ref:`sphx_glr_{ref_name}` -""" - - -def _thumbnail_div(full_dir, fname, snippet, is_backref=False): - """Generates RST to place a thumbnail in a gallery""" - thumb = os.path.join(full_dir, 'images', 'thumb', - 'sphx_glr_%s_thumb.png' % fname[:-3]) - - # Inside rst files forward slash defines paths - thumb = thumb.replace(os.sep, "/") - - ref_name = os.path.join(full_dir, fname).replace(os.path.sep, '_') - - template = BACKREF_THUMBNAIL_TEMPLATE if is_backref else THUMBNAIL_TEMPLATE - return template.format(snippet=snippet, thumbnail=thumb, ref_name=ref_name) - - -def write_backreferences(seen_backrefs, gallery_conf, - target_dir, fname, snippet): - """Writes down back reference files, which include a thumbnail list - of examples using a certain module""" - if gallery_conf['backreferences_dir'] is None: - return - - example_file = os.path.join(target_dir, fname) - build_target_dir = os.path.relpath(target_dir, gallery_conf['src_dir']) - backrefs = scan_used_functions(example_file, gallery_conf) - for backref in backrefs: - include_path = os.path.join(gallery_conf['src_dir'], - gallery_conf['backreferences_dir'], - '%s.examples' % backref) - seen = backref in seen_backrefs - with open(include_path, 'a' if seen else 'w') as ex_file: - if not seen: - heading = '\n\nExamples using ``%s``' % backref - ex_file.write(heading + '\n') - ex_file.write('^' * len(heading) + '\n') - ex_file.write(_thumbnail_div(build_target_dir, fname, snippet, - is_backref=True)) - seen_backrefs.add(backref) diff --git a/doc/sphinxext/sphinx_gallery/docs_resolv.py b/doc/sphinxext/sphinx_gallery/docs_resolv.py deleted file mode 100644 index 0f9943b683d1c..0000000000000 --- a/doc/sphinxext/sphinx_gallery/docs_resolv.py +++ /dev/null @@ -1,463 +0,0 @@ -# -*- coding: utf-8 -*- -# Author: Óscar Nájera -# License: 3-clause BSD -""" -Link resolver objects -===================== -""" -from __future__ import print_function -import gzip -import os -import posixpath -import re -import shelve -import sys - -from sphinx.util.console import fuchsia - -# Try Python 2 first, otherwise load from Python 3 -try: - import cPickle as pickle - import urllib2 as urllib - from urllib2 import HTTPError, URLError -except ImportError: - import pickle - import urllib.request - import urllib.error - import urllib.parse - from urllib.error import HTTPError, URLError - -from io import StringIO - - -def _get_data(url): - """Helper function to get data over http or from a local file""" - if url.startswith('http://'): - # Try Python 2, use Python 3 on exception - try: - resp = urllib.urlopen(url) - encoding = resp.headers.dict.get('content-encoding', 'plain') - except AttributeError: - resp = urllib.request.urlopen(url) - encoding = resp.headers.get('content-encoding', 'plain') - data = resp.read() - if encoding == 'plain': - pass - elif encoding == 'gzip': - data = StringIO(data) - data = gzip.GzipFile(fileobj=data).read() - else: - raise RuntimeError('unknown encoding') - else: - with open(url, 'r') as fid: - data = fid.read() - - return data - - -def get_data(url, gallery_dir): - """Persistent dictionary usage to retrieve the search indexes""" - - # shelve keys need to be str in python 2 - if sys.version_info[0] == 2 and isinstance(url, unicode): - url = url.encode('utf-8') - - cached_file = os.path.join(gallery_dir, 'searchindex') - search_index = shelve.open(cached_file) - if url in search_index: - data = search_index[url] - else: - data = _get_data(url) - 
search_index[url] = data - search_index.close() - - return data - - -def _select_block(str_in, start_tag, end_tag): - """Select first block delimited by start_tag and end_tag""" - start_pos = str_in.find(start_tag) - if start_pos < 0: - raise ValueError('start_tag not found') - depth = 0 - for pos in range(start_pos, len(str_in)): - if str_in[pos] == start_tag: - depth += 1 - elif str_in[pos] == end_tag: - depth -= 1 - - if depth == 0: - break - sel = str_in[start_pos + 1:pos] - return sel - - -def _parse_dict_recursive(dict_str): - """Parse a dictionary from the search index""" - dict_out = dict() - pos_last = 0 - pos = dict_str.find(':') - while pos >= 0: - key = dict_str[pos_last:pos] - if dict_str[pos + 1] == '[': - # value is a list - pos_tmp = dict_str.find(']', pos + 1) - if pos_tmp < 0: - raise RuntimeError('error when parsing dict') - value = dict_str[pos + 2: pos_tmp].split(',') - # try to convert elements to int - for i in range(len(value)): - try: - value[i] = int(value[i]) - except ValueError: - pass - elif dict_str[pos + 1] == '{': - # value is another dictionary - subdict_str = _select_block(dict_str[pos:], '{', '}') - value = _parse_dict_recursive(subdict_str) - pos_tmp = pos + len(subdict_str) - else: - raise ValueError('error when parsing dict: unknown elem') - - key = key.strip('"') - if len(key) > 0: - dict_out[key] = value - - pos_last = dict_str.find(',', pos_tmp) - if pos_last < 0: - break - pos_last += 1 - pos = dict_str.find(':', pos_last) - - return dict_out - - -def parse_sphinx_searchindex(searchindex): - """Parse a Sphinx search index - - Parameters - ---------- - searchindex : str - The Sphinx search index (contents of searchindex.js) - - Returns - ------- - filenames : list of str - The file names parsed from the search index. - objects : dict - The objects parsed from the search index. - """ - # Make sure searchindex uses UTF-8 encoding - if hasattr(searchindex, 'decode'): - searchindex = searchindex.decode('UTF-8') - - # parse objects - query = 'objects:' - pos = searchindex.find(query) - if pos < 0: - raise ValueError('"objects:" not found in search index') - - sel = _select_block(searchindex[pos:], '{', '}') - objects = _parse_dict_recursive(sel) - - # parse filenames - query = 'filenames:' - pos = searchindex.find(query) - if pos < 0: - raise ValueError('"filenames:" not found in search index') - filenames = searchindex[pos + len(query) + 1:] - filenames = filenames[:filenames.find(']')] - filenames = [f.strip('"') for f in filenames.split(',')] - - return filenames, objects - - -class SphinxDocLinkResolver(object): - """ Resolve documentation links using searchindex.js generated by Sphinx - - Parameters - ---------- - doc_url : str - The base URL of the project website. - searchindex : str - Filename of searchindex, relative to doc_url. - extra_modules_test : list of str - List of extra module names to test. - relative : bool - Return relative links (only useful for links to documentation of this - package). 
- """ - - def __init__(self, doc_url, gallery_dir, searchindex='searchindex.js', - extra_modules_test=None, relative=False): - self.doc_url = doc_url - self.gallery_dir = gallery_dir - self.relative = relative - self._link_cache = {} - - self.extra_modules_test = extra_modules_test - self._page_cache = {} - if doc_url.startswith('http://'): - if relative: - raise ValueError('Relative links are only supported for local ' - 'URLs (doc_url cannot start with "http://)"') - searchindex_url = doc_url + '/' + searchindex - else: - searchindex_url = os.path.join(doc_url, searchindex) - - # detect if we are using relative links on a Windows system - if os.name.lower() == 'nt' and not doc_url.startswith('http://'): - if not relative: - raise ValueError('You have to use relative=True for the local' - ' package on a Windows system.') - self._is_windows = True - else: - self._is_windows = False - - # download and initialize the search index - sindex = get_data(searchindex_url, gallery_dir) - filenames, objects = parse_sphinx_searchindex(sindex) - - self._searchindex = dict(filenames=filenames, objects=objects) - - def _get_link(self, cobj): - """Get a valid link, False if not found""" - - fname_idx = None - full_name = cobj['module_short'] + '.' + cobj['name'] - if full_name in self._searchindex['objects']: - value = self._searchindex['objects'][full_name] - if isinstance(value, dict): - value = value[next(iter(value.keys()))] - fname_idx = value[0] - elif cobj['module_short'] in self._searchindex['objects']: - value = self._searchindex['objects'][cobj['module_short']] - if cobj['name'] in value.keys(): - fname_idx = value[cobj['name']][0] - - if fname_idx is not None: - fname = self._searchindex['filenames'][fname_idx] - # In 1.5+ Sphinx seems to have changed from .rst.html to only - # .html extension in converted files. But URLs could be - # built with < 1.5 or >= 1.5 regardless of what we're currently - # building with, so let's just check both :( - fnames = [fname + '.html', os.path.splitext(fname)[0] + '.html'] - for fname in fnames: - try: - if self._is_windows: - fname = fname.replace('/', '\\') - link = os.path.join(self.doc_url, fname) - else: - link = posixpath.join(self.doc_url, fname) - - if hasattr(link, 'decode'): - link = link.decode('utf-8', 'replace') - - if link in self._page_cache: - html = self._page_cache[link] - else: - html = get_data(link, self.gallery_dir) - self._page_cache[link] = html - except (HTTPError, URLError, IOError): - pass - else: - break - else: - raise - - # test if cobj appears in page - comb_names = [cobj['module_short'] + '.' + cobj['name']] - if self.extra_modules_test is not None: - for mod in self.extra_modules_test: - comb_names.append(mod + '.' + cobj['name']) - url = False - if hasattr(html, 'decode'): - # Decode bytes under Python 3 - html = html.decode('utf-8', 'replace') - - for comb_name in comb_names: - if hasattr(comb_name, 'decode'): - # Decode bytes under Python 3 - comb_name = comb_name.decode('utf-8', 'replace') - if comb_name in html: - url = link + u'#' + comb_name - link = url - else: - link = False - - return link - - def resolve(self, cobj, this_url): - """Resolve the link to the documentation, returns None if not found - - Parameters - ---------- - cobj : dict - Dict with information about the "code object" for which we are - resolving a link. - cobj['name'] : function or class name (str) - cobj['module_short'] : shortened module name (str) - cobj['module'] : module name (str) - this_url: str - URL of the current page. 
Needed to construct relative URLs - (only used if relative=True in constructor). - - Returns - ------- - link : str | None - The link (URL) to the documentation. - """ - full_name = cobj['module_short'] + '.' + cobj['name'] - link = self._link_cache.get(full_name, None) - if link is None: - # we don't have it cached - link = self._get_link(cobj) - # cache it for the future - self._link_cache[full_name] = link - - if link is False or link is None: - # failed to resolve - return None - - if self.relative: - link = os.path.relpath(link, start=this_url) - if self._is_windows: - # replace '\' with '/' so it on the web - link = link.replace('\\', '/') - - # for some reason, the relative link goes one directory too high up - link = link[3:] - - return link - - -def _embed_code_links(app, gallery_conf, gallery_dir): - # Add resolvers for the packages for which we want to show links - doc_resolvers = {} - - src_gallery_dir = os.path.join(app.builder.srcdir, gallery_dir) - for this_module, url in gallery_conf['reference_url'].items(): - try: - if url is None: - doc_resolvers[this_module] = SphinxDocLinkResolver( - app.builder.outdir, - src_gallery_dir, - relative=True) - else: - doc_resolvers[this_module] = SphinxDocLinkResolver(url, - src_gallery_dir) - - except HTTPError as e: - print("The following HTTP Error has occurred:\n") - print(e.code) - except URLError as e: - print("\n...\n" - "Warning: Embedding the documentation hyperlinks requires " - "Internet access.\nPlease check your network connection.\n" - "Unable to continue embedding `{0}` links due to a URL " - "Error:\n".format(this_module)) - print(e.args) - - html_gallery_dir = os.path.abspath(os.path.join(app.builder.outdir, - gallery_dir)) - - # patterns for replacement - link_pattern = ('%s') - orig_pattern = '%s' - period = '.' 
- - # This could be turned into a generator if necessary, but should be okay - flat = [[dirpath, filename] - for dirpath, _, filenames in os.walk(html_gallery_dir) - for filename in filenames] - iterator = app.status_iterator( - flat, os.path.basename(html_gallery_dir), colorfunc=fuchsia, - length=len(flat), stringify_func=lambda x: os.path.basename(x[1])) - for dirpath, fname in iterator: - full_fname = os.path.join(html_gallery_dir, dirpath, fname) - subpath = dirpath[len(html_gallery_dir) + 1:] - pickle_fname = os.path.join(src_gallery_dir, subpath, - fname[:-5] + '_codeobj.pickle') - - if os.path.exists(pickle_fname): - # we have a pickle file with the objects to embed links for - with open(pickle_fname, 'rb') as fid: - example_code_obj = pickle.load(fid) - fid.close() - str_repl = {} - # generate replacement strings with the links - for name, cobj in example_code_obj.items(): - this_module = cobj['module'].split('.')[0] - - if this_module not in doc_resolvers: - continue - - try: - link = doc_resolvers[this_module].resolve(cobj, - full_fname) - except (HTTPError, URLError) as e: - if isinstance(e, HTTPError): - extra = e.code - else: - extra = e.reason - print("\n\t\tError resolving %s.%s: %r (%s)" - % (cobj['module'], cobj['name'], e, extra)) - continue - - if link is not None: - parts = name.split('.') - name_html = period.join(orig_pattern % part - for part in parts) - full_function_name = '%s.%s' % ( - cobj['module'], cobj['name']) - str_repl[name_html] = link_pattern % ( - link, full_function_name, name_html) - # do the replacement in the html file - - # ensure greediness - names = sorted(str_repl, key=len, reverse=True) - regex_str = '|'.join(re.escape(name) for name in names) - regex = re.compile(regex_str) - - def substitute_link(match): - return str_repl[match.group()] - - if len(str_repl) > 0: - with open(full_fname, 'rb') as fid: - lines_in = fid.readlines() - with open(full_fname, 'wb') as fid: - for line in lines_in: - line = line.decode('utf-8') - line = regex.sub(substitute_link, line) - fid.write(line.encode('utf-8')) - - -def embed_code_links(app, exception): - """Embed hyperlinks to documentation into example code""" - if exception is not None: - return - - # No need to waste time embedding hyperlinks when not running the examples - # XXX: also at the time of writing this fixes make html-noplot - # for some reason I don't fully understand - if not app.builder.config.plot_gallery: - return - - # XXX: Whitelist of builders for which it makes sense to embed - # hyperlinks inside the example html. Note that the link embedding - # require searchindex.js to exist for the links to the local doc - # and there does not seem to be a good way of knowing which - # builders creates a searchindex.js. 
- if app.builder.name not in ['html', 'readthedocs']: - return - - print('Embedding documentation hyperlinks in examples..') - - gallery_conf = app.config.sphinx_gallery_conf - - gallery_dirs = gallery_conf['gallery_dirs'] - if not isinstance(gallery_dirs, list): - gallery_dirs = [gallery_dirs] - - for gallery_dir in gallery_dirs: - _embed_code_links(app, gallery_conf, gallery_dir) diff --git a/doc/sphinxext/sphinx_gallery/downloads.py b/doc/sphinxext/sphinx_gallery/downloads.py deleted file mode 100644 index 6b5b3df17fc87..0000000000000 --- a/doc/sphinxext/sphinx_gallery/downloads.py +++ /dev/null @@ -1,120 +0,0 @@ -# -*- coding: utf-8 -*- -r""" -Utilities for downloadable items -================================ - -""" -# Author: Óscar Nájera -# License: 3-clause BSD - -from __future__ import absolute_import, division, print_function - -import os -import zipfile - -CODE_DOWNLOAD = """ -\n.. container:: sphx-glr-footer - -\n .. container:: sphx-glr-download - - :download:`Download Python source code: {0} <{0}>`\n - -\n .. container:: sphx-glr-download - - :download:`Download Jupyter notebook: {1} <{1}>`\n""" - -CODE_ZIP_DOWNLOAD = """ -\n.. container:: sphx-glr-footer - -\n .. container:: sphx-glr-download - - :download:`Download all examples in Python source code: {0} `\n - -\n .. container:: sphx-glr-download - - :download:`Download all examples in Jupyter notebooks: {2} `\n""" - - -def python_zip(file_list, gallery_path, extension='.py'): - """Stores all files in file_list into an zip file - - Parameters - ---------- - file_list : list of strings - Holds all the file names to be included in zip file - gallery_path : string - path to where the zipfile is stored - extension : str - '.py' or '.ipynb' In order to deal with downloads of python - sources and jupyter notebooks the file extension from files in - file_list will be removed and replace with the value of this - variable while generating the zip file - Returns - ------- - zipname : string - zip file name, written as `target_dir_{python,jupyter}.zip` - depending on the extension - """ - zipname = os.path.basename(gallery_path) - zipname += '_python' if extension == '.py' else '_jupyter' - zipname = os.path.join(gallery_path, zipname + '.zip') - - zipf = zipfile.ZipFile(zipname, mode='w') - for fname in file_list: - file_src = os.path.splitext(fname)[0] + extension - zipf.write(file_src, os.path.relpath(file_src, gallery_path)) - zipf.close() - - return zipname - - -def list_downloadable_sources(target_dir): - """Returns a list of python source files is target_dir - - Parameters - ---------- - target_dir : string - path to the directory where python source file are - Returns - ------- - list - list of paths to all Python source files in `target_dir` - """ - return [os.path.join(target_dir, fname) - for fname in os.listdir(target_dir) - if fname.endswith('.py')] - - -def generate_zipfiles(gallery_dir): - """ - Collects all Python source files and Jupyter notebooks in - gallery_dir and makes zipfiles of them - - Parameters - ---------- - gallery_dir : string - path of the gallery to collect downloadable sources - - Return - ------ - download_rst: string - RestructuredText to include download buttons to the generated files - """ - - listdir = list_downloadable_sources(gallery_dir) - for directory in sorted(os.listdir(gallery_dir)): - if os.path.isdir(os.path.join(gallery_dir, directory)): - target_dir = os.path.join(gallery_dir, directory) - listdir.extend(list_downloadable_sources(target_dir)) - - py_zipfile = python_zip(listdir, 
gallery_dir) - jy_zipfile = python_zip(listdir, gallery_dir, ".ipynb") - - def rst_path(filepath): - return filepath.replace(os.sep, '/') - - dw_rst = CODE_ZIP_DOWNLOAD.format(os.path.basename(py_zipfile), - rst_path(py_zipfile), - os.path.basename(jy_zipfile), - rst_path(jy_zipfile)) - return dw_rst diff --git a/doc/sphinxext/sphinx_gallery/gen_gallery.py b/doc/sphinxext/sphinx_gallery/gen_gallery.py deleted file mode 100644 index 1a1ce299fab1c..0000000000000 --- a/doc/sphinxext/sphinx_gallery/gen_gallery.py +++ /dev/null @@ -1,304 +0,0 @@ -# -*- coding: utf-8 -*- -# Author: Óscar Nájera -# License: 3-clause BSD -""" -Sphinx-Gallery Generator -======================== - -Attaches Sphinx-Gallery to Sphinx in order to generate the galleries -when building the documentation. -""" - - -from __future__ import division, print_function, absolute_import -import copy -import re -import os - -from . import glr_path_static -from .gen_rst import generate_dir_rst, SPHX_GLR_SIG -from .docs_resolv import embed_code_links -from .downloads import generate_zipfiles - -try: - FileNotFoundError -except NameError: - # Python2 - FileNotFoundError = IOError - -DEFAULT_GALLERY_CONF = { - 'filename_pattern': re.escape(os.sep) + 'plot', - 'examples_dirs': os.path.join('..', 'examples'), - 'gallery_dirs': 'auto_examples', - 'backreferences_dir': None, - 'doc_module': (), - 'reference_url': {}, - # build options - 'plot_gallery': True, - 'download_all_examples': True, - 'abort_on_example_error': False, - 'failing_examples': {}, - 'expected_failing_examples': set(), -} - - -def clean_gallery_out(build_dir): - """Deletes images under the sphx_glr namespace in the build directory""" - # Sphinx hack: sphinx copies generated images to the build directory - # each time the docs are made. If the desired image name already - # exists, it appends a digit to prevent overwrites. The problem is, - # the directory is never cleared. This means that each time you build - # the docs, the number of images in the directory grows. - # - # This question has been asked on the sphinx development list, but there - # was no response: http://osdir.com/ml/sphinx-dev/2011-02/msg00123.html - # - # The following is a hack that prevents this behavior by clearing the - # image build directory from gallery images each time the docs are built. - # If sphinx changes their layout between versions, this will not - # work (though it should probably not cause a crash). - # Tested successfully on Sphinx 1.0.7 - - build_image_dir = os.path.join(build_dir, '_images') - if os.path.exists(build_image_dir): - filelist = os.listdir(build_image_dir) - for filename in filelist: - if filename.startswith('sphx_glr') and filename.endswith('png'): - os.remove(os.path.join(build_image_dir, filename)) - - -def parse_config(app): - """Process the Sphinx Gallery configuration""" - # TODO: Test this behavior. - try: - plot_gallery = eval(app.builder.config.plot_gallery) - except TypeError: - plot_gallery = bool(app.builder.config.plot_gallery) - - gallery_conf = copy.deepcopy(DEFAULT_GALLERY_CONF) - gallery_conf.update(app.config.sphinx_gallery_conf) - gallery_conf.update(plot_gallery=plot_gallery) - gallery_conf.update( - abort_on_example_error=app.builder.config.abort_on_example_error) - gallery_conf['src_dir'] = app.builder.srcdir - - backreferences_warning = """\n======== -Sphinx-Gallery now requires you to set the configuration variable -'backreferences_dir' in your config to activate the -backreferences. 
That is mini galleries clustered by the functions used -in the example scripts. Have a look at it in sphinx-gallery - -https://sphinx-gallery.readthedocs.io/en/stable/index.html#examples-using-numpy-linspace -""" - - if gallery_conf.get("mod_example_dir", False): - update_msg = """\nFor a quick fix try replacing 'mod_example_dir' -by 'backreferences_dir' in your conf.py file. If that does not solve the -present issue read carefully how to update in the online documentation - -https://sphinx-gallery.readthedocs.io/en/latest/advanced_configuration.html#references-to-examples""" - - gallery_conf['backreferences_dir'] = gallery_conf['mod_example_dir'] - app.warn("Old configuration for backreferences detected \n" - "using the configuration variable `mod_example_dir`\n" - + backreferences_warning - + update_msg, prefix="DeprecationWarning: ") - - elif gallery_conf['backreferences_dir'] is None: - no_care_msg = """ -If you don't care about this features set in your conf.py -'backreferences_dir': False\n""" - - app.warn(backreferences_warning + no_care_msg) - - gallery_conf['backreferences_dir'] = os.path.join( - 'modules', 'generated') - app.warn("using old default 'backreferences_dir':'{}'.\n" - " This will be disabled in future releases\n".format( - gallery_conf['backreferences_dir']), - prefix="DeprecationWarning: ") - - # this assures I can call the config in other places - app.config.sphinx_gallery_conf = gallery_conf - app.config.html_static_path.append(glr_path_static()) - - return gallery_conf - - -def _prepare_sphx_glr_dirs(gallery_conf, srcdir): - """Creates necessary folders for sphinx_gallery files """ - examples_dirs = gallery_conf['examples_dirs'] - gallery_dirs = gallery_conf['gallery_dirs'] - - if not isinstance(examples_dirs, list): - examples_dirs = [examples_dirs] - if not isinstance(gallery_dirs, list): - gallery_dirs = [gallery_dirs] - - if bool(gallery_conf['backreferences_dir']): - backreferences_dir = os.path.join( - srcdir, gallery_conf['backreferences_dir']) - if not os.path.exists(backreferences_dir): - os.makedirs(backreferences_dir) - - return examples_dirs, gallery_dirs - - -def generate_gallery_rst(app): - """Generate the Main examples gallery reStructuredText - - Start the sphinx-gallery configuration and recursively scan the examples - directories in order to populate the examples gallery - """ - print('Generating gallery') - gallery_conf = parse_config(app) - - clean_gallery_out(app.builder.outdir) - - seen_backrefs = set() - - computation_times = [] - examples_dirs, gallery_dirs = _prepare_sphx_glr_dirs(gallery_conf, - app.builder.srcdir) - - for examples_dir, gallery_dir in zip(examples_dirs, gallery_dirs): - examples_dir = os.path.join(app.builder.srcdir, examples_dir) - gallery_dir = os.path.join(app.builder.srcdir, gallery_dir) - - for workdir in [examples_dir, gallery_dir]: - if not os.path.exists(workdir): - os.makedirs(workdir) - # Here we don't use an os.walk, but we recurse only twice: flat is - # better than nested. - this_fhindex, this_computation_times = generate_dir_rst( - examples_dir, gallery_dir, gallery_conf, seen_backrefs) - if this_fhindex == "": - raise FileNotFoundError("Main example directory {0} does not " - "have a README.txt file. Please write " - "one to introduce your gallery." 
- .format(examples_dir)) - - computation_times += this_computation_times - - # we create an index.rst with all examples - fhindex = open(os.path.join(gallery_dir, 'index.rst'), 'w') - # :orphan: to suppress "not included in TOCTREE" sphinx warnings - fhindex.write(":orphan:\n\n" + this_fhindex) - for directory in sorted(os.listdir(examples_dir)): - if os.path.isdir(os.path.join(examples_dir, directory)): - src_dir = os.path.join(examples_dir, directory) - target_dir = os.path.join(gallery_dir, directory) - this_fhindex, this_computation_times = generate_dir_rst(src_dir, target_dir, gallery_conf, - seen_backrefs) - fhindex.write(this_fhindex) - computation_times += this_computation_times - - if gallery_conf['download_all_examples']: - download_fhindex = generate_zipfiles(gallery_dir) - fhindex.write(download_fhindex) - - fhindex.write(SPHX_GLR_SIG) - fhindex.flush() - - if gallery_conf['plot_gallery']: - print("Computation time summary:") - for time_elapsed, fname in sorted(computation_times)[::-1]: - if time_elapsed is not None: - print("\t- %s : %.2g sec" % (fname, time_elapsed)) - else: - print("\t- %s : not run" % fname) - - -def touch_empty_backreferences(app, what, name, obj, options, lines): - """Generate empty back-reference example files - - This avoids inclusion errors/warnings if there are no gallery - examples for a class / module that is being parsed by autodoc""" - - if not bool(app.config.sphinx_gallery_conf['backreferences_dir']): - return - - examples_path = os.path.join(app.srcdir, - app.config.sphinx_gallery_conf[ - "backreferences_dir"], - "%s.examples" % name) - - if not os.path.exists(examples_path): - # touch file - open(examples_path, 'w').close() - - -def sumarize_failing_examples(app, exception): - """Collects the list of falling examples during build and prints them with the traceback - - Raises ValueError if there where failing examples - """ - if exception is not None: - return - - # Under no-plot Examples are not run so nothing to summarize - if not app.config.sphinx_gallery_conf['plot_gallery']: - return - - gallery_conf = app.config.sphinx_gallery_conf - failing_examples = set(gallery_conf['failing_examples'].keys()) - expected_failing_examples = set([os.path.normpath(os.path.join(app.srcdir, path)) - for path in - gallery_conf['expected_failing_examples']]) - - examples_expected_to_fail = failing_examples.intersection( - expected_failing_examples) - expected_fail_msg = [] - if examples_expected_to_fail: - expected_fail_msg.append("\n\nExamples failing as expected:") - for fail_example in examples_expected_to_fail: - expected_fail_msg.append(fail_example + ' failed leaving traceback:\n' + - gallery_conf['failing_examples'][fail_example] + '\n') - print("\n".join(expected_fail_msg)) - - examples_not_expected_to_fail = failing_examples.difference( - expected_failing_examples) - fail_msgs = [] - if examples_not_expected_to_fail: - fail_msgs.append("Unexpected failing examples:") - for fail_example in examples_not_expected_to_fail: - fail_msgs.append(fail_example + ' failed leaving traceback:\n' + - gallery_conf['failing_examples'][fail_example] + '\n') - - examples_not_expected_to_pass = expected_failing_examples.difference( - failing_examples) - if examples_not_expected_to_pass: - fail_msgs.append("Examples expected to fail, but not failling:\n" + - "Please remove these examples from\n" + - "sphinx_gallery_conf['expected_failing_examples']\n" + - "in your conf.py file" - "\n".join(examples_not_expected_to_pass)) - - if fail_msgs: - raise ValueError("Here is a 
summary of the problems encountered when " - "running the examples\n\n" + "\n".join(fail_msgs) + - "\n" + "-" * 79) - - -def get_default_config_value(key): - def default_getter(conf): - return conf['sphinx_gallery_conf'].get(key, DEFAULT_GALLERY_CONF[key]) - return default_getter - - -def setup(app): - """Setup sphinx-gallery sphinx extension""" - app.add_config_value('sphinx_gallery_conf', DEFAULT_GALLERY_CONF, 'html') - for key in ['plot_gallery', 'abort_on_example_error']: - app.add_config_value(key, get_default_config_value(key), 'html') - - app.add_stylesheet('gallery.css') - # Sphinx < 1.6 calls it `_extensions`, >= 1.6 is `extensions`. - extensions_attr = '_extensions' if hasattr(app, '_extensions') else 'extensions' - if 'sphinx.ext.autodoc' in getattr(app, extensions_attr): - app.connect('autodoc-process-docstring', touch_empty_backreferences) - - app.connect('builder-inited', generate_gallery_rst) - - app.connect('build-finished', sumarize_failing_examples) - app.connect('build-finished', embed_code_links) diff --git a/doc/sphinxext/sphinx_gallery/gen_rst.py b/doc/sphinxext/sphinx_gallery/gen_rst.py deleted file mode 100644 index c2a0b95545499..0000000000000 --- a/doc/sphinxext/sphinx_gallery/gen_rst.py +++ /dev/null @@ -1,641 +0,0 @@ -# -*- coding: utf-8 -*- -# Author: Óscar Nájera -# License: 3-clause BSD -""" -RST file generator -================== - -Generate the rst files for the examples by iterating over the python -example files. - -Files that generate images should start with 'plot' - -""" -# Don't use unicode_literals here (be explicit with u"..." instead) otherwise -# tricky errors come up with exec(code_blocks, ...) calls -from __future__ import division, print_function, absolute_import -from time import time -import codecs -import hashlib -import os -import re -import shutil -import subprocess -import sys -import traceback -import warnings - - -# Try Python 2 first, otherwise load from Python 3 -try: - # textwrap indent only exists in python 3 - from textwrap import indent -except ImportError: - def indent(text, prefix, predicate=None): - """Adds 'prefix' to the beginning of selected lines in 'text'. - - If 'predicate' is provided, 'prefix' will only be added to the lines - where 'predicate(line)' is True. If 'predicate' is not provided, - it will default to adding 'prefix' to all non-empty lines that do not - consist solely of whitespace characters. - """ - if predicate is None: - def predicate(line): - return line.strip() - - def prefixed_lines(): - for line in text.splitlines(True): - yield (prefix + line if predicate(line) else line) - return ''.join(prefixed_lines()) - -from io import StringIO - -# make sure that the Agg backend is set before importing any -# matplotlib -import matplotlib -matplotlib.use('agg') -matplotlib_backend = matplotlib.get_backend() - -if matplotlib_backend != 'agg': - mpl_backend_msg = ( - "Sphinx-Gallery relies on the matplotlib 'agg' backend to " - "render figures and write them to files. You are " - "currently using the {} backend. Sphinx-Gallery will " - "terminate the build now, because changing backends is " - "not well supported by matplotlib. We advise you to move " - "sphinx_gallery imports before any matplotlib-dependent " - "import. Moving sphinx_gallery imports at the top of " - "your conf.py file should fix this issue") - - raise ValueError(mpl_backend_msg.format(matplotlib_backend)) - -import matplotlib.pyplot as plt - -from . 
import glr_path_static -from .backreferences import write_backreferences, _thumbnail_div -from .downloads import CODE_DOWNLOAD -from .py_source_parser import (get_docstring_and_rest, - split_code_and_text_blocks) - -from .notebook import jupyter_notebook, save_notebook - -try: - basestring -except NameError: - basestring = str - unicode = str - - -############################################################################### - - -class Tee(object): - """A tee object to redirect streams to multiple outputs""" - - def __init__(self, file1, file2): - self.file1 = file1 - self.file2 = file2 - - def write(self, data): - self.file1.write(data) - self.file2.write(data) - - def flush(self): - self.file1.flush() - self.file2.flush() - - # When called from a local terminal seaborn needs it in Python3 - def isatty(self): - self.file1.isatty() - - -class MixedEncodingStringIO(StringIO): - """Helper when both ASCII and unicode strings will be written""" - - def write(self, data): - if not isinstance(data, unicode): - data = data.decode('utf-8') - StringIO.write(self, data) - - -############################################################################### -# The following strings are used when we have several pictures: we use -# an html div tag that our CSS uses to turn the lists into horizontal -# lists. -HLIST_HEADER = """ -.. rst-class:: sphx-glr-horizontal - -""" - -HLIST_IMAGE_TEMPLATE = """ - * - - .. image:: /%s - :scale: 47 -""" - -SINGLE_IMAGE = """ -.. image:: /%s - :align: center -""" - - -# This one could contain unicode -CODE_OUTPUT = u""".. rst-class:: sphx-glr-script-out - - Out:: - -{0}\n""" - - -SPHX_GLR_SIG = """\n.. rst-class:: sphx-glr-signature - - `Generated by Sphinx-Gallery `_\n""" - - -def codestr2rst(codestr, lang='python'): - """Return reStructuredText code block from code string""" - code_directive = "\n.. code-block:: {0}\n\n".format(lang) - indented_block = indent(codestr, ' ' * 4) - return code_directive + indented_block - - -def extract_thumbnail_number(text): - """ Pull out the thumbnail image number specified in the docstring. """ - - # check whether the user has specified a specific thumbnail image - pattr = re.compile( - r"^\s*#\s*sphinx_gallery_thumbnail_number\s*=\s*([0-9]+)\s*$", - flags=re.MULTILINE) - match = pattr.search(text) - - if match is None: - # by default, use the first figure created - thumbnail_number = 1 - else: - thumbnail_number = int(match.groups()[0]) - - return thumbnail_number - - -def extract_intro(filename): - """ Extract the first paragraph of module-level docstring. max:95 char""" - - docstring, _ = get_docstring_and_rest(filename) - - # lstrip is just in case docstring has a '\n\n' at the beginning - paragraphs = docstring.lstrip().split('\n\n') - if len(paragraphs) > 1: - first_paragraph = re.sub('\n', ' ', paragraphs[1]) - first_paragraph = (first_paragraph[:95] + '...' - if len(first_paragraph) > 95 else first_paragraph) - else: - raise ValueError( - "Example docstring should have a header for the example title " - "and at least a paragraph explaining what the example is about. 
" - "Please check the example file:\n {}\n".format(filename)) - - return first_paragraph - - -def get_md5sum(src_file): - """Returns md5sum of file""" - - with open(src_file, 'rb') as src_data: - src_content = src_data.read() - - src_md5 = hashlib.md5(src_content).hexdigest() - return src_md5 - - -def md5sum_is_current(src_file): - """Checks whether src_file has the same md5 hash as the one on disk""" - - src_md5 = get_md5sum(src_file) - - src_md5_file = src_file + '.md5' - if os.path.exists(src_md5_file): - with open(src_md5_file, 'r') as file_checksum: - ref_md5 = file_checksum.read() - - return src_md5 == ref_md5 - - return False - - -def save_figures(image_path, fig_count, gallery_conf): - """Save all open matplotlib figures of the example code-block - - Parameters - ---------- - image_path : str - Path where plots are saved (format string which accepts figure number) - fig_count : int - Previous figure number count. Figure number add from this number - gallery_conf : dict - Contains the configuration of Sphinx-Gallery - - Returns - ------- - images_rst : str - rst code to embed the images in the document - fig_num : int - number of figures saved - """ - figure_list = [] - - for fig_num in plt.get_fignums(): - # Set the fig_num figure as the current figure as we can't - # save a figure that's not the current figure. - fig = plt.figure(fig_num) - kwargs = {} - to_rgba = matplotlib.colors.colorConverter.to_rgba - for attr in ['facecolor', 'edgecolor']: - fig_attr = getattr(fig, 'get_' + attr)() - default_attr = matplotlib.rcParams['figure.' + attr] - if to_rgba(fig_attr) != to_rgba(default_attr): - kwargs[attr] = fig_attr - - current_fig = image_path.format(fig_count + fig_num) - fig.savefig(current_fig, **kwargs) - figure_list.append(current_fig) - - if gallery_conf.get('find_mayavi_figures', False): - from mayavi import mlab - e = mlab.get_engine() - last_matplotlib_fig_num = fig_count + len(figure_list) - total_fig_num = last_matplotlib_fig_num + len(e.scenes) - mayavi_fig_nums = range(last_matplotlib_fig_num + 1, total_fig_num + 1) - - for scene, mayavi_fig_num in zip(e.scenes, mayavi_fig_nums): - current_fig = image_path.format(mayavi_fig_num) - mlab.savefig(current_fig, figure=scene) - # make sure the image is not too large - scale_image(current_fig, current_fig, 850, 999) - figure_list.append(current_fig) - mlab.close(all=True) - - return figure_rst(figure_list, gallery_conf['src_dir']) - - -def figure_rst(figure_list, sources_dir): - """Given a list of paths to figures generate the corresponding rst - - Depending on whether we have one or more figures, we use a - single rst call to 'image' or a horizontal list. 
- - Parameters - ---------- - figure_list : list of str - Strings are the figures' absolute paths - sources_dir : str - absolute path of Sphinx documentation sources - - Returns - ------- - images_rst : str - rst code to embed the images in the document - fig_num : int - number of figures saved - """ - - figure_paths = [os.path.relpath(figure_path, sources_dir) - .replace(os.sep, '/').lstrip('/') - for figure_path in figure_list] - images_rst = "" - if len(figure_paths) == 1: - figure_name = figure_paths[0] - images_rst = SINGLE_IMAGE % figure_name - elif len(figure_paths) > 1: - images_rst = HLIST_HEADER - for figure_name in figure_paths: - images_rst += HLIST_IMAGE_TEMPLATE % figure_name - - return images_rst, len(figure_list) - - -def scale_image(in_fname, out_fname, max_width, max_height): - """Scales an image with the same aspect ratio centered in an - image with a given max_width and max_height - if in_fname == out_fname the image can only be scaled down - """ - # local import to avoid testing dependency on PIL: - try: - from PIL import Image - except ImportError: - import Image - img = Image.open(in_fname) - width_in, height_in = img.size - scale_w = max_width / float(width_in) - scale_h = max_height / float(height_in) - - if height_in * scale_w <= max_height: - scale = scale_w - else: - scale = scale_h - - if scale >= 1.0 and in_fname == out_fname: - return - - width_sc = int(round(scale * width_in)) - height_sc = int(round(scale * height_in)) - - # resize the image - img.thumbnail((width_sc, height_sc), Image.ANTIALIAS) - - # insert centered - thumb = Image.new('RGB', (max_width, max_height), (255, 255, 255)) - pos_insert = ((max_width - width_sc) // 2, (max_height - height_sc) // 2) - thumb.paste(img, pos_insert) - - thumb.save(out_fname) - # Use optipng to perform lossless compression on the resized image if - # software is installed - if os.environ.get('SKLEARN_DOC_OPTIPNG', False): - try: - subprocess.call(["optipng", "-quiet", "-o", "9", out_fname]) - except Exception: - warnings.warn('Install optipng to reduce the size of the \ - generated images') - - -def save_thumbnail(image_path_template, src_file, gallery_conf): - """Save the thumbnail image""" - # read specification of the figure to display as thumbnail from main text - _, content = get_docstring_and_rest(src_file) - thumbnail_number = extract_thumbnail_number(content) - thumbnail_image_path = image_path_template.format(thumbnail_number) - - thumb_dir = os.path.join(os.path.dirname(thumbnail_image_path), 'thumb') - if not os.path.exists(thumb_dir): - os.makedirs(thumb_dir) - - base_image_name = os.path.splitext(os.path.basename(src_file))[0] - thumb_file = os.path.join(thumb_dir, - 'sphx_glr_%s_thumb.png' % base_image_name) - - if src_file in gallery_conf['failing_examples']: - broken_img = os.path.join(glr_path_static(), 'broken_example.png') - scale_image(broken_img, thumb_file, 200, 140) - - elif os.path.exists(thumbnail_image_path): - scale_image(thumbnail_image_path, thumb_file, 400, 280) - - elif not os.path.exists(thumb_file): - # create something to replace the thumbnail - default_thumb_file = os.path.join(glr_path_static(), 'no_image.png') - default_thumb_file = gallery_conf.get("default_thumb_file", - default_thumb_file) - scale_image(default_thumb_file, thumb_file, 200, 140) - - -def generate_dir_rst(src_dir, target_dir, gallery_conf, seen_backrefs): - """Generate the gallery reStructuredText for an example directory""" - if not os.path.exists(os.path.join(src_dir, 'README.txt')): - print(80 * '_') - 
print('Example directory %s does not have a README.txt file' % - src_dir) - print('Skipping this directory') - print(80 * '_') - return "", [] # because string is an expected return type - - with open(os.path.join(src_dir, 'README.txt')) as fid: - fhindex = fid.read() - # Add empty lines to avoid bug in issue #165 - fhindex += "\n\n" - - if not os.path.exists(target_dir): - os.makedirs(target_dir) - sorted_listdir = [fname for fname in sorted(os.listdir(src_dir)) - if fname.endswith('.py')] - entries_text = [] - computation_times = [] - build_target_dir = os.path.relpath(target_dir, gallery_conf['src_dir']) - for fname in sorted_listdir: - amount_of_code, time_elapsed = \ - generate_file_rst(fname, target_dir, src_dir, gallery_conf) - computation_times.append((time_elapsed, fname)) - new_fname = os.path.join(src_dir, fname) - intro = extract_intro(new_fname) - this_entry = _thumbnail_div(build_target_dir, fname, intro) + """ - -.. toctree:: - :hidden: - - /%s\n""" % os.path.join(build_target_dir, fname[:-3]).replace(os.sep, '/') - entries_text.append((amount_of_code, this_entry)) - - if gallery_conf['backreferences_dir']: - write_backreferences(seen_backrefs, gallery_conf, - target_dir, fname, intro) - - # sort to have the smallest entries in the beginning - entries_text.sort() - - for _, entry_text in entries_text: - fhindex += entry_text - - # clear at the end of the section - fhindex += """.. raw:: html\n -
    \n\n""" - - return fhindex, computation_times - - -def execute_code_block(code_block, example_globals, - block_vars, gallery_conf): - """Executes the code block of the example file""" - time_elapsed = 0 - stdout = '' - - # If example is not suitable to run, skip executing its blocks - if not block_vars['execute_script']: - return stdout, time_elapsed - - plt.close('all') - cwd = os.getcwd() - # Redirect output to stdout and - orig_stdout = sys.stdout - src_file = block_vars['src_file'] - - try: - # First cd in the original example dir, so that any file - # created by the example get created in this directory - os.chdir(os.path.dirname(src_file)) - my_buffer = MixedEncodingStringIO() - my_stdout = Tee(sys.stdout, my_buffer) - sys.stdout = my_stdout - - t_start = time() - # don't use unicode_literals at the top of this file or you get - # nasty errors here on Py2.7 - exec(code_block, example_globals) - time_elapsed = time() - t_start - - sys.stdout = orig_stdout - - my_stdout = my_buffer.getvalue().strip().expandtabs() - # raise RuntimeError - if my_stdout: - stdout = CODE_OUTPUT.format(indent(my_stdout, u' ' * 4)) - os.chdir(cwd) - images_rst, fig_num = save_figures(block_vars['image_path'], - block_vars['fig_count'], gallery_conf) - - except Exception: - formatted_exception = traceback.format_exc() - - fail_example_warning = 80 * '_' + '\n' + \ - '%s failed to execute correctly:' % src_file + \ - formatted_exception + 80 * '_' + '\n' - warnings.warn(fail_example_warning) - - fig_num = 0 - images_rst = codestr2rst(formatted_exception, lang='pytb') - - # Breaks build on first example error - # XXX This check can break during testing e.g. if you uncomment the - # `raise RuntimeError` by the `my_stdout` call, maybe use `.get()`? - if gallery_conf['abort_on_example_error']: - raise - # Stores failing file - gallery_conf['failing_examples'][src_file] = formatted_exception - block_vars['execute_script'] = False - - finally: - os.chdir(cwd) - sys.stdout = orig_stdout - - code_output = u"\n{0}\n\n{1}\n\n".format(images_rst, stdout) - block_vars['fig_count'] += fig_num - - return code_output, time_elapsed - - -def clean_modules(): - """Remove "unload" seaborn from the name space - - After a script is executed it can load a variety of setting that one - does not want to influence in other examples in the gallery.""" - - # Horrible code to 'unload' seaborn, so that it resets - # its default when is load - # Python does not support unloading of modules - # https://bugs.python.org/issue9072 - for module in list(sys.modules.keys()): - if 'seaborn' in module: - del sys.modules[module] - - # Reset Matplotlib to default - plt.rcdefaults() - - -def generate_file_rst(fname, target_dir, src_dir, gallery_conf): - """Generate the rst file for a given example. 
- - Returns - ------- - amount_of_code : int - character count of the corresponding python script in file - time_elapsed : float - seconds required to run the script - """ - - src_file = os.path.normpath(os.path.join(src_dir, fname)) - example_file = os.path.join(target_dir, fname) - shutil.copyfile(src_file, example_file) - script_blocks = split_code_and_text_blocks(src_file) - amount_of_code = sum([len(bcontent) - for blabel, bcontent in script_blocks - if blabel == 'code']) - - if md5sum_is_current(example_file): - return amount_of_code, 0 - - image_dir = os.path.join(target_dir, 'images') - if not os.path.exists(image_dir): - os.makedirs(image_dir) - - base_image_name = os.path.splitext(fname)[0] - image_fname = 'sphx_glr_' + base_image_name + '_{0:03}.png' - build_image_dir = os.path.relpath(image_dir, gallery_conf['src_dir']) - image_path_template = os.path.join(image_dir, image_fname) - - ref_fname = os.path.relpath(example_file, gallery_conf['src_dir']) - ref_fname = ref_fname.replace(os.path.sep, '_') - example_rst = """\n\n.. _sphx_glr_{0}:\n\n""".format(ref_fname) - - filename_pattern = gallery_conf.get('filename_pattern') - execute_script = re.search(filename_pattern, src_file) and gallery_conf[ - 'plot_gallery'] - example_globals = { - # A lot of examples contains 'print(__doc__)' for example in - # scikit-learn so that running the example prints some useful - # information. Because the docstring has been separated from - # the code blocks in sphinx-gallery, __doc__ is actually - # __builtin__.__doc__ in the execution context and we do not - # want to print it - '__doc__': '', - # Examples may contain if __name__ == '__main__' guards - # for in example scikit-learn if the example uses multiprocessing - '__name__': '__main__', - # Don't ever support __file__: Issues #166 #212 - } - - # A simple example has two blocks: one for the - # example introduction/explanation and one for the code - is_example_notebook_like = len(script_blocks) > 2 - time_elapsed = 0 - block_vars = {'execute_script': execute_script, 'fig_count': 0, - 'image_path': image_path_template, 'src_file': src_file} - if block_vars['execute_script']: - print('Executing file %s' % src_file) - for blabel, bcontent in script_blocks: - if blabel == 'code': - code_output, rtime = execute_code_block(bcontent, - example_globals, - block_vars, - gallery_conf) - - time_elapsed += rtime - - if is_example_notebook_like: - example_rst += codestr2rst(bcontent) + '\n' - example_rst += code_output - else: - example_rst += code_output - if 'sphx-glr-script-out' in code_output: - # Add some vertical space after output - example_rst += "\n\n|\n\n" - example_rst += codestr2rst(bcontent) + '\n' - - else: - example_rst += bcontent + '\n\n' - - clean_modules() - - # Writes md5 checksum if example has build correctly - # not failed and was initially meant to run(no-plot shall not cache md5sum) - if block_vars['execute_script']: - with open(example_file + '.md5', 'w') as file_checksum: - file_checksum.write(get_md5sum(example_file)) - - save_thumbnail(image_path_template, src_file, gallery_conf) - - time_m, time_s = divmod(time_elapsed, 60) - example_nb = jupyter_notebook(script_blocks) - save_notebook(example_nb, example_file.replace('.py', '.ipynb')) - with codecs.open(os.path.join(target_dir, base_image_name + '.rst'), - mode='w', encoding='utf-8') as f: - example_rst += "**Total running time of the script:**" \ - " ({0: .0f} minutes {1: .3f} seconds)\n\n".format( - time_m, time_s) - example_rst += CODE_DOWNLOAD.format(fname, - 
fname.replace('.py', '.ipynb')) - example_rst += SPHX_GLR_SIG - f.write(example_rst) - - if block_vars['execute_script']: - print("{0} ran in : {1:.2g} seconds\n".format(src_file, time_elapsed)) - - return amount_of_code, time_elapsed diff --git a/doc/sphinxext/sphinx_gallery/notebook.py b/doc/sphinxext/sphinx_gallery/notebook.py deleted file mode 100644 index a0cfdbd7881d6..0000000000000 --- a/doc/sphinxext/sphinx_gallery/notebook.py +++ /dev/null @@ -1,193 +0,0 @@ -# -*- coding: utf-8 -*- -r""" -Parser for Jupyter notebooks -============================ - -Class that holds the Jupyter notebook information - -""" -# Author: Óscar Nájera -# License: 3-clause BSD - -from __future__ import division, absolute_import, print_function -from functools import partial -import argparse -import json -import re -import sys -from .py_source_parser import split_code_and_text_blocks - - -def jupyter_notebook_skeleton(): - """Returns a dictionary with the elements of a Jupyter notebook""" - py_version = sys.version_info - notebook_skeleton = { - "cells": [], - "metadata": { - "kernelspec": { - "display_name": "Python " + str(py_version[0]), - "language": "python", - "name": "python" + str(py_version[0]) - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": py_version[0] - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython" + str(py_version[0]), - "version": '{0}.{1}.{2}'.format(*sys.version_info[:3]) - } - }, - "nbformat": 4, - "nbformat_minor": 0 - } - return notebook_skeleton - - -def directive_fun(match, directive): - """Helper to fill in directives""" - directive_to_alert = dict(note="info", warning="danger") - return ('
-    <div class="alert alert-{0}">
-    <h4>{1}</h4>
-    <p>{2}</p>
-    </div>
    ' - .format(directive_to_alert[directive], directive.capitalize(), - match.group(1).strip())) - - -def rst2md(text): - """Converts the RST text from the examples docstrigs and comments - into markdown text for the Jupyter notebooks""" - - top_heading = re.compile(r'^=+$\s^([\w\s-]+)^=+$', flags=re.M) - text = re.sub(top_heading, r'# \1', text) - - math_eq = re.compile(r'^\.\. math::((?:.+)?(?:\n+^ .+)*)', flags=re.M) - text = re.sub(math_eq, - lambda match: r'\begin{{align}}{0}\end{{align}}'.format( - match.group(1).strip()), - text) - inline_math = re.compile(r':math:`(.+?)`', re.DOTALL) - text = re.sub(inline_math, r'$\1$', text) - - directives = ('warning', 'note') - for directive in directives: - directive_re = re.compile(r'^\.\. %s::((?:.+)?(?:\n+^ .+)*)' - % directive, flags=re.M) - text = re.sub(directive_re, - partial(directive_fun, directive=directive), text) - - links = re.compile(r'^ *\.\. _.*:.*$\n', flags=re.M) - text = re.sub(links, '', text) - - refs = re.compile(r':ref:`') - text = re.sub(refs, '`', text) - - contents = re.compile(r'^\s*\.\. contents::.*$(\n +:\S+: *$)*\n', - flags=re.M) - text = re.sub(contents, '', text) - - images = re.compile( - r'^\.\. image::(.*$)(?:\n *:alt:(.*$)\n)?(?: +:\S+:.*$\n)*', - flags=re.M) - text = re.sub( - images, lambda match: '![{1}]({0})\n'.format( - match.group(1).strip(), (match.group(2) or '').strip()), text) - - return text - - -def jupyter_notebook(script_blocks): - """Generate a Jupyter notebook file cell-by-cell - - Parameters - ---------- - script_blocks: list - script execution cells - """ - - work_notebook = jupyter_notebook_skeleton() - add_code_cell(work_notebook, "%matplotlib inline") - fill_notebook(work_notebook, script_blocks) - - return work_notebook - - -def add_code_cell(work_notebook, code): - """Add a code cell to the notebook - - Parameters - ---------- - code : str - Cell content - """ - - code_cell = { - "cell_type": "code", - "execution_count": None, - "metadata": {"collapsed": False}, - "outputs": [], - "source": [code.strip()] - } - work_notebook["cells"].append(code_cell) - - -def add_markdown_cell(work_notebook, text): - """Add a markdown cell to the notebook - - Parameters - ---------- - code : str - Cell content - """ - markdown_cell = { - "cell_type": "markdown", - "metadata": {}, - "source": [rst2md(text)] - } - work_notebook["cells"].append(markdown_cell) - - -def fill_notebook(work_notebook, script_blocks): - """Writes the Jupyter notebook cells - - Parameters - ---------- - script_blocks : list of tuples - """ - - for blabel, bcontent in script_blocks: - if blabel == 'code': - add_code_cell(work_notebook, bcontent) - else: - add_markdown_cell(work_notebook, bcontent + '\n') - - -def save_notebook(work_notebook, write_file): - """Saves the Jupyter work_notebook to write_file""" - with open(write_file, 'w') as out_nb: - json.dump(work_notebook, out_nb, indent=2) - - -############################################################################### -# Notebook shell utility - -def python_to_jupyter_cli(args=None, namespace=None): - """Exposes the jupyter notebook renderer to the command line - - Takes the same arguments as ArgumentParser.parse_args - """ - parser = argparse.ArgumentParser( - description='Sphinx-Gallery Notebook converter') - parser.add_argument('python_src_file', nargs='+', - help='Input Python file script to convert. ' - 'Supports multiple files and shell wildcards' - ' (e.g. 
*.py)') - args = parser.parse_args(args, namespace) - - for src_file in args.python_src_file: - blocks = split_code_and_text_blocks(src_file) - print('Converting {0}'.format(src_file)) - example_nb = jupyter_notebook(blocks) - save_notebook(example_nb, src_file.replace('.py', '.ipynb')) diff --git a/doc/sphinxext/sphinx_gallery/py_source_parser.py b/doc/sphinxext/sphinx_gallery/py_source_parser.py deleted file mode 100644 index d397087f99fbd..0000000000000 --- a/doc/sphinxext/sphinx_gallery/py_source_parser.py +++ /dev/null @@ -1,99 +0,0 @@ -# -*- coding: utf-8 -*- -r""" -Parser for python source files -============================== -""" -# Created Sun Nov 27 14:03:07 2016 -# Author: Óscar Nájera - -from __future__ import division, absolute_import, print_function -import ast -import re -from textwrap import dedent - -SYNTAX_ERROR_DOCSTRING = """ -SyntaxError -=========== - -Example script with invalid Python syntax -""" - - -def get_docstring_and_rest(filename): - """Separate `filename` content between docstring and the rest - - Strongly inspired from ast.get_docstring. - - Returns - ------- - docstring: str - docstring of `filename` - rest: str - `filename` content without the docstring - """ - # can't use codecs.open(filename, 'r', 'utf-8') here b/c ast doesn't - # seem to work with unicode strings in Python2.7 - # "SyntaxError: encoding declaration in Unicode string" - with open(filename, 'rb') as fid: - content = fid.read() - # change from Windows format to UNIX for uniformity - content = content.replace(b'\r\n', b'\n') - - try: - node = ast.parse(content) - except SyntaxError: - return SYNTAX_ERROR_DOCSTRING, content.decode('utf-8') - - if not isinstance(node, ast.Module): - raise TypeError("This function only supports modules. " - "You provided {0}".format(node.__class__.__name__)) - if node.body and isinstance(node.body[0], ast.Expr) and \ - isinstance(node.body[0].value, ast.Str): - docstring_node = node.body[0] - docstring = docstring_node.value.s - if hasattr(docstring, 'decode'): # python2.7 - docstring = docstring.decode('utf-8') - # This get the content of the file after the docstring last line - # Note: 'maxsplit' argument is not a keyword argument in python2 - rest = content.decode('utf-8').split('\n', docstring_node.lineno)[-1] - return docstring, rest - else: - raise ValueError(('Could not find docstring in file "{0}". ' - 'A docstring is required by sphinx-gallery') - .format(filename)) - - -def split_code_and_text_blocks(source_file): - """Return list with source file separated into code and text blocks. - - Returns - ------- - blocks : list of (label, content) - List where each element is a tuple with the label ('text' or 'code'), - and content string of block. 
- """ - docstring, rest_of_content = get_docstring_and_rest(source_file) - blocks = [('text', docstring)] - - pattern = re.compile( - r'(?P^#{20,}.*)\s(?P(?:^#.*\s)*)', - flags=re.M) - - pos_so_far = 0 - for match in re.finditer(pattern, rest_of_content): - match_start_pos, match_end_pos = match.span() - code_block_content = rest_of_content[pos_so_far:match_start_pos] - text_content = match.group('text_content') - sub_pat = re.compile('^#', flags=re.M) - text_block_content = dedent(re.sub(sub_pat, '', text_content)).lstrip() - if code_block_content.strip(): - blocks.append(('code', code_block_content)) - if text_block_content.strip(): - blocks.append(('text', text_block_content)) - pos_so_far = match_end_pos - - remaining_content = rest_of_content[pos_so_far:] - if remaining_content.strip(): - blocks.append(('code', remaining_content)) - - return blocks diff --git a/doc/whats_new.rst b/doc/whats_new.rst index e5159054c8153..65b47a42289e4 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -1,8 +1,8 @@ .. currentmodule:: sklearn - - +.. include:: includes/big_toc_css.rst +.. include:: whats_new/_contributors.rst =============== -Release history +Release History =============== Version 0.20 (under development) @@ -5756,3 +5756,19 @@ David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. .. _Neeraj Gangwar: http://neerajgangwar.in .. _Arthur Mensch: https://amensch.fr +.. include:: whats_new/v0.20.rst +.. include:: whats_new/v0.19.rst + +================= +Previous Releases +================= +.. toctree:: + :maxdepth: 1 + + Version 0.18 + Version 0.17 + Version 0.16 + Version 0.15 + Version 0.14 + Version 0.13 + Older Versions diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst new file mode 100644 index 0000000000000..dfbc319da88f4 --- /dev/null +++ b/doc/whats_new/_contributors.rst @@ -0,0 +1,143 @@ +.. _Olivier Grisel: https://twitter.com/ogrisel + +.. _Gael Varoquaux: http://gael-varoquaux.info + +.. _Alexandre Gramfort: http://alexandre.gramfort.net + +.. _Fabian Pedregosa: http://fa.bianp.net + +.. _Mathieu Blondel: http://www.mblondel.org + +.. _James Bergstra: http://www-etud.iro.umontreal.ca/~bergstrj/ + +.. _liblinear: http://www.csie.ntu.edu.tw/~cjlin/liblinear/ + +.. _Yaroslav Halchenko: http://www.onerussian.com/ + +.. _Vlad Niculae: http://vene.ro + +.. _Edouard Duchesnay: https://sites.google.com/site/duchesnay/home + +.. _Peter Prettenhofer: https://sites.google.com/site/peterprettenhofer/ + +.. _Alexandre Passos: http://atpassos.me + +.. _Nicolas Pinto: https://twitter.com/npinto + +.. _Bertrand Thirion: https://team.inria.fr/parietal/bertrand-thirions-page + +.. _Andreas Müller: http://peekaboo-vision.blogspot.com + +.. _Matthieu Perrot: http://brainvisa.info/biblio/lnao/en/Author/PERROT-M.html + +.. _Jake Vanderplas: http://staff.washington.edu/jakevdp/ + +.. _Gilles Louppe: http://www.montefiore.ulg.ac.be/~glouppe/ + +.. _INRIA: http://www.inria.fr + +.. _Parietal Team: http://parietal.saclay.inria.fr/ + +.. _David Warde-Farley: http://www-etud.iro.umontreal.ca/~wardefar/ + +.. _Brian Holt: http://personal.ee.surrey.ac.uk/Personal/B.Holt + +.. _Satrajit Ghosh: http://www.mit.edu/~satra/ + +.. _Robert Layton: https://twitter.com/robertlayton + +.. _Scott White: https://twitter.com/scottblanc + +.. _David Marek: http://www.davidmarek.cz/ + +.. _Christian Osendorfer: https://osdf.github.io + +.. _Arnaud Joly: http://www.ajoly.org + +.. _Rob Zinkov: http://zinkov.com + +.. _Joel Nothman: http://joelnothman.com + +.. 
_Nicolas Trésegnie : http://nicolastr.com/ + +.. _Kemal Eren: http://www.kemaleren.com + +.. _Yann Dauphin: http://ynd.github.io/ + +.. _Yannick Schwartz: https://team.inria.fr/parietal/schwarty/ + +.. _Kyle Kastner: http://kastnerkyle.github.io + +.. _Daniel Nouri: http://danielnouri.org + +.. _Manoj Kumar: https://manojbits.wordpress.com + +.. _Luis Pedro Coelho: http://luispedro.org + +.. _Fares Hedyati: http://www.eecs.berkeley.edu/~fareshed + +.. _Antony Lee: https://www.ocf.berkeley.edu/~antonyl/ + +.. _Martin Billinger: http://tnsre.embs.org/author/martinbillinger + +.. _Matteo Visconti di Oleggio Castello: http://www.mvdoc.me + +.. _Trevor Stephens: http://trevorstephens.com/ + +.. _Jan Hendrik Metzen: https://jmetzen.github.io/ + +.. _Will Dawson: http://www.dawsonresearch.com + +.. _Andrew Tulloch: http://tullo.ch/ + +.. _Hanna Wallach: http://dirichlet.net/ + +.. _Yan Yi: http://seowyanyi.org + +.. _Hervé Bredin: http://herve.niderb.fr/ + +.. _Eric Martin: http://www.ericmart.in + +.. _Nicolas Goix: https://perso.telecom-paristech.fr/~goix/ + +.. _Sebastian Raschka: http://sebastianraschka.com + +.. _Brian McFee: https://bmcfee.github.io + +.. _Valentin Stolbunov: http://www.vstolbunov.com + +.. _Jaques Grobler: https://github.com/jaquesgrobler + +.. _Lars Buitinck: https://github.com/larsmans + +.. _Loic Esteve: https://github.com/lesteve + +.. _Noel Dawe: https://github.com/ndawe + +.. _Raghav RV: https://github.com/raghavrv + +.. _Tom Dupre la Tour: https://github.com/TomDLT + +.. _Nelle Varoquaux: https://github.com/nellev + +.. _Bing Tian Dai: https://github.com/btdai + +.. _Dylan Werner-Meier: https://github.com/unautre + +.. _Alyssa Batula: https://github.com/abatula + +.. _Srivatsan Ramesh: https://github.com/srivatsan-ramesh + +.. _Ron Weiss: http://www.ee.columbia.edu/~ronw + +.. _Kathleen Chen: https://github.com/kchen17 + +.. _Vincent Pham: https://github.com/vincentpham1991 + +.. _Denis Engemann: http://denis-engemann.de + +.. _Anish Shah: https://github.com/AnishShah + +.. _Neeraj Gangwar: http://neerajgangwar.in + +.. _Arthur Mensch: https://amensch.fr diff --git a/doc/whats_new/older_versions.rst b/doc/whats_new/older_versions.rst new file mode 100644 index 0000000000000..eeb672914f033 --- /dev/null +++ b/doc/whats_new/older_versions.rst @@ -0,0 +1,1386 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_12.1: + +Version 0.12.1 +=============== + +**October 8, 2012** + +The 0.12.1 release is a bug-fix release with no additional features, but is +instead a set of bug fixes + +Changelog +---------- + +- Improved numerical stability in spectral embedding by `Gael + Varoquaux`_ + +- Doctest under windows 64bit by `Gael Varoquaux`_ + +- Documentation fixes for elastic net by `Andreas Müller`_ and + `Alexandre Gramfort`_ + +- Proper behavior with fortran-ordered NumPy arrays by `Gael Varoquaux`_ + +- Make GridSearchCV work with non-CSR sparse matrix by `Lars Buitinck`_ + +- Fix parallel computing in MDS by `Gael Varoquaux`_ + +- Fix Unicode support in count vectorizer by `Andreas Müller`_ + +- Fix MinCovDet breaking with X.shape = (3, 1) by :user:`Virgile Fritsch ` + +- Fix clone of SGD objects by `Peter Prettenhofer`_ + +- Stabilize GMM by :user:`Virgile Fritsch ` + +People +------ + + * 14 `Peter Prettenhofer`_ + * 12 `Gael Varoquaux`_ + * 10 `Andreas Müller`_ + * 5 `Lars Buitinck`_ + * 3 :user:`Virgile Fritsch ` + * 1 `Alexandre Gramfort`_ + * 1 `Gilles Louppe`_ + * 1 `Mathieu Blondel`_ + +.. 
_changes_0_12: + +Version 0.12 +============ + +**September 4, 2012** + +Changelog +--------- + +- Various speed improvements of the :ref:`decision trees ` module, by + `Gilles Louppe`_. + +- :class:`ensemble.GradientBoostingRegressor` and + :class:`ensemble.GradientBoostingClassifier` now support feature subsampling + via the ``max_features`` argument, by `Peter Prettenhofer`_. + +- Added Huber and Quantile loss functions to + :class:`ensemble.GradientBoostingRegressor`, by `Peter Prettenhofer`_. + +- :ref:`Decision trees ` and :ref:`forests of randomized trees ` + now support multi-output classification and regression problems, by + `Gilles Louppe`_. + +- Added :class:`preprocessing.LabelEncoder`, a simple utility class to + normalize labels or transform non-numerical labels, by `Mathieu Blondel`_. + +- Added the epsilon-insensitive loss and the ability to make probabilistic + predictions with the modified huber loss in :ref:`sgd`, by + `Mathieu Blondel`_. + +- Added :ref:`multidimensional_scaling`, by Nelle Varoquaux. + +- SVMlight file format loader now detects compressed (gzip/bzip2) files and + decompresses them on the fly, by `Lars Buitinck`_. + +- SVMlight file format serializer now preserves double precision floating + point values, by `Olivier Grisel`_. + +- A common testing framework for all estimators was added, by `Andreas Müller`_. + +- Understandable error messages for estimators that do not accept + sparse input by `Gael Varoquaux`_ + +- Speedups in hierarchical clustering by `Gael Varoquaux`_. In + particular building the tree now supports early stopping. This is + useful when the number of clusters is not small compared to the + number of samples. + +- Add MultiTaskLasso and MultiTaskElasticNet for joint feature selection, + by `Alexandre Gramfort`_. + +- Added :func:`metrics.auc_score` and + :func:`metrics.average_precision_score` convenience functions by `Andreas + Müller`_. + +- Improved sparse matrix support in the :ref:`feature_selection` + module by `Andreas Müller`_. + +- New word boundaries-aware character n-gram analyzer for the + :ref:`text_feature_extraction` module by :user:`@kernc `. + +- Fixed bug in spectral clustering that led to single point clusters + by `Andreas Müller`_. + +- In :class:`feature_extraction.text.CountVectorizer`, added an option to + ignore infrequent words, ``min_df`` by `Andreas Müller`_. + +- Add support for multiple targets in some linear models (ElasticNet, Lasso + and OrthogonalMatchingPursuit) by `Vlad Niculae`_ and + `Alexandre Gramfort`_. + +- Fixes in :class:`decomposition.ProbabilisticPCA` score function by Wei Li. + +- Fixed feature importance computation in + :ref:`gradient_boosting`. + +API changes summary +------------------- + +- The old ``scikits.learn`` package has disappeared; all code should import + from ``sklearn`` instead, which was introduced in 0.9. + +- In :func:`metrics.roc_curve`, the ``thresholds`` array is now returned + with it's order reversed, in order to keep it consistent with the order + of the returned ``fpr`` and ``tpr``. + +- In :class:`hmm` objects, like :class:`hmm.GaussianHMM`, + :class:`hmm.MultinomialHMM`, etc., all parameters must be passed to the + object when initialising it and not through ``fit``. Now ``fit`` will + only accept the data as an input parameter. + +- For all SVM classes, a faulty behavior of ``gamma`` was fixed. Previously, + the default gamma value was only computed the first time ``fit`` was called + and then stored. It is now recalculated on every call to ``fit``. 
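+
+As a usage note for the new :class:`preprocessing.LabelEncoder` added in this
+release, here is a minimal sketch (it assumes present-day ``sklearn`` import
+paths; the toy city labels are purely illustrative)::
+
+    from sklearn.preprocessing import LabelEncoder
+
+    le = LabelEncoder()
+    codes = le.fit_transform(["paris", "tokyo", "paris", "amsterdam"])
+    # classes_ holds the sorted unique labels; transform maps them to 0..n-1
+    print(le.classes_)                   # ['amsterdam' 'paris' 'tokyo']
+    print(codes)                         # [1 2 1 0]
+    print(le.inverse_transform([0, 2]))  # ['amsterdam' 'tokyo']
+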
+ +- All ``Base`` classes are now abstract meta classes so that they can not be + instantiated. + +- :func:`cluster.ward_tree` now also returns the parent array. This is + necessary for early-stopping in which case the tree is not + completely built. + +- In :class:`feature_extraction.text.CountVectorizer` the parameters + ``min_n`` and ``max_n`` were joined to the parameter ``n_gram_range`` to + enable grid-searching both at once. + +- In :class:`feature_extraction.text.CountVectorizer`, words that appear + only in one document are now ignored by default. To reproduce + the previous behavior, set ``min_df=1``. + +- Fixed API inconsistency: :meth:`linear_model.SGDClassifier.predict_proba` now + returns 2d array when fit on two classes. + +- Fixed API inconsistency: :meth:`discriminant_analysis.QuadraticDiscriminantAnalysis.decision_function` + and :meth:`discriminant_analysis.LinearDiscriminantAnalysis.decision_function` now return 1d arrays + when fit on two classes. + +- Grid of alphas used for fitting :class:`linear_model.LassoCV` and + :class:`linear_model.ElasticNetCV` is now stored + in the attribute ``alphas_`` rather than overriding the init parameter + ``alphas``. + +- Linear models when alpha is estimated by cross-validation store + the estimated value in the ``alpha_`` attribute rather than just + ``alpha`` or ``best_alpha``. + +- :class:`ensemble.GradientBoostingClassifier` now supports + :meth:`ensemble.GradientBoostingClassifier.staged_predict_proba`, and + :meth:`ensemble.GradientBoostingClassifier.staged_predict`. + +- :class:`svm.sparse.SVC` and other sparse SVM classes are now deprecated. + The all classes in the :ref:`svm` module now automatically select the + sparse or dense representation base on the input. + +- All clustering algorithms now interpret the array ``X`` given to ``fit`` as + input data, in particular :class:`cluster.SpectralClustering` and + :class:`cluster.AffinityPropagation` which previously expected affinity matrices. + +- For clustering algorithms that take the desired number of clusters as a parameter, + this parameter is now called ``n_clusters``. + + +People +------ + * 267 `Andreas Müller`_ + * 94 `Gilles Louppe`_ + * 89 `Gael Varoquaux`_ + * 79 `Peter Prettenhofer`_ + * 60 `Mathieu Blondel`_ + * 57 `Alexandre Gramfort`_ + * 52 `Vlad Niculae`_ + * 45 `Lars Buitinck`_ + * 44 Nelle Varoquaux + * 37 `Jaques Grobler`_ + * 30 Alexis Mignon + * 30 Immanuel Bayer + * 27 `Olivier Grisel`_ + * 16 Subhodeep Moitra + * 13 Yannick Schwartz + * 12 :user:`@kernc ` + * 11 :user:`Virgile Fritsch ` + * 9 Daniel Duckworth + * 9 `Fabian Pedregosa`_ + * 9 `Robert Layton`_ + * 8 John Benediktsson + * 7 Marko Burjek + * 5 `Nicolas Pinto`_ + * 4 Alexandre Abraham + * 4 `Jake Vanderplas`_ + * 3 `Brian Holt`_ + * 3 `Edouard Duchesnay`_ + * 3 Florian Hoenig + * 3 flyingimmidev + * 2 Francois Savard + * 2 Hannes Schulz + * 2 Peter Welinder + * 2 `Yaroslav Halchenko`_ + * 2 Wei Li + * 1 Alex Companioni + * 1 Brandyn A. White + * 1 Bussonnier Matthias + * 1 Charles-Pierre Astolfi + * 1 Dan O'Huiginn + * 1 David Cournapeau + * 1 Keith Goodman + * 1 Ludwig Schwardt + * 1 Olivier Hervieu + * 1 Sergio Medina + * 1 Shiqiao Du + * 1 Tim Sheerman-Chase + * 1 buguen + + + +.. _changes_0_11: + +Version 0.11 +============ + +**May 7, 2012** + +Changelog +--------- + +Highlights +............. + +- Gradient boosted regression trees (:ref:`gradient_boosting`) + for classification and regression by `Peter Prettenhofer`_ + and `Scott White`_ . 
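+
+A minimal sketch of the gradient boosting estimators highlighted above,
+exercising the ``max_features`` subsampling and staged prediction support
+mentioned in the 0.12 notes (the synthetic data and parameter values are
+illustrative, not recommendations)::
+
+    from sklearn.datasets import make_classification
+    from sklearn.ensemble import GradientBoostingClassifier
+
+    X, y = make_classification(n_samples=200, random_state=0)
+    # a float max_features subsamples that fraction of features per split
+    clf = GradientBoostingClassifier(n_estimators=50, max_features=0.5,
+                                     random_state=0).fit(X, y)
+    # staged_predict yields the prediction after each boosting stage, so the
+    # effect of adding trees can be inspected without refitting
+    for stage, y_stage in enumerate(clf.staged_predict(X), start=1):
+        if stage % 10 == 0:
+            print(stage, (y_stage == y).mean())
+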
+ +- Simple dict-based feature loader with support for categorical variables + (:class:`feature_extraction.DictVectorizer`) by `Lars Buitinck`_. + +- Added Matthews correlation coefficient (:func:`metrics.matthews_corrcoef`) + and added macro and micro average options to + :func:`metrics.precision_score`, :func:`metrics.recall_score` and + :func:`metrics.f1_score` by `Satrajit Ghosh`_. + +- :ref:`out_of_bag` of generalization error for :ref:`ensemble` + by `Andreas Müller`_. + +- Randomized sparse linear models for feature + selection, by `Alexandre Gramfort`_ and `Gael Varoquaux`_ + +- :ref:`label_propagation` for semi-supervised learning, by Clay + Woolam. **Note** the semi-supervised API is still work in progress, + and may change. + +- Added BIC/AIC model selection to classical :ref:`gmm` and unified + the API with the remainder of scikit-learn, by `Bertrand Thirion`_ + +- Added :class:`sklearn.cross_validation.StratifiedShuffleSplit`, which is + a :class:`sklearn.cross_validation.ShuffleSplit` with balanced splits, + by Yannick Schwartz. + +- :class:`sklearn.neighbors.NearestCentroid` classifier added, along with a + ``shrink_threshold`` parameter, which implements **shrunken centroid + classification**, by `Robert Layton`_. + +Other changes +.............. + +- Merged dense and sparse implementations of :ref:`sgd` module and + exposed utility extension types for sequential + datasets ``seq_dataset`` and weight vectors ``weight_vector`` + by `Peter Prettenhofer`_. + +- Added ``partial_fit`` (support for online/minibatch learning) and + warm_start to the :ref:`sgd` module by `Mathieu Blondel`_. + +- Dense and sparse implementations of :ref:`svm` classes and + :class:`linear_model.LogisticRegression` merged by `Lars Buitinck`_. + +- Regressors can now be used as base estimator in the :ref:`multiclass` + module by `Mathieu Blondel`_. + +- Added n_jobs option to :func:`metrics.pairwise.pairwise_distances` + and :func:`metrics.pairwise.pairwise_kernels` for parallel computation, + by `Mathieu Blondel`_. + +- :ref:`k_means` can now be run in parallel, using the ``n_jobs`` argument + to either :ref:`k_means` or :class:`KMeans`, by `Robert Layton`_. + +- Improved :ref:`cross_validation` and :ref:`grid_search` documentation + and introduced the new :func:`cross_validation.train_test_split` + helper function by `Olivier Grisel`_ + +- :class:`svm.SVC` members ``coef_`` and ``intercept_`` changed sign for + consistency with ``decision_function``; for ``kernel==linear``, + ``coef_`` was fixed in the one-vs-one case, by `Andreas Müller`_. + +- Performance improvements to efficient leave-one-out cross-validated + Ridge regression, esp. for the ``n_samples > n_features`` case, in + :class:`linear_model.RidgeCV`, by Reuben Fletcher-Costin. + +- Refactoring and simplification of the :ref:`text_feature_extraction` + API and fixed a bug that caused possible negative IDF, + by `Olivier Grisel`_. + +- Beam pruning option in :class:`_BaseHMM` module has been removed since it + is difficult to Cythonize. If you are interested in contributing a Cython + version, you can use the python version in the git history as a reference. + +- Classes in :ref:`neighbors` now support arbitrary Minkowski metric for + nearest neighbors searches. The metric can be specified by argument ``p``. + +API changes summary +------------------- + +- :class:`covariance.EllipticEnvelop` is now deprecated - Please use :class:`covariance.EllipticEnvelope` + instead. 
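+
+To illustrate the dict-based feature loader highlighted above, a minimal
+sketch (the measurement dicts are illustrative; the behaviour shown is that
+of current ``sklearn`` releases)::
+
+    from sklearn.feature_extraction import DictVectorizer
+
+    measurements = [
+        {"city": "Dubai", "temperature": 33.0},
+        {"city": "London", "temperature": 12.0},
+    ]
+    vec = DictVectorizer(sparse=False)
+    X = vec.fit_transform(measurements)
+    # string values are one-hot encoded, numeric values pass through
+    print(vec.feature_names_)  # ['city=Dubai', 'city=London', 'temperature']
+    print(X)                   # [[ 1.  0. 33.]
+                               #  [ 0.  1. 12.]]
+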
+ +- ``NeighborsClassifier`` and ``NeighborsRegressor`` are gone in the module + :ref:`neighbors`. Use the classes :class:`KNeighborsClassifier`, + :class:`RadiusNeighborsClassifier`, :class:`KNeighborsRegressor` + and/or :class:`RadiusNeighborsRegressor` instead. + +- Sparse classes in the :ref:`sgd` module are now deprecated. + +- In :class:`mixture.GMM`, :class:`mixture.DPGMM` and :class:`mixture.VBGMM`, + parameters must be passed to an object when initialising it and not through + ``fit``. Now ``fit`` will only accept the data as an input parameter. + +- methods ``rvs`` and ``decode`` in :class:`GMM` module are now deprecated. + ``sample`` and ``score`` or ``predict`` should be used instead. + +- attribute ``_scores`` and ``_pvalues`` in univariate feature selection + objects are now deprecated. + ``scores_`` or ``pvalues_`` should be used instead. + +- In :class:`LogisticRegression`, :class:`LinearSVC`, :class:`SVC` and + :class:`NuSVC`, the ``class_weight`` parameter is now an initialization + parameter, not a parameter to fit. This makes grid searches + over this parameter possible. + +- LFW ``data`` is now always shape ``(n_samples, n_features)`` to be + consistent with the Olivetti faces dataset. Use ``images`` and + ``pairs`` attribute to access the natural images shapes instead. + +- In :class:`svm.LinearSVC`, the meaning of the ``multi_class`` parameter + changed. Options now are ``'ovr'`` and ``'crammer_singer'``, with + ``'ovr'`` being the default. This does not change the default behavior + but hopefully is less confusing. + +- Class :class:`feature_selection.text.Vectorizer` is deprecated and + replaced by :class:`feature_selection.text.TfidfVectorizer`. + +- The preprocessor / analyzer nested structure for text feature + extraction has been removed. All those features are + now directly passed as flat constructor arguments + to :class:`feature_selection.text.TfidfVectorizer` and + :class:`feature_selection.text.CountVectorizer`, in particular the + following parameters are now used: + +- ``analyzer`` can be ``'word'`` or ``'char'`` to switch the default + analysis scheme, or use a specific python callable (as previously). + +- ``tokenizer`` and ``preprocessor`` have been introduced to make it + still possible to customize those steps with the new API. + +- ``input`` explicitly control how to interpret the sequence passed to + ``fit`` and ``predict``: filenames, file objects or direct (byte or + Unicode) strings. + +- charset decoding is explicit and strict by default. + +- the ``vocabulary``, fitted or not is now stored in the + ``vocabulary_`` attribute to be consistent with the project + conventions. + +- Class :class:`feature_selection.text.TfidfVectorizer` now derives directly + from :class:`feature_selection.text.CountVectorizer` to make grid + search trivial. + +- methods ``rvs`` in :class:`_BaseHMM` module are now deprecated. + ``sample`` should be used instead. + +- Beam pruning option in :class:`_BaseHMM` module is removed since it is + difficult to be Cythonized. If you are interested, you can look in the + history codes by git. + +- The SVMlight format loader now supports files with both zero-based and + one-based column indices, since both occur "in the wild". + +- Arguments in class :class:`ShuffleSplit` are now consistent with + :class:`StratifiedShuffleSplit`. Arguments ``test_fraction`` and + ``train_fraction`` are deprecated and renamed to ``test_size`` and + ``train_size`` and can accept both ``float`` and ``int``. 
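+
+A short sketch of the renamed ``test_size`` argument in action; note that it
+uses the modern ``sklearn.model_selection`` import path, whereas in the 0.11
+era the class lived in ``sklearn.cross_validation``::
+
+    import numpy as np
+    from sklearn.model_selection import ShuffleSplit
+
+    X = np.arange(12).reshape(6, 2)
+    # test_size / train_size accept a float fraction or an absolute int count
+    ss = ShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
+    for train_idx, test_idx in ss.split(X):
+        print("train:", train_idx, "test:", test_idx)
+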
+ +- Arguments in class :class:`Bootstrap` are now consistent with + :class:`StratifiedShuffleSplit`. Arguments ``n_test`` and + ``n_train`` are deprecated and renamed to ``test_size`` and + ``train_size`` and can accept both ``float`` and ``int``. + +- Argument ``p`` added to classes in :ref:`neighbors` to specify an + arbitrary Minkowski metric for nearest neighbors searches. + + +People +------ + * 282 `Andreas Müller`_ + * 239 `Peter Prettenhofer`_ + * 198 `Gael Varoquaux`_ + * 129 `Olivier Grisel`_ + * 114 `Mathieu Blondel`_ + * 103 Clay Woolam + * 96 `Lars Buitinck`_ + * 88 `Jaques Grobler`_ + * 82 `Alexandre Gramfort`_ + * 50 `Bertrand Thirion`_ + * 42 `Robert Layton`_ + * 28 flyingimmidev + * 26 `Jake Vanderplas`_ + * 26 Shiqiao Du + * 21 `Satrajit Ghosh`_ + * 17 `David Marek`_ + * 17 `Gilles Louppe`_ + * 14 `Vlad Niculae`_ + * 11 Yannick Schwartz + * 10 `Fabian Pedregosa`_ + * 9 fcostin + * 7 Nick Wilson + * 5 Adrien Gaidon + * 5 `Nicolas Pinto`_ + * 4 `David Warde-Farley`_ + * 5 Nelle Varoquaux + * 5 Emmanuelle Gouillart + * 3 Joonas Sillanpää + * 3 Paolo Losi + * 2 Charles McCarthy + * 2 Roy Hyunjin Han + * 2 Scott White + * 2 ibayer + * 1 Brandyn White + * 1 Carlos Scheidegger + * 1 Claire Revillet + * 1 Conrad Lee + * 1 `Edouard Duchesnay`_ + * 1 Jan Hendrik Metzen + * 1 Meng Xinfan + * 1 `Rob Zinkov`_ + * 1 Shiqiao + * 1 Udi Weinsberg + * 1 Virgile Fritsch + * 1 Xinfan Meng + * 1 Yaroslav Halchenko + * 1 jansoe + * 1 Leon Palafox + + +.. _changes_0_10: + +Version 0.10 +============ + +**January 11, 2012** + +Changelog +--------- + +- Python 2.5 compatibility was dropped; the minimum Python version needed + to use scikit-learn is now 2.6. + +- :ref:`sparse_inverse_covariance` estimation using the graph Lasso, with + associated cross-validated estimator, by `Gael Varoquaux`_ + +- New :ref:`Tree ` module by `Brian Holt`_, `Peter Prettenhofer`_, + `Satrajit Ghosh`_ and `Gilles Louppe`_. The module comes with complete + documentation and examples. + +- Fixed a bug in the RFE module by `Gilles Louppe`_ (issue #378). + +- Fixed a memory leak in :ref:`svm` module by `Brian Holt`_ (issue #367). + +- Faster tests by `Fabian Pedregosa`_ and others. + +- Silhouette Coefficient cluster analysis evaluation metric added as + :func:`sklearn.metrics.silhouette_score` by Robert Layton. + +- Fixed a bug in :ref:`k_means` in the handling of the ``n_init`` parameter: + the clustering algorithm used to be run ``n_init`` times but the last + solution was retained instead of the best solution by `Olivier Grisel`_. + +- Minor refactoring in :ref:`sgd` module; consolidated dense and sparse + predict methods; Enhanced test time performance by converting model + parameters to fortran-style arrays after fitting (only multi-class). + +- Adjusted Mutual Information metric added as + :func:`sklearn.metrics.adjusted_mutual_info_score` by Robert Layton. + +- Models like SVC/SVR/LinearSVC/LogisticRegression from libsvm/liblinear + now support scaling of C regularization parameter by the number of + samples by `Alexandre Gramfort`_. + +- New :ref:`Ensemble Methods ` module by `Gilles Louppe`_ and + `Brian Holt`_. The module comes with the random forest algorithm and the + extra-trees method, along with documentation and examples. + +- :ref:`outlier_detection`: outlier and novelty detection, by + :user:`Virgile Fritsch `. + +- :ref:`kernel_approximation`: a transform implementing kernel + approximation for fast SGD on non-linear kernels by + `Andreas Müller`_. 
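+
+A minimal sketch of the kernel approximation workflow described above,
+pairing the transform with :ref:`sgd` (the dataset and the ``gamma`` /
+``n_components`` values are illustrative)::
+
+    from sklearn.datasets import make_moons
+    from sklearn.kernel_approximation import RBFSampler
+    from sklearn.linear_model import SGDClassifier
+
+    X, y = make_moons(n_samples=200, noise=0.1, random_state=0)
+    # approximate an RBF kernel feature map, then fit a fast linear model
+    rbf = RBFSampler(gamma=2.0, n_components=100, random_state=0)
+    X_features = rbf.fit_transform(X)
+    clf = SGDClassifier(max_iter=1000, random_state=0).fit(X_features, y)
+    print("train accuracy: %.2f" % clf.score(X_features, y))
+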
+ +- Fixed a bug due to atom swapping in :ref:`OMP` by `Vlad Niculae`_. + +- :ref:`SparseCoder` by `Vlad Niculae`_. + +- :ref:`mini_batch_kmeans` performance improvements by `Olivier Grisel`_. + +- :ref:`k_means` support for sparse matrices by `Mathieu Blondel`_. + +- Improved documentation for developers and for the :mod:`sklearn.utils` + module, by `Jake Vanderplas`_. + +- Vectorized 20newsgroups dataset loader + (:func:`sklearn.datasets.fetch_20newsgroups_vectorized`) by + `Mathieu Blondel`_. + +- :ref:`multiclass` by `Lars Buitinck`_. + +- Utilities for fast computation of mean and variance for sparse matrices + by `Mathieu Blondel`_. + +- Make :func:`sklearn.preprocessing.scale` and + :class:`sklearn.preprocessing.Scaler` work on sparse matrices by + `Olivier Grisel`_ + +- Feature importances using decision trees and/or forest of trees, + by `Gilles Louppe`_. + +- Parallel implementation of forests of randomized trees by + `Gilles Louppe`_. + +- :class:`sklearn.cross_validation.ShuffleSplit` can subsample the train + sets as well as the test sets by `Olivier Grisel`_. + +- Errors in the build of the documentation fixed by `Andreas Müller`_. + + +API changes summary +------------------- + +Here are the code migration instructions when upgrading from scikit-learn +version 0.9: + +- Some estimators that may overwrite their inputs to save memory previously + had ``overwrite_`` parameters; these have been replaced with ``copy_`` + parameters with exactly the opposite meaning. + + This particularly affects some of the estimators in :mod:`linear_model`. + The default behavior is still to copy everything passed in. + +- The SVMlight dataset loader :func:`sklearn.datasets.load_svmlight_file` no + longer supports loading two files at once; use ``load_svmlight_files`` + instead. Also, the (unused) ``buffer_mb`` parameter is gone. + +- Sparse estimators in the :ref:`sgd` module use dense parameter vector + ``coef_`` instead of ``sparse_coef_``. This significantly improves + test time performance. + +- The :ref:`covariance` module now has a robust estimator of + covariance, the Minimum Covariance Determinant estimator. + +- Cluster evaluation metrics in :mod:`metrics.cluster` have been refactored + but the changes are backwards compatible. They have been moved to the + :mod:`metrics.cluster.supervised`, along with + :mod:`metrics.cluster.unsupervised` which contains the Silhouette + Coefficient. + +- The ``permutation_test_score`` function now behaves the same way as + ``cross_val_score`` (i.e. uses the mean score across the folds.) + +- Cross Validation generators now use integer indices (``indices=True``) + by default instead of boolean masks. This make it more intuitive to + use with sparse matrix data. + +- The functions used for sparse coding, ``sparse_encode`` and + ``sparse_encode_parallel`` have been combined into + :func:`sklearn.decomposition.sparse_encode`, and the shapes of the arrays + have been transposed for consistency with the matrix factorization setting, + as opposed to the regression setting. + +- Fixed an off-by-one error in the SVMlight/LibSVM file format handling; + files generated using :func:`sklearn.datasets.dump_svmlight_file` should be + re-generated. (They should continue to work, but accidentally had one + extra column of zeros prepended.) + +- ``BaseDictionaryLearning`` class replaced by ``SparseCodingMixin``. 
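+
+As an illustration of the forest feature importances and parallel tree
+fitting added in 0.10, a minimal sketch on synthetic data (all parameter
+values are illustrative)::
+
+    from sklearn.datasets import make_classification
+    from sklearn.ensemble import RandomForestClassifier
+
+    X, y = make_classification(n_samples=300, n_features=8, n_informative=3,
+                               random_state=0)
+    # n_jobs fits the trees in parallel; importances are normalized to sum to 1
+    forest = RandomForestClassifier(n_estimators=100, n_jobs=2, random_state=0)
+    forest.fit(X, y)
+    print(forest.feature_importances_.round(3))
+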
+ +- :func:`sklearn.utils.extmath.fast_svd` has been renamed + :func:`sklearn.utils.extmath.randomized_svd` and the default + oversampling is now fixed to 10 additional random vectors instead + of doubling the number of components to extract. The new behavior + follows the reference paper. + + +People +------ + +The following people contributed to scikit-learn since last release: + + * 246 `Andreas Müller`_ + * 242 `Olivier Grisel`_ + * 220 `Gilles Louppe`_ + * 183 `Brian Holt`_ + * 166 `Gael Varoquaux`_ + * 144 `Lars Buitinck`_ + * 73 `Vlad Niculae`_ + * 65 `Peter Prettenhofer`_ + * 64 `Fabian Pedregosa`_ + * 60 Robert Layton + * 55 `Mathieu Blondel`_ + * 52 `Jake Vanderplas`_ + * 44 Noel Dawe + * 38 `Alexandre Gramfort`_ + * 24 :user:`Virgile Fritsch ` + * 23 `Satrajit Ghosh`_ + * 3 Jan Hendrik Metzen + * 3 Kenneth C. Arnold + * 3 Shiqiao Du + * 3 Tim Sheerman-Chase + * 3 `Yaroslav Halchenko`_ + * 2 Bala Subrahmanyam Varanasi + * 2 DraXus + * 2 Michael Eickenberg + * 1 Bogdan Trach + * 1 Félix-Antoine Fortin + * 1 Juan Manuel Caicedo Carvajal + * 1 Nelle Varoquaux + * 1 `Nicolas Pinto`_ + * 1 Tiziano Zito + * 1 Xinfan Meng + + + +.. _changes_0_9: + +Version 0.9 +=========== + +**September 21, 2011** + +scikit-learn 0.9 was released on September 2011, three months after the 0.8 +release and includes the new modules :ref:`manifold`, :ref:`dirichlet_process` +as well as several new algorithms and documentation improvements. + +This release also includes the dictionary-learning work developed by +`Vlad Niculae`_ as part of the `Google Summer of Code +`_ program. + + + +.. |banner1| image:: ../auto_examples/manifold/images/thumb/sphx_glr_plot_compare_methods_thumb.png + :target: ../auto_examples/manifold/plot_compare_methods.html + +.. |banner2| image:: ../auto_examples/linear_model/images/thumb/sphx_glr_plot_omp_thumb.png + :target: ../auto_examples/linear_model/plot_omp.html + +.. |banner3| image:: ../auto_examples/decomposition/images/thumb/sphx_glr_plot_kernel_pca_thumb.png + :target: ../auto_examples/decomposition/plot_kernel_pca.html + +.. |center-div| raw:: html + +
+    <div style="text-align: center">
+
+.. |end-div| raw:: html
+
+    </div>
    + + +|center-div| |banner2| |banner1| |banner3| |end-div| + +Changelog +--------- + +- New :ref:`manifold` module by `Jake Vanderplas`_ and + `Fabian Pedregosa`_. + +- New :ref:`Dirichlet Process ` Gaussian Mixture + Model by `Alexandre Passos`_ + +- :ref:`neighbors` module refactoring by `Jake Vanderplas`_ : + general refactoring, support for sparse matrices in input, speed and + documentation improvements. See the next section for a full list of API + changes. + +- Improvements on the :ref:`feature_selection` module by + `Gilles Louppe`_ : refactoring of the RFE classes, documentation + rewrite, increased efficiency and minor API changes. + +- :ref:`SparsePCA` by `Vlad Niculae`_, `Gael Varoquaux`_ and + `Alexandre Gramfort`_ + +- Printing an estimator now behaves independently of architectures + and Python version thanks to :user:`Jean Kossaifi `. + +- :ref:`Loader for libsvm/svmlight format ` by + `Mathieu Blondel`_ and `Lars Buitinck`_ + +- Documentation improvements: thumbnails in + example gallery by `Fabian Pedregosa`_. + +- Important bugfixes in :ref:`svm` module (segfaults, bad + performance) by `Fabian Pedregosa`_. + +- Added :ref:`multinomial_naive_bayes` and :ref:`bernoulli_naive_bayes` + by `Lars Buitinck`_ + +- Text feature extraction optimizations by Lars Buitinck + +- Chi-Square feature selection + (:func:`feature_selection.univariate_selection.chi2`) by `Lars Buitinck`_. + +- :ref:`sample_generators` module refactoring by `Gilles Louppe`_ + +- :ref:`multiclass` by `Mathieu Blondel`_ + +- Ball tree rewrite by `Jake Vanderplas`_ + +- Implementation of :ref:`dbscan` algorithm by Robert Layton + +- Kmeans predict and transform by Robert Layton + +- Preprocessing module refactoring by `Olivier Grisel`_ + +- Faster mean shift by Conrad Lee + +- New ``Bootstrap``, :ref:`ShuffleSplit` and various other + improvements in cross validation schemes by `Olivier Grisel`_ and + `Gael Varoquaux`_ + +- Adjusted Rand index and V-Measure clustering evaluation metrics by `Olivier Grisel`_ + +- Added :class:`Orthogonal Matching Pursuit ` by `Vlad Niculae`_ + +- Added 2D-patch extractor utilities in the :ref:`feature_extraction` module by `Vlad Niculae`_ + +- Implementation of :class:`linear_model.LassoLarsCV` + (cross-validated Lasso solver using the Lars algorithm) and + :class:`linear_model.LassoLarsIC` (BIC/AIC model + selection in Lars) by `Gael Varoquaux`_ + and `Alexandre Gramfort`_ + +- Scalability improvements to :func:`metrics.roc_curve` by Olivier Hervieu + +- Distance helper functions :func:`metrics.pairwise.pairwise_distances` + and :func:`metrics.pairwise.pairwise_kernels` by Robert Layton + +- :class:`Mini-Batch K-Means ` by Nelle Varoquaux and Peter Prettenhofer. + +- :ref:`mldata` utilities by Pietro Berkes. + +- :ref:`olivetti_faces` by `David Warde-Farley`_. + + +API changes summary +------------------- + +Here are the code migration instructions when upgrading from scikit-learn +version 0.8: + +- The ``scikits.learn`` package was renamed ``sklearn``. There is + still a ``scikits.learn`` package alias for backward compatibility. + + Third-party projects with a dependency on scikit-learn 0.9+ should + upgrade their codebase. 
+  For instance, under Linux / MacOSX just run
+  (make a backup first!)::
+
+    find -name "*.py" | xargs sed -i 's/\bscikits.learn\b/sklearn/g'
+
+- Estimators no longer accept model parameters as ``fit`` arguments:
+  instead all parameters must only be passed as constructor
+  arguments or using the now public ``set_params`` method inherited
+  from :class:`base.BaseEstimator`.
+
+  Some estimators can still accept keyword arguments on ``fit``,
+  but this is restricted to data-dependent values (e.g. a Gram matrix
+  or an affinity matrix that is precomputed from the ``X`` data matrix).
+
+- The ``cross_val`` package has been renamed to ``cross_validation``,
+  although there is also a ``cross_val`` package alias in place for
+  backward compatibility.
+
+  Third-party projects with a dependency on scikit-learn 0.9+ should
+  upgrade their codebase. For instance, under Linux / MacOSX just run
+  (make a backup first!)::
+
+    find -name "*.py" | xargs sed -i 's/\bcross_val\b/cross_validation/g'
+
+- The ``score_func`` argument of the
+  ``sklearn.cross_validation.cross_val_score`` function is now expected
+  to accept ``y_test`` and ``y_predicted`` as its only arguments for
+  classification and regression tasks, or ``X_test`` for unsupervised
+  estimators.
+
+- The ``gamma`` parameter for support vector machine algorithms is set
+  to ``1 / n_features`` by default, instead of ``1 / n_samples``.
+
+- The ``sklearn.hmm`` module has been marked as orphaned: it will be removed
+  from scikit-learn in version 0.11 unless someone steps up to
+  contribute documentation, examples and fix lurking numerical
+  stability issues.
+
+- ``sklearn.neighbors`` has been made into a submodule. The two previously
+  available estimators, ``NeighborsClassifier`` and ``NeighborsRegressor``,
+  have been marked as deprecated. Their functionality has been divided
+  among five new classes: ``NearestNeighbors`` for unsupervised neighbors
+  searches, ``KNeighborsClassifier`` & ``RadiusNeighborsClassifier``
+  for supervised classification problems, and ``KNeighborsRegressor``
+  & ``RadiusNeighborsRegressor`` for supervised regression problems.
+
+- ``sklearn.ball_tree.BallTree`` has been moved to
+  ``sklearn.neighbors.BallTree``. Using the former will generate a warning.
+
+- ``sklearn.linear_model.LARS()`` and related classes (LassoLARS,
+  LassoLARSCV, etc.) have been renamed to
+  ``sklearn.linear_model.Lars()``.
+
+- All distance metrics and kernels in ``sklearn.metrics.pairwise`` now have
+  a ``Y`` parameter, which by default is ``None``. If not given, the result
+  is the pairwise distance (or kernel similarity) between the samples in X.
+  If given, the result is the pairwise distance (or kernel similarity)
+  between samples in X and samples in Y.
+
+- ``sklearn.metrics.pairwise.l1_distance`` is now called ``manhattan_distance``,
+  and by default returns the pairwise distance. For the component-wise distance,
+  set the parameter ``sum_over_features`` to ``False``.
+
+Backward compatibility package aliases and other deprecated classes and
+functions will be removed in version 0.11.
+
+
+People
+------
+
+38 people contributed to this release.
+
+- 387 `Vlad Niculae`_
+- 320 `Olivier Grisel`_
+- 192 `Lars Buitinck`_
+- 179 `Gael Varoquaux`_
+- 168 `Fabian Pedregosa`_ (`INRIA`_, `Parietal Team`_)
+- 127 `Jake Vanderplas`_
+- 120 `Mathieu Blondel`_
+- 85 `Alexandre Passos`_
+- 67 `Alexandre Gramfort`_
+- 57 `Peter Prettenhofer`_
+- 56 `Gilles Louppe`_
+- 42 Robert Layton
+- 38 Nelle Varoquaux
+- 32 :user:`Jean Kossaifi `
+- 30 Conrad Lee
+- 22 Pietro Berkes
+- 18 andy
+- 17 David Warde-Farley
+- 12 Brian Holt
+- 11 Robert
+- 8 Amit Aides
+- 8 :user:`Virgile Fritsch `
+- 7 `Yaroslav Halchenko`_
+- 6 Salvatore Masecchia
+- 5 Paolo Losi
+- 4 Vincent Schut
+- 3 Alexis Metaireau
+- 3 Bryan Silverthorn
+- 3 `Andreas Müller`_
+- 2 Minwoo Jake Lee
+- 1 Emmanuelle Gouillart
+- 1 Keith Goodman
+- 1 Lucas Wiman
+- 1 `Nicolas Pinto`_
+- 1 Thouis (Ray) Jones
+- 1 Tim Sheerman-Chase
+
+
+.. _changes_0_8:
+
+Version 0.8
+===========
+
+**May 11, 2011**
+
+scikit-learn 0.8 was released in May 2011, one month after the first
+"international" `scikit-learn coding sprint `_ and is
+marked by the inclusion of important modules: :ref:`hierarchical_clustering`,
+:ref:`cross_decomposition`, :ref:`NMF`, initial support for Python 3 and by
+important enhancements and bug fixes.
+
+
+Changelog
+---------
+
+Several new modules were introduced during this release:
+
+- New :ref:`hierarchical_clustering` module by Vincent Michel,
+  `Bertrand Thirion`_, `Alexandre Gramfort`_ and `Gael Varoquaux`_.
+
+- :ref:`kernel_pca` implementation by `Mathieu Blondel`_.
+
+- :ref:`labeled_faces_in_the_wild` by `Olivier Grisel`_.
+
+- New :ref:`cross_decomposition` module by `Edouard Duchesnay`_.
+
+- :ref:`NMF` module by `Vlad Niculae`_.
+
+- Implementation of the :ref:`oracle_approximating_shrinkage` algorithm by
+  :user:`Virgile Fritsch ` in the :ref:`covariance` module.
+
+
+Some other modules benefited from significant improvements or cleanups.
+
+- Initial support for Python 3: builds and imports cleanly,
+  some modules are usable while others have failing tests, by `Fabian Pedregosa`_.
+
+- :class:`decomposition.PCA` is now usable from the Pipeline object by `Olivier Grisel`_.
+
+- Guide :ref:`performance-howto` by `Olivier Grisel`_.
+
+- Fixes for memory leaks in libsvm bindings, 64-bit safer BallTree by Lars Buitinck.
+
+- Bug and style fixes in the :ref:`k_means` algorithm by Jan Schlüter.
+
+- Added attribute ``converged`` to Gaussian Mixture Models by Vincent Schut.
+
+- Implemented ``transform``, ``predict_log_proba`` in
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis` by `Mathieu Blondel`_.
+
+- Refactoring in the :ref:`svm` module and bug fixes by `Fabian Pedregosa`_,
+  `Gael Varoquaux`_ and Amit Aides.
+
+- Refactored SGD module (removed code duplication, better variable naming),
+  added interface for sample weight by `Peter Prettenhofer`_.
+
+- Wrapped BallTree with Cython by Thouis (Ray) Jones.
+
+- Added function :func:`svm.l1_min_c` by Paolo Losi (see the example
+  after this list).
+
+- Typos, doc style, etc. by `Yaroslav Halchenko`_, `Gael Varoquaux`_,
+  `Olivier Grisel`_, Yann Malet, `Nicolas Pinto`_, Lars Buitinck and
+  `Fabian Pedregosa`_.
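+
+As an illustration of the new :func:`svm.l1_min_c` helper mentioned above,
+here is a minimal sketch (a binary iris subset for illustration; the
+variable names are ours, not part of the original release notes)::
+
+  from sklearn.datasets import load_iris
+  from sklearn.svm import l1_min_c
+
+  iris = load_iris()
+  X, y = iris.data[:100], iris.target[:100]  # two-class subset
+  # smallest C for which an l1-penalized linear model is not all zeros
+  c_min = l1_min_c(X, y)
+  cs = [c_min * 10 ** k for k in range(4)]  # a grid starting at that minimum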
+
+
+People
+------
+
+People that made this release possible preceded by number of commits:
+
+- 159 `Olivier Grisel`_
+- 96 `Gael Varoquaux`_
+- 96 `Vlad Niculae`_
+- 94 `Fabian Pedregosa`_
+- 36 `Alexandre Gramfort`_
+- 32 Paolo Losi
+- 31 `Edouard Duchesnay`_
+- 30 `Mathieu Blondel`_
+- 25 `Peter Prettenhofer`_
+- 22 `Nicolas Pinto`_
+- 11 :user:`Virgile Fritsch `
+- 7 Lars Buitinck
+- 6 Vincent Michel
+- 5 `Bertrand Thirion`_
+- 4 Thouis (Ray) Jones
+- 4 Vincent Schut
+- 3 Jan Schlüter
+- 2 Julien Miotte
+- 2 `Matthieu Perrot`_
+- 2 Yann Malet
+- 2 `Yaroslav Halchenko`_
+- 1 Amit Aides
+- 1 `Andreas Müller`_
+- 1 Feth Arezki
+- 1 Meng Xinfan
+
+
+.. _changes_0_7:
+
+Version 0.7
+===========
+
+**March 2, 2011**
+
+scikit-learn 0.7 was released in March 2011, roughly three months
+after the 0.6 release. This release is marked by speed
+improvements in existing algorithms like k-Nearest Neighbors and
+K-Means, and by the inclusion of an efficient algorithm for
+computing the Ridge Generalized Cross Validation solution. Unlike the
+preceding release, no new modules were added to this release.
+
+Changelog
+---------
+
+- Performance improvements for Gaussian Mixture Model sampling [Jan
+  Schlüter].
+
+- Implementation of efficient leave-one-out cross-validated Ridge in
+  :class:`linear_model.RidgeCV` [`Mathieu Blondel`_].
+
+- Better handling of collinearity and early stopping in
+  :func:`linear_model.lars_path` [`Alexandre Gramfort`_ and `Fabian
+  Pedregosa`_].
+
+- Fixes for liblinear ordering of labels and sign of coefficients
+  [Dan Yamins, Paolo Losi, `Mathieu Blondel`_ and `Fabian Pedregosa`_].
+
+- Performance improvements for Nearest Neighbors algorithm in
+  high-dimensional spaces [`Fabian Pedregosa`_].
+
+- Performance improvements for :class:`cluster.KMeans` [`Gael
+  Varoquaux`_ and `James Bergstra`_].
+
+- Sanity checks for SVM-based classes [`Mathieu Blondel`_].
+
+- Refactoring of :class:`neighbors.NeighborsClassifier` and
+  :func:`neighbors.kneighbors_graph`: added different algorithms for
+  the k-Nearest Neighbor Search and implemented a more stable
+  algorithm for finding barycenter weights. Also added some
+  developer documentation for this module; see
+  `notes_neighbors `_ for more information [`Fabian Pedregosa`_].
+
+- Documentation improvements: Added :class:`pca.RandomizedPCA` and
+  :class:`linear_model.LogisticRegression` to the class
+  reference. Also added references for matrices used in clustering
+  and other fixes [`Gael Varoquaux`_, `Fabian Pedregosa`_, `Mathieu
+  Blondel`_, `Olivier Grisel`_, Virgile Fritsch, Emmanuelle
+  Gouillart].
+
+- Bound ``decision_function`` in classes that make use of liblinear_,
+  dense and sparse variants, like :class:`svm.LinearSVC` or
+  :class:`linear_model.LogisticRegression` [`Fabian Pedregosa`_].
+
+- Performance and API improvements to
+  :func:`metrics.euclidean_distances` and to
+  :class:`pca.RandomizedPCA` [`James Bergstra`_].
+
+- Fix compilation issues under NetBSD [Kamel Ibn Hassen Derouiche].
+
+- Allow input sequences of different lengths in :class:`hmm.GaussianHMM`
+  [`Ron Weiss`_].
+
+- Fix bug in affinity propagation caused by incorrect indexing [Xinfan Meng].
+
+
+People
+------
+
+People that made this release possible preceded by number of commits:
+
+- 85 `Fabian Pedregosa`_
+- 67 `Mathieu Blondel`_
+- 20 `Alexandre Gramfort`_
+- 19 `James Bergstra`_
+- 14 Dan Yamins
+- 13 `Olivier Grisel`_
+- 12 `Gael Varoquaux`_
+- 4 `Edouard Duchesnay`_
+- 4 `Ron Weiss`_
+- 2 Satrajit Ghosh
+- 2 Vincent Dubourg
+- 1 Emmanuelle Gouillart
+- 1 Kamel Ibn Hassen Derouiche
+- 1 Paolo Losi
+- 1 VirgileFritsch
+- 1 `Yaroslav Halchenko`_
+- 1 Xinfan Meng
+
+
+.. _changes_0_6:
+
+Version 0.6
+===========
+
+**December 21, 2010**
+
+scikit-learn 0.6 was released in December 2010. It is marked by the
+inclusion of several new modules and a general renaming of old
+ones. It is also marked by the inclusion of new examples, including
+applications to real-world datasets.
+
+
+Changelog
+---------
+
+- New `stochastic gradient `_ descent
+  module by Peter Prettenhofer. The module comes with complete
+  documentation and examples.
+
+- Improved svm module: memory consumption has been reduced by 50%,
+  a heuristic to automatically set class weights, and the possibility to
+  assign weights to samples (see
+  :ref:`sphx_glr_auto_examples_svm_plot_weighted_samples.py` for an example).
+
+- New :ref:`gaussian_process` module by Vincent Dubourg. This module
+  also has great documentation and some very neat examples. See
+  example_gaussian_process_plot_gp_regression.py or
+  example_gaussian_process_plot_gp_probabilistic_classification_after_regression.py
+  for a taste of what can be done.
+
+- It is now possible to use liblinear’s Multi-class SVC (option
+  multi_class in :class:`svm.LinearSVC`).
+
+- New features and performance improvements of text feature
+  extraction.
+
+- Improved sparse matrix support, both in main classes
+  (:class:`grid_search.GridSearchCV`) and in the modules
+  ``sklearn.svm.sparse`` and ``sklearn.linear_model.sparse``.
+
+- Lots of cool new examples and a new section that uses real-world
+  datasets were created. These include:
+  :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py`,
+  :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py`,
+  :ref:`sphx_glr_auto_examples_applications_svm_gui.py`,
+  :ref:`sphx_glr_auto_examples_applications_wikipedia_principal_eigenvector.py` and
+  others.
+
+- Faster :ref:`least_angle_regression` algorithm. It is now 2x
+  faster than the R version in the worst case and up to 10x faster
+  in some cases.
+
+- Faster coordinate descent algorithm. In particular, the full path
+  version of lasso (:func:`linear_model.lasso_path`) is more than
+  200x faster than before.
+
+- It is now possible to get probability estimates from a
+  :class:`linear_model.LogisticRegression` model.
+
+- Module renaming: the ``glm`` module has been renamed to ``linear_model``,
+  the ``gmm`` module has been included in the more general ``mixture``
+  module and the ``sgd`` module has been included in ``linear_model``.
+
+- Lots of bug fixes and documentation improvements.
+
+
+People
+------
+
+People that made this release possible preceded by number of commits:
+
+  * 207 `Olivier Grisel`_
+
+  * 167 `Fabian Pedregosa`_
+
+  * 97 `Peter Prettenhofer`_
+
+  * 68 `Alexandre Gramfort`_
+
+  * 59 `Mathieu Blondel`_
+
+  * 55 `Gael Varoquaux`_
+
+  * 33 Vincent Dubourg
+
+  * 21 `Ron Weiss`_
+
+  * 9 Bertrand Thirion
+
+  * 3 `Alexandre Passos`_
+
+  * 3 Anne-Laure Fouque
+
+  * 2 Ronan Amicel
+
+  * 1 `Christian Osendorfer`_
+
+
+
+.. _changes_0_5:
+
+
+Version 0.5
+===========
+
+**October 11, 2010**
+
+Changelog
+---------
+
+New classes
+-----------
+
+- Support for sparse matrices in some classifiers of modules
+  ``svm`` and ``linear_model`` (see :class:`svm.sparse.SVC`,
+  :class:`svm.sparse.SVR`, :class:`svm.sparse.LinearSVC`,
+  :class:`linear_model.sparse.Lasso`, :class:`linear_model.sparse.ElasticNet`).
+
+- New :class:`pipeline.Pipeline` object to compose different estimators.
+
+- Recursive Feature Elimination routines in module
+  :ref:`feature_selection`.
+
+- Addition of various classes capable of cross validation in the
+  linear_model module (:class:`linear_model.LassoCV`,
+  :class:`linear_model.ElasticNetCV`, etc.).
+
+- New, more efficient LARS algorithm implementation. The Lasso
+  variant of the algorithm is also implemented. See
+  :func:`linear_model.lars_path`, :class:`linear_model.Lars` and
+  :class:`linear_model.LassoLars`.
+
+- New Hidden Markov Models module (see classes
+  :class:`hmm.GaussianHMM`, :class:`hmm.MultinomialHMM`,
+  :class:`hmm.GMMHMM`).
+
+- New module feature_extraction (see :ref:`class reference `).
+
+- New FastICA algorithm in module sklearn.fastica.
+
+
+Documentation
+-------------
+
+- Improved documentation for many modules, now separating
+  narrative documentation from the class reference. As an example,
+  see `documentation for the SVM module `_ and the
+  complete `class reference `_.
+
+Fixes
+-----
+
+- API changes: variable names adhere to PEP-8, with more
+  meaningful names.
+
+- Fixes for svm module to run on a shared memory context
+  (multiprocessing).
+
+- It is again possible to generate latex (and thus PDF) from the
+  sphinx docs.
+
+Examples
+--------
+
+- New examples using some of the mlcomp datasets:
+  ``sphx_glr_auto_examples_mlcomp_sparse_document_classification.py`` (since removed) and
+  :ref:`sphx_glr_auto_examples_text_document_classification_20newsgroups.py`.
+
+- Many more examples. `See here `_ for
+  the full list of examples.
+
+
+External dependencies
+---------------------
+
+- Joblib is now a dependency of this package, although it is
+  shipped with scikit-learn (as ``sklearn.externals.joblib``).
+
+Removed modules
+---------------
+
+- Module ann (Artificial Neural Networks) has been removed from
+  the distribution. Users wanting this sort of algorithm should
+  take a look at pybrain.
+
+Misc
+----
+
+- New sphinx theme for the web page.
+
+
+Authors
+-------
+
+The following is a list of authors for this release, preceded by
+number of commits:
+
+  * 262 Fabian Pedregosa
+  * 240 Gael Varoquaux
+  * 149 Alexandre Gramfort
+  * 116 Olivier Grisel
+  * 40 Vincent Michel
+  * 38 Ron Weiss
+  * 23 Matthieu Perrot
+  * 10 Bertrand Thirion
+  * 9 VirgileFritsch
+  * 7 Yaroslav Halchenko
+  * 6 Edouard Duchesnay
+  * 4 Mathieu Blondel
+  * 1 Ariel Rokem
+  * 1 Matthieu Brucher
+
+Version 0.4
+===========
+
+**August 26, 2010**
+
+Changelog
+---------
+
+Major changes in this release include:
+
+- Coordinate Descent algorithm (Lasso, ElasticNet) refactoring &
+  speed improvements (roughly 100x faster).
+
+- Coordinate Descent refactoring (and bug fixing) for consistency
+  with R's package GLMNET.
+
+- New metrics module.
+
+- New GMM module contributed by Ron Weiss.
+
+- Implementation of the LARS algorithm (without Lasso variant for now).
+
+- ``feature_selection`` module redesign.
+
+- Migration to GIT as version control system.
+
+- Removal of obsolete attrselect module.
+
+- Rename of private compiled extensions (added underscore).
+
+- Removal of legacy unmaintained code.
+ +- Documentation improvements (both docstring and rst). + +- Improvement of the build system to (optionally) link with MKL. + Also, provide a lite BLAS implementation in case no system-wide BLAS is + found. + +- Lots of new examples. + +- Many, many bug fixes ... + + +Authors +------- + +The committer list for this release is the following (preceded by number +of commits): + + * 143 Fabian Pedregosa + * 35 Alexandre Gramfort + * 34 Olivier Grisel + * 11 Gael Varoquaux + * 5 Yaroslav Halchenko + * 2 Vincent Michel + * 1 Chris Filo Gorgolewski + + +Earlier versions +================ + +Earlier versions included contributions by Fred Mailhot, David Cooke, +David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. + diff --git a/doc/whats_new/v0.13.rst b/doc/whats_new/v0.13.rst new file mode 100644 index 0000000000000..c234cd6eb2a37 --- /dev/null +++ b/doc/whats_new/v0.13.rst @@ -0,0 +1,391 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_13_1: + +Version 0.13.1 +============== + +**February 23, 2013** + +The 0.13.1 release only fixes some bugs and does not add any new functionality. + +Changelog +--------- + +- Fixed a testing error caused by the function :func:`cross_validation.train_test_split` being + interpreted as a test by `Yaroslav Halchenko`_. + +- Fixed a bug in the reassignment of small clusters in the :class:`cluster.MiniBatchKMeans` + by `Gael Varoquaux`_. + +- Fixed default value of ``gamma`` in :class:`decomposition.KernelPCA` by `Lars Buitinck`_. + +- Updated joblib to ``0.7.0d`` by `Gael Varoquaux`_. + +- Fixed scaling of the deviance in :class:`ensemble.GradientBoostingClassifier` by `Peter Prettenhofer`_. + +- Better tie-breaking in :class:`multiclass.OneVsOneClassifier` by `Andreas Müller`_. + +- Other small improvements to tests and documentation. + +People +------ +List of contributors for release 0.13.1 by number of commits. + * 16 `Lars Buitinck`_ + * 12 `Andreas Müller`_ + * 8 `Gael Varoquaux`_ + * 5 Robert Marchman + * 3 `Peter Prettenhofer`_ + * 2 Hrishikesh Huilgolkar + * 1 Bastiaan van den Berg + * 1 Diego Molla + * 1 `Gilles Louppe`_ + * 1 `Mathieu Blondel`_ + * 1 `Nelle Varoquaux`_ + * 1 Rafael Cunha de Almeida + * 1 Rolando Espinoza La fuente + * 1 `Vlad Niculae`_ + * 1 `Yaroslav Halchenko`_ + + +.. _changes_0_13: + +Version 0.13 +============ + +**January 21, 2013** + +New Estimator Classes +--------------------- + +- :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor`, two + data-independent predictors by `Mathieu Blondel`_. Useful to sanity-check + your estimators. See :ref:`dummy_estimators` in the user guide. + Multioutput support added by `Arnaud Joly`_. + +- :class:`decomposition.FactorAnalysis`, a transformer implementing the + classical factor analysis, by `Christian Osendorfer`_ and `Alexandre + Gramfort`_. See :ref:`FA` in the user guide. + +- :class:`feature_extraction.FeatureHasher`, a transformer implementing the + "hashing trick" for fast, low-memory feature extraction from string fields + by `Lars Buitinck`_ and :class:`feature_extraction.text.HashingVectorizer` + for text documents by `Olivier Grisel`_ See :ref:`feature_hashing` and + :ref:`hashing_vectorizer` for the documentation and sample usage. + +- :class:`pipeline.FeatureUnion`, a transformer that concatenates + results of several other transformers by `Andreas Müller`_. See + :ref:`feature_union` in the user guide. 
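+
+  A minimal sketch of composing transformers this way (iris data for
+  illustration; the step names are ours)::
+
+    from sklearn.datasets import load_iris
+    from sklearn.decomposition import PCA
+    from sklearn.feature_selection import SelectKBest
+    from sklearn.pipeline import FeatureUnion
+
+    iris = load_iris()
+    union = FeatureUnion([("pca", PCA(n_components=2)),
+                          ("kbest", SelectKBest(k=1))])
+    # four iris features -> two PCA components plus one selected feature
+    X_combined = union.fit_transform(iris.data, iris.target)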
+ +- :class:`random_projection.GaussianRandomProjection`, + :class:`random_projection.SparseRandomProjection` and the function + :func:`random_projection.johnson_lindenstrauss_min_dim`. The first two are + transformers implementing Gaussian and sparse random projection matrix + by `Olivier Grisel`_ and `Arnaud Joly`_. + See :ref:`random_projection` in the user guide. + +- :class:`kernel_approximation.Nystroem`, a transformer for approximating + arbitrary kernels by `Andreas Müller`_. See + :ref:`nystroem_kernel_approx` in the user guide. + +- :class:`preprocessing.OneHotEncoder`, a transformer that computes binary + encodings of categorical features by `Andreas Müller`_. See + :ref:`preprocessing_categorical_features` in the user guide. + +- :class:`linear_model.PassiveAggressiveClassifier` and + :class:`linear_model.PassiveAggressiveRegressor`, predictors implementing + an efficient stochastic optimization for linear models by `Rob Zinkov`_ and + `Mathieu Blondel`_. See :ref:`passive_aggressive` in the user + guide. + +- :class:`ensemble.RandomTreesEmbedding`, a transformer for creating high-dimensional + sparse representations using ensembles of totally random trees by `Andreas Müller`_. + See :ref:`random_trees_embedding` in the user guide. + +- :class:`manifold.SpectralEmbedding` and function + :func:`manifold.spectral_embedding`, implementing the "laplacian + eigenmaps" transformation for non-linear dimensionality reduction by Wei + Li. See :ref:`spectral_embedding` in the user guide. + +- :class:`isotonic.IsotonicRegression` by `Fabian Pedregosa`_, `Alexandre Gramfort`_ + and `Nelle Varoquaux`_, + + +Changelog +--------- + +- :func:`metrics.zero_one_loss` (formerly ``metrics.zero_one``) now has + option for normalized output that reports the fraction of + misclassifications, rather than the raw number of misclassifications. By + Kyle Beauchamp. + +- :class:`tree.DecisionTreeClassifier` and all derived ensemble models now + support sample weighting, by `Noel Dawe`_ and `Gilles Louppe`_. + +- Speedup improvement when using bootstrap samples in forests of randomized + trees, by `Peter Prettenhofer`_ and `Gilles Louppe`_. + +- Partial dependence plots for :ref:`gradient_boosting` in + :func:`ensemble.partial_dependence.partial_dependence` by `Peter + Prettenhofer`_. See :ref:`sphx_glr_auto_examples_ensemble_plot_partial_dependence.py` for an + example. + +- The table of contents on the website has now been made expandable by + `Jaques Grobler`_. + +- :class:`feature_selection.SelectPercentile` now breaks ties + deterministically instead of returning all equally ranked features. + +- :class:`feature_selection.SelectKBest` and + :class:`feature_selection.SelectPercentile` are more numerically stable + since they use scores, rather than p-values, to rank results. This means + that they might sometimes select different features than they did + previously. + +- Ridge regression and ridge classification fitting with ``sparse_cg`` solver + no longer has quadratic memory complexity, by `Lars Buitinck`_ and + `Fabian Pedregosa`_. + +- Ridge regression and ridge classification now support a new fast solver + called ``lsqr``, by `Mathieu Blondel`_. + +- Speed up of :func:`metrics.precision_recall_curve` by Conrad Lee. + +- Added support for reading/writing svmlight files with pairwise + preference attribute (qid in svmlight file format) in + :func:`datasets.dump_svmlight_file` and + :func:`datasets.load_svmlight_file` by `Fabian Pedregosa`_. 
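+
+  For instance (hypothetical toy arrays and file name)::
+
+    import numpy as np
+    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
+
+    X = np.array([[1.0, 0.0], [0.0, 2.0], [3.0, 4.0]])
+    y = np.array([0, 1, 1])
+    qid = np.array([1, 1, 2])  # one query id per sample
+    dump_svmlight_file(X, y, "ranking.svmlight", query_id=qid)
+    X2, y2, qid2 = load_svmlight_file("ranking.svmlight", query_id=True)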
+
+- Faster and more robust :func:`metrics.confusion_matrix` and
+  :ref:`clustering_evaluation` by Wei Li.
+
+- :func:`cross_validation.cross_val_score` now works with precomputed kernels
+  and affinity matrices, by `Andreas Müller`_.
+
+- LARS algorithm made more numerically stable with heuristics to drop
+  regressors too correlated as well as to stop the path when
+  numerical noise becomes predominant, by `Gael Varoquaux`_.
+
+- Faster implementation of :func:`metrics.precision_recall_curve` by
+  Conrad Lee.
+
+- New kernel :class:`metrics.chi2_kernel` by `Andreas Müller`_, often used
+  in computer vision applications.
+
+- Fix of longstanding bug in :class:`naive_bayes.BernoulliNB` by
+  Shaun Jackman.
+
+- Implemented ``predict_proba`` in :class:`multiclass.OneVsRestClassifier`,
+  by Andrew Winterman.
+
+- Improve consistency in gradient boosting: estimators
+  :class:`ensemble.GradientBoostingRegressor` and
+  :class:`ensemble.GradientBoostingClassifier` use the estimator
+  :class:`tree.DecisionTreeRegressor` instead of the
+  :class:`tree._tree.Tree` data structure, by `Arnaud Joly`_.
+
+- Fixed a floating point exception in the :ref:`decision trees `
+  module, by Seberg.
+
+- Fixed :func:`metrics.roc_curve` failing when ``y_true`` has only one
+  class, by Wei Li.
+
+- Add the :func:`metrics.mean_absolute_error` function, which computes the
+  mean absolute error. The :func:`metrics.mean_squared_error`,
+  :func:`metrics.mean_absolute_error` and
+  :func:`metrics.r2_score` metrics support multioutput by `Arnaud Joly`_.
+
+- Fixed ``class_weight`` support in :class:`svm.LinearSVC` and
+  :class:`linear_model.LogisticRegression` by `Andreas Müller`_. The meaning
+  of ``class_weight`` was reversed, as erroneously a higher weight meant
+  fewer positives of a given class in earlier releases.
+
+- Improve narrative documentation and consistency in
+  :mod:`sklearn.metrics` for regression and classification metrics
+  by `Arnaud Joly`_.
+
+- Fixed a bug in :class:`sklearn.svm.SVC` when using csr-matrices with
+  unsorted indices by Xinfan Meng and `Andreas Müller`_.
+
+- :class:`MiniBatchKMeans`: Add random reassignment of cluster centers
+  with few observations attached to them, by `Gael Varoquaux`_.
+
+
+API changes summary
+-------------------
+
+- Renamed all occurrences of ``n_atoms`` to ``n_components`` for consistency.
+  This applies to :class:`decomposition.DictionaryLearning`,
+  :class:`decomposition.MiniBatchDictionaryLearning`,
+  :func:`decomposition.dict_learning`, :func:`decomposition.dict_learning_online`.
+
+- Renamed all occurrences of ``max_iters`` to ``max_iter`` for consistency.
+  This applies to :class:`semi_supervised.LabelPropagation` and
+  :class:`semi_supervised.label_propagation.LabelSpreading`.
+
+- Renamed all occurrences of ``learn_rate`` to ``learning_rate`` for
+  consistency in :class:`ensemble.BaseGradientBoosting` and
+  :class:`ensemble.GradientBoostingRegressor`.
+
+- The module ``sklearn.linear_model.sparse`` is gone. Sparse matrix support
+  was already integrated into the "regular" linear models.
+
+- :func:`sklearn.metrics.mean_square_error`, which incorrectly returned the
+  accumulated error, was removed. Use ``mean_squared_error`` instead.
+
+- Passing ``class_weight`` parameters to ``fit`` methods is no longer
+  supported. Pass them to estimator constructors instead.
+
+- GMMs no longer have ``decode`` and ``rvs`` methods. Use the ``score``,
+  ``predict`` or ``sample`` methods instead.
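+
+  As a minimal migration sketch (toy data; this assumes the
+  ``sklearn.mixture.GMM`` API of this era, since replaced by
+  ``GaussianMixture``)::
+
+    import numpy as np
+    from sklearn.mixture import GMM
+
+    X = np.random.RandomState(0).randn(100, 2)
+    g = GMM(n_components=2)
+    g.fit(X)
+
+    log_likelihood = g.score(X)  # per-sample log-likelihood
+    labels = g.predict(X)        # most likely component per sample
+    draws = g.sample(10)         # replaces the removed rvs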
+ +- The ``solver`` fit option in Ridge regression and classification is now + deprecated and will be removed in v0.14. Use the constructor option + instead. + +- :class:`feature_extraction.text.DictVectorizer` now returns sparse + matrices in the CSR format, instead of COO. + +- Renamed ``k`` in :class:`cross_validation.KFold` and + :class:`cross_validation.StratifiedKFold` to ``n_folds``, renamed + ``n_bootstraps`` to ``n_iter`` in ``cross_validation.Bootstrap``. + +- Renamed all occurrences of ``n_iterations`` to ``n_iter`` for consistency. + This applies to :class:`cross_validation.ShuffleSplit`, + :class:`cross_validation.StratifiedShuffleSplit`, + :func:`utils.randomized_range_finder` and :func:`utils.randomized_svd`. + +- Replaced ``rho`` in :class:`linear_model.ElasticNet` and + :class:`linear_model.SGDClassifier` by ``l1_ratio``. The ``rho`` parameter + had different meanings; ``l1_ratio`` was introduced to avoid confusion. + It has the same meaning as previously ``rho`` in + :class:`linear_model.ElasticNet` and ``(1-rho)`` in + :class:`linear_model.SGDClassifier`. + +- :class:`linear_model.LassoLars` and :class:`linear_model.Lars` now + store a list of paths in the case of multiple targets, rather than + an array of paths. + +- The attribute ``gmm`` of :class:`hmm.GMMHMM` was renamed to ``gmm_`` + to adhere more strictly with the API. + +- :func:`cluster.spectral_embedding` was moved to + :func:`manifold.spectral_embedding`. + +- Renamed ``eig_tol`` in :func:`manifold.spectral_embedding`, + :class:`cluster.SpectralClustering` to ``eigen_tol``, renamed ``mode`` + to ``eigen_solver``. + +- Renamed ``mode`` in :func:`manifold.spectral_embedding` and + :class:`cluster.SpectralClustering` to ``eigen_solver``. + +- ``classes_`` and ``n_classes_`` attributes of + :class:`tree.DecisionTreeClassifier` and all derived ensemble models are + now flat in case of single output problems and nested in case of + multi-output problems. + +- The ``estimators_`` attribute of + :class:`ensemble.gradient_boosting.GradientBoostingRegressor` and + :class:`ensemble.gradient_boosting.GradientBoostingClassifier` is now an + array of :class:'tree.DecisionTreeRegressor'. + +- Renamed ``chunk_size`` to ``batch_size`` in + :class:`decomposition.MiniBatchDictionaryLearning` and + :class:`decomposition.MiniBatchSparsePCA` for consistency. + +- :class:`svm.SVC` and :class:`svm.NuSVC` now provide a ``classes_`` + attribute and support arbitrary dtypes for labels ``y``. + Also, the dtype returned by ``predict`` now reflects the dtype of + ``y`` during ``fit`` (used to be ``np.float``). + +- Changed default test_size in :func:`cross_validation.train_test_split` + to None, added possibility to infer ``test_size`` from ``train_size`` in + :class:`cross_validation.ShuffleSplit` and + :class:`cross_validation.StratifiedShuffleSplit`. + +- Renamed function :func:`sklearn.metrics.zero_one` to + :func:`sklearn.metrics.zero_one_loss`. Be aware that the default behavior + in :func:`sklearn.metrics.zero_one_loss` is different from + :func:`sklearn.metrics.zero_one`: ``normalize=False`` is changed to + ``normalize=True``. + +- Renamed function :func:`metrics.zero_one_score` to + :func:`metrics.accuracy_score`. + +- :func:`datasets.make_circles` now has the same number of inner and outer points. + +- In the Naive Bayes classifiers, the ``class_prior`` parameter was moved + from ``fit`` to ``__init__``. + +People +------ +List of contributors for release 0.13 by number of commits. 
+ + * 364 `Andreas Müller`_ + * 143 `Arnaud Joly`_ + * 137 `Peter Prettenhofer`_ + * 131 `Gael Varoquaux`_ + * 117 `Mathieu Blondel`_ + * 108 `Lars Buitinck`_ + * 106 Wei Li + * 101 `Olivier Grisel`_ + * 65 `Vlad Niculae`_ + * 54 `Gilles Louppe`_ + * 40 `Jaques Grobler`_ + * 38 `Alexandre Gramfort`_ + * 30 `Rob Zinkov`_ + * 19 Aymeric Masurelle + * 18 Andrew Winterman + * 17 `Fabian Pedregosa`_ + * 17 Nelle Varoquaux + * 16 `Christian Osendorfer`_ + * 14 `Daniel Nouri`_ + * 13 :user:`Virgile Fritsch ` + * 13 syhw + * 12 `Satrajit Ghosh`_ + * 10 Corey Lynch + * 10 Kyle Beauchamp + * 9 Brian Cheung + * 9 Immanuel Bayer + * 9 mr.Shu + * 8 Conrad Lee + * 8 `James Bergstra`_ + * 7 Tadej Janež + * 6 Brian Cajes + * 6 `Jake Vanderplas`_ + * 6 Michael + * 6 Noel Dawe + * 6 Tiago Nunes + * 6 cow + * 5 Anze + * 5 Shiqiao Du + * 4 Christian Jauvin + * 4 Jacques Kvam + * 4 Richard T. Guy + * 4 `Robert Layton`_ + * 3 Alexandre Abraham + * 3 Doug Coleman + * 3 Scott Dickerson + * 2 ApproximateIdentity + * 2 John Benediktsson + * 2 Mark Veronda + * 2 Matti Lyra + * 2 Mikhail Korobov + * 2 Xinfan Meng + * 1 Alejandro Weinstein + * 1 `Alexandre Passos`_ + * 1 Christoph Deil + * 1 Eugene Nizhibitsky + * 1 Kenneth C. Arnold + * 1 Luis Pedro Coelho + * 1 Miroslav Batchkarov + * 1 Pavel + * 1 Sebastian Berg + * 1 Shaun Jackman + * 1 Subhodeep Moitra + * 1 bob + * 1 dengemann + * 1 emanuele + * 1 x006 + diff --git a/doc/whats_new/v0.14.rst b/doc/whats_new/v0.14.rst new file mode 100644 index 0000000000000..2b0456593e613 --- /dev/null +++ b/doc/whats_new/v0.14.rst @@ -0,0 +1,389 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_14: + +Version 0.14 +=============== + +**August 7, 2013** + +Changelog +--------- + +- Missing values with sparse and dense matrices can be imputed with the + transformer :class:`preprocessing.Imputer` by `Nicolas Trésegnie`_. + +- The core implementation of decisions trees has been rewritten from + scratch, allowing for faster tree induction and lower memory + consumption in all tree-based estimators. By `Gilles Louppe`_. + +- Added :class:`ensemble.AdaBoostClassifier` and + :class:`ensemble.AdaBoostRegressor`, by `Noel Dawe`_ and + `Gilles Louppe`_. See the :ref:`AdaBoost ` section of the user + guide for details and examples. + +- Added :class:`grid_search.RandomizedSearchCV` and + :class:`grid_search.ParameterSampler` for randomized hyperparameter + optimization. By `Andreas Müller`_. + +- Added :ref:`biclustering ` algorithms + (:class:`sklearn.cluster.bicluster.SpectralCoclustering` and + :class:`sklearn.cluster.bicluster.SpectralBiclustering`), data + generation methods (:func:`sklearn.datasets.make_biclusters` and + :func:`sklearn.datasets.make_checkerboard`), and scoring metrics + (:func:`sklearn.metrics.consensus_score`). By `Kemal Eren`_. + +- Added :ref:`Restricted Boltzmann Machines` + (:class:`neural_network.BernoulliRBM`). By `Yann Dauphin`_. + +- Python 3 support by :user:`Justin Vincent `, `Lars Buitinck`_, + :user:`Subhodeep Moitra ` and `Olivier Grisel`_. All tests now pass under + Python 3.3. + +- Ability to pass one penalty (alpha value) per target in + :class:`linear_model.Ridge`, by @eickenberg and `Mathieu Blondel`_. + +- Fixed :mod:`sklearn.linear_model.stochastic_gradient.py` L2 regularization + issue (minor practical significance). + By :user:`Norbert Crombach ` and `Mathieu Blondel`_ . + +- Added an interactive version of `Andreas Müller`_'s + `Machine Learning Cheat Sheet (for scikit-learn) + `_ + to the documentation. 
+  See :ref:`Choosing the right estimator `.
+  By `Jaques Grobler`_.
+
+- :class:`grid_search.GridSearchCV` and
+  :func:`cross_validation.cross_val_score` now support the use of advanced
+  scoring functions such as area under the ROC curve and f-beta scores.
+  See :ref:`scoring_parameter` for details. By `Andreas Müller`_
+  and `Lars Buitinck`_.
+  Passing a function from :mod:`sklearn.metrics` as ``score_func`` is
+  deprecated.
+
+- Multi-label classification output is now supported by
+  :func:`metrics.accuracy_score`, :func:`metrics.zero_one_loss`,
+  :func:`metrics.f1_score`, :func:`metrics.fbeta_score`,
+  :func:`metrics.classification_report`,
+  :func:`metrics.precision_score` and :func:`metrics.recall_score`
+  by `Arnaud Joly`_.
+
+- Two new metrics, :func:`metrics.hamming_loss` and
+  :func:`metrics.jaccard_similarity_score`,
+  were added with multi-label support by `Arnaud Joly`_.
+
+- Speed and memory usage improvements in
+  :class:`feature_extraction.text.CountVectorizer` and
+  :class:`feature_extraction.text.TfidfVectorizer`,
+  by Jochen Wersdörfer and Roman Sinayev.
+
+- The ``min_df`` parameter in
+  :class:`feature_extraction.text.CountVectorizer` and
+  :class:`feature_extraction.text.TfidfVectorizer`, which used to be 2,
+  has been reset to 1 to avoid unpleasant surprises (empty vocabularies)
+  for novice users who try it out on tiny document collections.
+  A value of at least 2 is still recommended for practical use.
+
+- :class:`svm.LinearSVC`, :class:`linear_model.SGDClassifier` and
+  :class:`linear_model.SGDRegressor` now have a ``sparsify`` method that
+  converts their ``coef_`` into a sparse matrix, meaning stored models
+  trained using these estimators can be made much more compact.
+
+- :class:`linear_model.SGDClassifier` now produces multiclass probability
+  estimates when trained under log loss or modified Huber loss.
+
+- Hyperlinks to documentation in example code on the website by
+  :user:`Martin Luessi `.
+
+- Fixed bug in :class:`preprocessing.MinMaxScaler` causing incorrect scaling
+  of the features for non-default ``feature_range`` settings. By `Andreas
+  Müller`_.
+
+- ``max_features`` in :class:`tree.DecisionTreeClassifier`,
+  :class:`tree.DecisionTreeRegressor` and all derived ensemble estimators
+  now supports percentage values. By `Gilles Louppe`_.
+
+- Performance improvements in :class:`isotonic.IsotonicRegression` by
+  `Nelle Varoquaux`_.
+
+- :func:`metrics.accuracy_score` has an option ``normalize`` to return
+  the fraction or the number of correctly classified samples,
+  by `Arnaud Joly`_.
+
+- Added :func:`metrics.log_loss` that computes log loss, aka cross-entropy
+  loss. By Jochen Wersdörfer and `Lars Buitinck`_.
+
+- A bug that caused :class:`ensemble.AdaBoostClassifier` to output
+  incorrect probabilities has been fixed.
+
+- Feature selectors now share a mixin providing consistent ``transform``,
+  ``inverse_transform`` and ``get_support`` methods. By `Joel Nothman`_.
+
+- A fitted :class:`grid_search.GridSearchCV` or
+  :class:`grid_search.RandomizedSearchCV` can now generally be pickled.
+  By `Joel Nothman`_.
+
+- Refactored and vectorized implementation of :func:`metrics.roc_curve`
+  and :func:`metrics.precision_recall_curve`. By `Joel Nothman`_.
+
+- The new estimator :class:`sklearn.decomposition.TruncatedSVD`
+  performs dimensionality reduction using SVD on sparse matrices,
+  and can be used for latent semantic analysis (LSA).
+  By `Lars Buitinck`_.
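+
+  A minimal LSA sketch (hypothetical toy corpus; variable names are ours)::
+
+    from sklearn.decomposition import TruncatedSVD
+    from sklearn.feature_extraction.text import TfidfVectorizer
+
+    docs = ["the cat sat", "the dog sat", "cats and dogs"]
+    X_tfidf = TfidfVectorizer().fit_transform(docs)  # sparse matrix
+    lsa = TruncatedSVD(n_components=2)
+    X_topics = lsa.fit_transform(X_tfidf)  # dense, low-rank representation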
+
+- Added self-contained example of out-of-core learning on text data
+  :ref:`sphx_glr_auto_examples_applications_plot_out_of_core_classification.py`.
+  By :user:`Eustache Diemert `.
+
+- The default number of components for
+  :class:`sklearn.decomposition.RandomizedPCA` is now correctly documented
+  to be ``n_features``. This was the default behavior, so programs using it
+  will continue to work as they did.
+
+- :class:`sklearn.cluster.KMeans` now fits several orders of magnitude
+  faster on sparse data (the speedup depends on the sparsity). By
+  `Lars Buitinck`_.
+
+- Reduce memory footprint of FastICA by `Denis Engemann`_ and
+  `Alexandre Gramfort`_.
+
+- Verbose output in :mod:`sklearn.ensemble.gradient_boosting` now uses
+  a column format and prints progress with decreasing frequency.
+  It also shows the remaining time. By `Peter Prettenhofer`_.
+
+- :mod:`sklearn.ensemble.gradient_boosting` provides out-of-bag improvement
+  :attr:`~sklearn.ensemble.GradientBoostingRegressor.oob_improvement_`
+  rather than the OOB score for model selection. An example that shows
+  how to use OOB estimates to select the number of trees was added.
+  By `Peter Prettenhofer`_.
+
+- Most metrics now support string labels for multiclass classification
+  by `Arnaud Joly`_ and `Lars Buitinck`_.
+
+- New ``OrthogonalMatchingPursuitCV`` class by `Alexandre Gramfort`_
+  and `Vlad Niculae`_.
+
+- Fixed a bug in :class:`sklearn.covariance.GraphLassoCV`: the
+  'alphas' parameter now works as expected when given a list of
+  values. By Philippe Gervais.
+
+- Fixed an important bug in :class:`sklearn.covariance.GraphLassoCV`
+  that prevented all folds provided by a CV object from being used (only
+  the first 3 were used). When providing a CV object, execution
+  time may thus increase significantly compared to the previous
+  version (but results are correct now). By Philippe Gervais.
+
+- :func:`cross_validation.cross_val_score` and the :mod:`grid_search`
+  module are now tested with multi-output data by `Arnaud Joly`_.
+
+- :func:`datasets.make_multilabel_classification` can now return
+  the output in label indicator multilabel format by `Arnaud Joly`_.
+
+- K-nearest neighbors, :class:`neighbors.KNeighborsClassifier`
+  and :class:`neighbors.KNeighborsRegressor`,
+  and radius neighbors, :class:`neighbors.RadiusNeighborsClassifier` and
+  :class:`neighbors.RadiusNeighborsRegressor`, support multioutput data
+  by `Arnaud Joly`_.
+
+- Random state in LibSVM-based estimators (:class:`svm.SVC`, :class:`svm.NuSVC`,
+  :class:`svm.OneClassSVM`, :class:`svm.SVR`, :class:`svm.NuSVR`) can now be
+  controlled. This is useful to ensure consistency in the probability
+  estimates for the classifiers trained with ``probability=True``. By
+  `Vlad Niculae`_.
+
+- Out-of-core learning support for discrete naive Bayes classifiers
+  :class:`sklearn.naive_bayes.MultinomialNB` and
+  :class:`sklearn.naive_bayes.BernoulliNB` by adding the ``partial_fit``
+  method by `Olivier Grisel`_.
+
+- New website design and navigation by `Gilles Louppe`_, `Nelle Varoquaux`_,
+  Vincent Michel and `Andreas Müller`_.
+
+- Improved documentation on :ref:`multi-class, multi-label and multi-output
+  classification ` by `Yannick Schwartz`_ and `Arnaud Joly`_.
+
+- Better input and error handling in the :mod:`metrics` module by
+  `Arnaud Joly`_ and `Joel Nothman`_.
+ +- Speed optimization of the :mod:`hmm` module by :user:`Mikhail Korobov ` + +- Significant speed improvements for :class:`sklearn.cluster.DBSCAN` + by `cleverless `_ + + +API changes summary +------------------- + +- The :func:`auc_score` was renamed :func:`roc_auc_score`. + +- Testing scikit-learn with ``sklearn.test()`` is deprecated. Use + ``nosetests sklearn`` from the command line. + +- Feature importances in :class:`tree.DecisionTreeClassifier`, + :class:`tree.DecisionTreeRegressor` and all derived ensemble estimators + are now computed on the fly when accessing the ``feature_importances_`` + attribute. Setting ``compute_importances=True`` is no longer required. + By `Gilles Louppe`_. + +- :class:`linear_model.lasso_path` and + :class:`linear_model.enet_path` can return its results in the same + format as that of :class:`linear_model.lars_path`. This is done by + setting the ``return_models`` parameter to ``False``. By + `Jaques Grobler`_ and `Alexandre Gramfort`_ + +- :class:`grid_search.IterGrid` was renamed to + :class:`grid_search.ParameterGrid`. + +- Fixed bug in :class:`KFold` causing imperfect class balance in some + cases. By `Alexandre Gramfort`_ and Tadej Janež. + +- :class:`sklearn.neighbors.BallTree` has been refactored, and a + :class:`sklearn.neighbors.KDTree` has been + added which shares the same interface. The Ball Tree now works with + a wide variety of distance metrics. Both classes have many new + methods, including single-tree and dual-tree queries, breadth-first + and depth-first searching, and more advanced queries such as + kernel density estimation and 2-point correlation functions. + By `Jake Vanderplas`_ + +- Support for scipy.spatial.cKDTree within neighbors queries has been + removed, and the functionality replaced with the new :class:`KDTree` + class. + +- :class:`sklearn.neighbors.KernelDensity` has been added, which performs + efficient kernel density estimation with a variety of kernels. + +- :class:`sklearn.decomposition.KernelPCA` now always returns output with + ``n_components`` components, unless the new parameter ``remove_zero_eig`` + is set to ``True``. This new behavior is consistent with the way + kernel PCA was always documented; previously, the removal of components + with zero eigenvalues was tacitly performed on all data. + +- ``gcv_mode="auto"`` no longer tries to perform SVD on a densified + sparse matrix in :class:`sklearn.linear_model.RidgeCV`. + +- Sparse matrix support in :class:`sklearn.decomposition.RandomizedPCA` + is now deprecated in favor of the new ``TruncatedSVD``. + +- :class:`cross_validation.KFold` and + :class:`cross_validation.StratifiedKFold` now enforce `n_folds >= 2` + otherwise a ``ValueError`` is raised. By `Olivier Grisel`_. + +- :func:`datasets.load_files`'s ``charset`` and ``charset_errors`` + parameters were renamed ``encoding`` and ``decode_errors``. + +- Attribute ``oob_score_`` in :class:`sklearn.ensemble.GradientBoostingRegressor` + and :class:`sklearn.ensemble.GradientBoostingClassifier` + is deprecated and has been replaced by ``oob_improvement_`` . + +- Attributes in OrthogonalMatchingPursuit have been deprecated + (copy_X, Gram, ...) and precompute_gram renamed precompute + for consistency. See #2224. + +- :class:`sklearn.preprocessing.StandardScaler` now converts integer input + to float, and raises a warning. Previously it rounded for dense integer + input. + +- :class:`sklearn.multiclass.OneVsRestClassifier` now has a + ``decision_function`` method. 
This will return the distance of each + sample from the decision boundary for each class, as long as the + underlying estimators implement the ``decision_function`` method. + By `Kyle Kastner`_. + +- Better input validation, warning on unexpected shapes for y. + +People +------ +List of contributors for release 0.14 by number of commits. + + * 277 Gilles Louppe + * 245 Lars Buitinck + * 187 Andreas Mueller + * 124 Arnaud Joly + * 112 Jaques Grobler + * 109 Gael Varoquaux + * 107 Olivier Grisel + * 102 Noel Dawe + * 99 Kemal Eren + * 79 Joel Nothman + * 75 Jake VanderPlas + * 73 Nelle Varoquaux + * 71 Vlad Niculae + * 65 Peter Prettenhofer + * 64 Alexandre Gramfort + * 54 Mathieu Blondel + * 38 Nicolas Trésegnie + * 35 eustache + * 27 Denis Engemann + * 25 Yann N. Dauphin + * 19 Justin Vincent + * 17 Robert Layton + * 15 Doug Coleman + * 14 Michael Eickenberg + * 13 Robert Marchman + * 11 Fabian Pedregosa + * 11 Philippe Gervais + * 10 Jim Holmström + * 10 Tadej Janež + * 10 syhw + * 9 Mikhail Korobov + * 9 Steven De Gryze + * 8 sergeyf + * 7 Ben Root + * 7 Hrishikesh Huilgolkar + * 6 Kyle Kastner + * 6 Martin Luessi + * 6 Rob Speer + * 5 Federico Vaggi + * 5 Raul Garreta + * 5 Rob Zinkov + * 4 Ken Geis + * 3 A. Flaxman + * 3 Denton Cockburn + * 3 Dougal Sutherland + * 3 Ian Ozsvald + * 3 Johannes Schönberger + * 3 Robert McGibbon + * 3 Roman Sinayev + * 3 Szabo Roland + * 2 Diego Molla + * 2 Imran Haque + * 2 Jochen Wersdörfer + * 2 Sergey Karayev + * 2 Yannick Schwartz + * 2 jamestwebber + * 1 Abhijeet Kolhe + * 1 Alexander Fabisch + * 1 Bastiaan van den Berg + * 1 Benjamin Peterson + * 1 Daniel Velkov + * 1 Fazlul Shahriar + * 1 Felix Brockherde + * 1 Félix-Antoine Fortin + * 1 Harikrishnan S + * 1 Jack Hale + * 1 JakeMick + * 1 James McDermott + * 1 John Benediktsson + * 1 John Zwinck + * 1 Joshua Vredevoogd + * 1 Justin Pati + * 1 Kevin Hughes + * 1 Kyle Kelley + * 1 Matthias Ekman + * 1 Miroslav Shubernetskiy + * 1 Naoki Orii + * 1 Norbert Crombach + * 1 Rafael Cunha de Almeida + * 1 Rolando Espinoza La fuente + * 1 Seamus Abshere + * 1 Sergey Feldman + * 1 Sergio Medina + * 1 Stefano Lattarini + * 1 Steve Koch + * 1 Sturla Molden + * 1 Thomas Jarosch + * 1 Yaroslav Halchenko + diff --git a/doc/whats_new/v0.15.rst b/doc/whats_new/v0.15.rst new file mode 100644 index 0000000000000..a2eafc63b0617 --- /dev/null +++ b/doc/whats_new/v0.15.rst @@ -0,0 +1,623 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_15_2: + +Version 0.15.2 +============== + +**September 4, 2014** + +Bug fixes +--------- + +- Fixed handling of the ``p`` parameter of the Minkowski distance that was + previously ignored in nearest neighbors models. By :user:`Nikolay + Mayorov `. + +- Fixed duplicated alphas in :class:`linear_model.LassoLars` with early + stopping on 32 bit Python. By `Olivier Grisel`_ and `Fabian Pedregosa`_. + +- Fixed the build under Windows when scikit-learn is built with MSVC while + NumPy is built with MinGW. By `Olivier Grisel`_ and :user:`Federico + Vaggi `. + +- Fixed an array index overflow bug in the coordinate descent solver. By + `Gael Varoquaux`_. + +- Better handling of numpy 1.9 deprecation warnings. By `Gael Varoquaux`_. + +- Removed unnecessary data copy in :class:`cluster.KMeans`. + By `Gael Varoquaux`_. + +- Explicitly close open files to avoid ``ResourceWarnings`` under Python 3. + By Calvin Giles. + +- The ``transform`` of :class:`discriminant_analysis.LinearDiscriminantAnalysis` + now projects the input on the most discriminant directions. 
+  By Martin Billinger.
+
+- Fixed potential overflow in ``_tree.safe_realloc`` by `Lars Buitinck`_.
+
+- Performance optimization in :class:`isotonic.IsotonicRegression`.
+  By Robert Bradshaw.
+
+- ``nose`` is no longer a runtime dependency for importing ``sklearn``;
+  it is only needed for running the tests. By `Joel Nothman`_.
+
+- Many documentation and website fixes by `Joel Nothman`_, `Lars Buitinck`_,
+  :user:`Matt Pico `, and others.
+
+.. _changes_0_15_1:
+
+Version 0.15.1
+==============
+
+**August 1, 2014**
+
+Bug fixes
+---------
+
+- Made :func:`cross_validation.cross_val_score` use
+  :class:`cross_validation.KFold` instead of
+  :class:`cross_validation.StratifiedKFold` on multi-output classification
+  problems. By :user:`Nikolay Mayorov `.
+
+- Support unseen labels in :class:`preprocessing.LabelBinarizer` to restore
+  the default behavior of 0.14.1 for backward compatibility. By
+  :user:`Hamzeh Alsalhi `.
+
+- Fixed the :class:`cluster.KMeans` stopping criterion that prevented early
+  convergence detection. By Edward Raff and `Gael Varoquaux`_.
+
+- Fixed the behavior of :class:`multiclass.OneVsOneClassifier`
+  in case of ties at the per-class vote level by computing the correct
+  per-class sum of prediction scores. By `Andreas Müller`_.
+
+- Made :func:`cross_validation.cross_val_score` and
+  :class:`grid_search.GridSearchCV` accept Python lists as input data.
+  This is especially useful for cross-validation and model selection of
+  text processing pipelines. By `Andreas Müller`_.
+
+- Fixed data input checks of most estimators to accept input data that
+  implements the NumPy ``__array__`` protocol. This is the case for
+  ``pandas.Series`` and ``pandas.DataFrame`` in recent versions of
+  pandas. By `Gael Varoquaux`_.
+
+- Fixed a regression for :class:`linear_model.SGDClassifier` with
+  ``class_weight="auto"`` on data with non-contiguous labels. By
+  `Olivier Grisel`_.
+
+
+.. _changes_0_15:
+
+Version 0.15
+============
+
+**July 15, 2014**
+
+Highlights
+----------
+
+- Many speed and memory improvements all across the code.
+
+- Huge speed and memory improvements to random forests (and extra
+  trees) that also benefit more from parallel computing.
+
+- Incremental fit to :class:`BernoulliRBM `.
+
+- Added :class:`cluster.AgglomerativeClustering` for hierarchical
+  agglomerative clustering with average linkage, complete linkage and
+  ward strategies.
+
+- Added :class:`linear_model.RANSACRegressor` for robust regression
+  models.
+
+- Added dimensionality reduction with :class:`manifold.TSNE` which can be
+  used to visualize high-dimensional data.
+
+
+Changelog
+---------
+
+New features
+............
+
+- Added :class:`ensemble.BaggingClassifier` and
+  :class:`ensemble.BaggingRegressor` meta-estimators for ensembling
+  any kind of base estimator. See the :ref:`Bagging ` section of
+  the user guide for details and examples. By `Gilles Louppe`_.
+
+- New unsupervised feature selection algorithm
+  :class:`feature_selection.VarianceThreshold`, by `Lars Buitinck`_.
+
+- Added :class:`linear_model.RANSACRegressor` meta-estimator for the robust
+  fitting of regression models. By :user:`Johannes Schönberger `.
+
+- Added :class:`cluster.AgglomerativeClustering` for hierarchical
+  agglomerative clustering with average linkage, complete linkage and
+  ward strategies, by `Nelle Varoquaux`_ and `Gael Varoquaux`_.
+
+- Shorthand constructors :func:`pipeline.make_pipeline` and
+  :func:`pipeline.make_union` were added by `Lars Buitinck`_.
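+
+  For instance (a hypothetical composition; the estimator choice is ours)::
+
+    from sklearn.decomposition import PCA
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.pipeline import make_pipeline
+    from sklearn.preprocessing import StandardScaler
+
+    # step names are derived automatically from the class names
+    model = make_pipeline(StandardScaler(), PCA(n_components=2),
+                          LogisticRegression())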
+
+- Shuffle option for :class:`cross_validation.StratifiedKFold`.
+  By :user:`Jeffrey Blackburne `.
+
+- Incremental learning (``partial_fit``) for Gaussian Naive Bayes by
+  Imran Haque.
+
+- Added ``partial_fit`` to :class:`BernoulliRBM `
+  by :user:`Danny Sullivan `.
+
+- Added :func:`learning_curve ` utility to
+  chart performance with respect to training size. See
+  :ref:`sphx_glr_auto_examples_model_selection_plot_learning_curve.py`.
+  By Alexander Fabisch.
+
+- Add ``positive`` option in :class:`LassoCV ` and
+  :class:`ElasticNetCV `.
+  By Brian Wignall and `Alexandre Gramfort`_.
+
+- Added :class:`linear_model.MultiTaskElasticNetCV` and
+  :class:`linear_model.MultiTaskLassoCV`. By `Manoj Kumar`_.
+
+- Added :class:`manifold.TSNE`. By Alexander Fabisch.
+
+Enhancements
+............
+
+- Add sparse input support to :class:`ensemble.AdaBoostClassifier` and
+  :class:`ensemble.AdaBoostRegressor` meta-estimators.
+  By :user:`Hamzeh Alsalhi `.
+
+- Memory improvements of decision trees, by `Arnaud Joly`_.
+
+- Decision trees can now be built in best-first manner by using
+  ``max_leaf_nodes`` as the stopping criterion. Refactored the tree code
+  to use either a stack or a priority queue for tree building.
+  By `Peter Prettenhofer`_ and `Gilles Louppe`_.
+
+- Decision trees can now be fitted on fortran- and c-style arrays, and
+  non-contiguous arrays without the need to make a copy.
+  If the input array has a different dtype than ``np.float32``, a fortran-
+  style copy will be made since fortran-style memory layout has speed
+  advantages. By `Peter Prettenhofer`_ and `Gilles Louppe`_.
+
+- Speed improvement of regression trees by optimizing the
+  computation of the mean square error criterion. This led
+  to speed improvements of the tree, forest and gradient boosting tree
+  modules. By `Arnaud Joly`_.
+
+- The ``img_to_graph`` and ``grid_to_graph`` functions in
+  :mod:`sklearn.feature_extraction.image` now return ``np.ndarray``
+  instead of ``np.matrix`` when ``return_as=np.ndarray``. See the
+  Notes section for more information on compatibility.
+
+- Changed the internal storage of decision trees to use a struct array.
+  This fixed some small bugs, while improving code and providing a small
+  speed gain. By `Joel Nothman`_.
+
+- Reduce memory usage and overhead when fitting and predicting with forests
+  of randomized trees in parallel with ``n_jobs != 1`` by leveraging the new
+  threading backend of joblib 0.8 and releasing the GIL in the tree fitting
+  Cython code. By `Olivier Grisel`_ and `Gilles Louppe`_.
+
+- Speed improvement of the :mod:`sklearn.ensemble.gradient_boosting` module.
+  By `Gilles Louppe`_ and `Peter Prettenhofer`_.
+
+- Various enhancements to the :mod:`sklearn.ensemble.gradient_boosting`
+  module: a ``warm_start`` argument to fit additional trees,
+  a ``max_leaf_nodes`` argument to fit GBM style trees,
+  a ``monitor`` fit argument to inspect the estimator during training, and
+  refactoring of the verbose code. By `Peter Prettenhofer`_.
+
+- Faster :class:`sklearn.ensemble.ExtraTrees` by caching feature values.
+  By `Arnaud Joly`_.
+
+- Faster depth-based tree building algorithms such as decision trees,
+  random forests, extra trees and gradient tree boosting (with depth-based
+  growing strategy), by avoiding trying to split on constant features
+  found in the sample subset. By `Arnaud Joly`_.
+
+- Add ``min_weight_fraction_leaf`` pre-pruning parameter to tree-based
+  methods: the minimum weighted fraction of the input samples required to be
+  at a leaf node. By `Noel Dawe`_.
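+
+  A minimal sketch (iris data for illustration)::
+
+    from sklearn.datasets import load_iris
+    from sklearn.tree import DecisionTreeClassifier
+
+    iris = load_iris()
+    # each leaf must hold at least 5% of the total sample weight
+    tree = DecisionTreeClassifier(min_weight_fraction_leaf=0.05)
+    tree.fit(iris.data, iris.target)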
+
+- Added :func:`metrics.pairwise_distances_argmin_min`, by Philippe Gervais.
+
+- Added a ``predict`` method to :class:`cluster.AffinityPropagation` and
+  :class:`cluster.MeanShift`, by `Mathieu Blondel`_.
+
+- Vector and matrix multiplications have been optimized throughout the
+  library by `Denis Engemann`_ and `Alexandre Gramfort`_.
+  In particular, they should take less memory with older NumPy versions
+  (prior to 1.7.2).
+
+- Precision-recall and ROC examples now use ``train_test_split``, and have
+  more explanation of why these metrics are useful. By `Kyle Kastner`_.
+
+- The training algorithm for :class:`decomposition.NMF` is faster for
+  sparse matrices and has much lower memory complexity, meaning it will
+  scale up gracefully to large datasets. By `Lars Buitinck`_.
+
+- Added an ``svd_method`` option, with default value ``"randomized"``, to
+  :class:`decomposition.FactorAnalysis` to save memory and
+  significantly speed up computation, by `Denis Engemann`_ and
+  `Alexandre Gramfort`_.
+
+- Changed :class:`cross_validation.StratifiedKFold` to try to
+  preserve as much of the original ordering of samples as possible so as
+  not to hide overfitting on datasets with a non-negligible level of
+  sample dependency.
+  By `Daniel Nouri`_ and `Olivier Grisel`_.
+
+- Add multi-output support to :class:`gaussian_process.GaussianProcess`
+  by John Novak.
+
+- Support for precomputed distance matrices in nearest neighbor estimators
+  by `Robert Layton`_ and `Joel Nothman`_.
+
+- Norm computations optimized for NumPy 1.6 and later versions by
+  `Lars Buitinck`_. In particular, the k-means algorithm no longer
+  needs a temporary data structure the size of its input.
+
+- :class:`dummy.DummyClassifier` can now be used to predict a constant
+  output value. By `Manoj Kumar`_.
+
+- :class:`dummy.DummyRegressor` now has a ``strategy`` parameter which
+  allows it to predict the mean, the median of the training set, or a
+  constant output value. By :user:`Maheshakya Wijewardena `.
+
+- Multi-label classification output in multilabel indicator format
+  is now supported by :func:`metrics.roc_auc_score` and
+  :func:`metrics.average_precision_score` by `Arnaud Joly`_.
+
+- Significant performance improvements (more than 100x speedup for
+  large problems) in :class:`isotonic.IsotonicRegression` by
+  `Andrew Tulloch`_.
+
+- Speed and memory usage improvements to the SGD algorithm for linear
+  models: it now uses threads, not separate processes, when ``n_jobs > 1``.
+  By `Lars Buitinck`_.
+
+- Grid search and cross-validation allow NaNs in the input arrays so that
+  preprocessors such as :class:`preprocessing.Imputer
+  ` can be trained within the cross-validation loop,
+  avoiding potentially skewed results.
+
+- Ridge regression can now deal with sample weights in feature space
+  (previously only in sample space). By :user:`Michael Eickenberg `.
+  Both solutions are provided by the Cholesky solver.
+
+- Several classification and regression metrics now support weighted
+  samples with the new ``sample_weight`` argument:
+  :func:`metrics.accuracy_score`,
+  :func:`metrics.zero_one_loss`,
+  :func:`metrics.precision_score`,
+  :func:`metrics.average_precision_score`,
+  :func:`metrics.f1_score`,
+  :func:`metrics.fbeta_score`,
+  :func:`metrics.recall_score`,
+  :func:`metrics.roc_auc_score`,
+  :func:`metrics.explained_variance_score`,
+  :func:`metrics.mean_squared_error`,
+  :func:`metrics.mean_absolute_error`,
+  :func:`metrics.r2_score`.
+  By `Noel Dawe`_.
+
+- Speed up of the sample generator
+  :func:`datasets.make_multilabel_classification`.
By `Joel Nothman`_.
+
+Documentation improvements
+..........................
+
+- The :ref:`Working With Text Data ` tutorial
+  has now been worked into the main documentation's tutorial section.
+  Includes exercises and skeletons for tutorial presentation.
+  Original tutorial created by several authors including
+  `Olivier Grisel`_, Lars Buitinck and many others.
+  Tutorial integration into the scikit-learn documentation
+  by `Jaques Grobler`_.
+
+- Added :ref:`Computational Performance `
+  documentation. Discussion and examples of prediction latency/throughput
+  and the different factors that influence speed. Additional tips for
+  building faster models and choosing a relevant compromise between speed
+  and predictive power.
+  By :user:`Eustache Diemert `.
+
+Bug fixes
+.........
+
+- Fixed bug in :class:`decomposition.MiniBatchDictionaryLearning`:
+  ``partial_fit`` was not working properly.
+
+- Fixed bug in :class:`linear_model.stochastic_gradient`:
+  ``l1_ratio`` was used as ``(1.0 - l1_ratio)``.
+
+- Fixed bug in :class:`multiclass.OneVsOneClassifier` with string
+  labels.
+
+- Fixed a bug in :class:`LassoCV ` and
+  :class:`ElasticNetCV `: they would not
+  pre-compute the Gram matrix with ``precompute=True`` or
+  ``precompute="auto"`` and ``n_samples > n_features``. By `Manoj Kumar`_.
+
+- Fixed incorrect estimation of the degrees of freedom in
+  :func:`feature_selection.f_regression` when variates are not centered.
+  By :user:`Virgile Fritsch `.
+
+- Fixed a race condition in parallel processing with
+  ``pre_dispatch != "all"`` (for instance, in ``cross_val_score``).
+  By `Olivier Grisel`_.
+
+- Raise an error in :class:`cluster.FeatureAgglomeration` and
+  :class:`cluster.WardAgglomeration` when no samples are given,
+  rather than returning a meaningless clustering.
+
+- Fixed bug in :class:`gradient_boosting.GradientBoostingRegressor` with
+  ``loss='huber'``: ``gamma`` might not have been initialized.
+
+- Fixed feature importances as computed with a forest of randomized trees
+  when fit with ``sample_weight != None`` and/or with ``bootstrap=True``.
+  By `Gilles Louppe`_.
+
+API changes summary
+-------------------
+
+- :mod:`sklearn.hmm` is deprecated. Its removal is planned
+  for the 0.17 release.
+
+- Use of :class:`covariance.EllipticEnvelop` has now been removed after
+  deprecation.
+  Please use :class:`covariance.EllipticEnvelope` instead.
+
+- :class:`cluster.Ward` is deprecated. Use
+  :class:`cluster.AgglomerativeClustering` instead.
+
+- :class:`cluster.WardClustering` is deprecated. Use
+  :class:`cluster.AgglomerativeClustering` instead.
+
+- :class:`cross_validation.Bootstrap` is deprecated.
+  :class:`cross_validation.KFold` or
+  :class:`cross_validation.ShuffleSplit` are recommended instead.
+
+- Direct support for the sequence of sequences (or list of lists) multilabel
+  format is deprecated. To convert to and from the supported binary
+  indicator matrix format, use
+  :class:`MultiLabelBinarizer `.
+  By `Joel Nothman`_.
+
+- Added a ``score`` method to :class:`PCA ` following
+  the model of probabilistic PCA and deprecated the
+  :class:`ProbabilisticPCA ` model whose
+  score implementation is not correct. The computation now also exploits the
+  matrix inversion lemma for faster computation. By `Alexandre Gramfort`_.
+
+- The ``score`` method of :class:`FactorAnalysis `
+  now returns the average log-likelihood of the samples. Use
+  ``score_samples`` to get the log-likelihood of each sample.
+  By `Alexandre Gramfort`_.
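+
+  A short sketch of the changed ``score`` semantics (random data for
+  illustration): ``score`` is now the mean of ``score_samples``::
+
+    import numpy as np
+    from sklearn.decomposition import FactorAnalysis
+
+    X = np.random.RandomState(0).randn(100, 5)
+    fa = FactorAnalysis(n_components=2).fit(X)
+    # Average log-likelihood of the samples...
+    print(fa.score(X))
+    # ...equals the mean of the per-sample log-likelihoods.
+    print(fa.score_samples(X).mean())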
+
+- Generating boolean masks (the setting ``indices=False``)
+  from cross-validation generators is deprecated.
+  Support for masks will be removed in 0.17.
+  The generators have produced arrays of indices by default since 0.10.
+  By `Joel Nothman`_.
+
+- 1-d arrays containing strings with ``dtype=object`` (as used in Pandas)
+  are now considered valid classification targets. This fixes a regression
+  from version 0.13 in some classifiers. By `Joel Nothman`_.
+
+- Fix wrong ``explained_variance_ratio_`` attribute in
+  :class:`RandomizedPCA `.
+  By `Alexandre Gramfort`_.
+
+- Fit alphas for each ``l1_ratio`` instead of ``mean_l1_ratio`` in
+  :class:`linear_model.ElasticNetCV` and :class:`linear_model.LassoCV`.
+  This changes the shape of ``alphas_`` from ``(n_alphas,)`` to
+  ``(n_l1_ratio, n_alphas)`` if the ``l1_ratio`` provided is a 1-D
+  array-like object of length greater than one.
+  By `Manoj Kumar`_.
+
+- Fix :class:`linear_model.ElasticNetCV` and :class:`linear_model.LassoCV`
+  when fitting an intercept on sparse input data. The automatic grid
+  of alphas was not computed correctly and the scaling with ``normalize``
+  was wrong. By `Manoj Kumar`_.
+
+- Fix wrong maximal number of features drawn (``max_features``) at each
+  split for decision trees, random forests and gradient tree boosting.
+  Previously, the count of drawn features started only after one
+  non-constant feature had been found in the split. This bug fix will
+  affect computational and generalization performance of those algorithms
+  in the presence of constant features. To get back previous generalization
+  performance, you should modify the value of ``max_features``.
+  By `Arnaud Joly`_.
+
+- Fix wrong maximal number of features drawn (``max_features``) at each
+  split for :class:`ensemble.ExtraTreesClassifier` and
+  :class:`ensemble.ExtraTreesRegressor`. Previously, only non-constant
+  features in the split were counted as drawn. Now constant features are
+  counted as drawn. Furthermore, at least one feature must be non-constant
+  in order to make a valid split. This bug fix will affect
+  computational and generalization performance of extra trees in the
+  presence of constant features. To get back previous generalization
+  performance, you should modify the value of ``max_features``.
+  By `Arnaud Joly`_.
+
+- Fix :func:`utils.compute_class_weight` when ``class_weight=="auto"``.
+  Previously it was broken for input of non-integer ``dtype`` and the
+  weighted array that was returned was wrong. By `Manoj Kumar`_.
+
+- Fix :class:`cross_validation.Bootstrap` to raise ``ValueError``
+  when ``n_train + n_test > n``. By :user:`Ronald Phlypo `.
+
+
+People
+------
+
+List of contributors for release 0.15 by number of commits.
+
+* 312 Olivier Grisel
+* 275 Lars Buitinck
+* 221 Gael Varoquaux
+* 148 Arnaud Joly
+* 134 Johannes Schönberger
+* 119 Gilles Louppe
+* 113 Joel Nothman
+* 111 Alexandre Gramfort
+* 95 Jaques Grobler
+* 89 Denis Engemann
+* 83 Peter Prettenhofer
+* 83 Alexander Fabisch
+* 62 Mathieu Blondel
+* 60 Eustache Diemert
+* 60 Nelle Varoquaux
+* 49 Michael Bommarito
+* 45 Manoj-Kumar-S
+* 28 Kyle Kastner
+* 26 Andreas Mueller
+* 22 Noel Dawe
+* 21 Maheshakya Wijewardena
+* 21 Brooke Osborn
+* 21 Hamzeh Alsalhi
+* 21 Jake VanderPlas
+* 21 Philippe Gervais
+* 19 Bala Subrahmanyam Varanasi
+* 12 Ronald Phlypo
+* 10 Mikhail Korobov
+* 8 Thomas Unterthiner
+* 8 Jeffrey Blackburne
+* 8 eltermann
+* 8 bwignall
+* 7 Ankit Agrawal
+* 7 CJ Carey
+* 6 Daniel Nouri
+* 6 Chen Liu
+* 6 Michael Eickenberg
+* 6 ugurthemaster
+* 5 Aaron Schumacher
+* 5 Baptiste Lagarde
+* 5 Rajat Khanduja
+* 5 Robert McGibbon
+* 5 Sergio Pascual
+* 4 Alexis Metaireau
+* 4 Ignacio Rossi
+* 4 Virgile Fritsch
+* 4 Sebastian Säger
+* 4 Ilambharathi Kanniah
+* 4 sdenton4
+* 4 Robert Layton
+* 4 Alyssa
+* 4 Amos Waterland
+* 3 Andrew Tulloch
+* 3 murad
+* 3 Steven Maude
+* 3 Karol Pysniak
+* 3 Jacques Kvam
+* 3 cgohlke
+* 3 cjlin
+* 3 Michael Becker
+* 3 hamzeh
+* 3 Eric Jacobsen
+* 3 john collins
+* 3 kaushik94
+* 3 Erwin Marsi
+* 2 csytracy
+* 2 LK
+* 2 Vlad Niculae
+* 2 Laurent Direr
+* 2 Erik Shilts
+* 2 Raul Garreta
+* 2 Yoshiki Vázquez Baeza
+* 2 Yung Siang Liau
+* 2 abhishek thakur
+* 2 James Yu
+* 2 Rohit Sivaprasad
+* 2 Roland Szabo
+* 2 amormachine
+* 2 Alexis Mignon
+* 2 Oscar Carlsson
+* 2 Nantas Nardelli
+* 2 jess010
+* 2 kowalski87
+* 2 Andrew Clegg
+* 2 Federico Vaggi
+* 2 Simon Frid
+* 2 Félix-Antoine Fortin
+* 1 Ralf Gommers
+* 1 t-aft
+* 1 Ronan Amicel
+* 1 Rupesh Kumar Srivastava
+* 1 Ryan Wang
+* 1 Samuel Charron
+* 1 Samuel St-Jean
+* 1 Fabian Pedregosa
+* 1 Skipper Seabold
+* 1 Stefan Walk
+* 1 Stefan van der Walt
+* 1 Stephan Hoyer
+* 1 Allen Riddell
+* 1 Valentin Haenel
+* 1 Vijay Ramesh
+* 1 Will Myers
+* 1 Yaroslav Halchenko
+* 1 Yoni Ben-Meshulam
+* 1 Yury V. Zaytsev
+* 1 adrinjalali
+* 1 ai8rahim
+* 1 alemagnani
+* 1 alex
+* 1 benjamin wilson
+* 1 chalmerlowe
+* 1 dzikie drożdże
+* 1 jamestwebber
+* 1 matrixorz
+* 1 popo
+* 1 samuela
+* 1 François Boulogne
+* 1 Alexander Measure
+* 1 Ethan White
+* 1 Guilherme Trein
+* 1 Hendrik Heuer
+* 1 IvicaJovic
+* 1 Jan Hendrik Metzen
+* 1 Jean Michel Rouly
+* 1 Eduardo Ariño de la Rubia
+* 1 Jelle Zijlstra
+* 1 Eddy L O Jansson
+* 1 Denis
+* 1 John
+* 1 John Schmidt
+* 1 Jorge Cañardo Alastuey
+* 1 Joseph Perla
+* 1 Joshua Vredevoogd
+* 1 José Ricardo
+* 1 Julien Miotte
+* 1 Kemal Eren
+* 1 Kenta Sato
+* 1 David Cournapeau
+* 1 Kyle Kelley
+* 1 Daniele Medri
+* 1 Laurent Luce
+* 1 Laurent Pierron
+* 1 Luis Pedro Coelho
+* 1 DanielWeitzenfeld
+* 1 Craig Thompson
+* 1 Chyi-Kwei Yau
+* 1 Matthew Brett
+* 1 Matthias Feurer
+* 1 Max Linke
+* 1 Chris Filo Gorgolewski
+* 1 Charles Earl
+* 1 Michael Hanke
+* 1 Michele Orrù
+* 1 Bryan Lunt
+* 1 Brian Kearns
+* 1 Paul Butler
+* 1 Paweł Mandera
+* 1 Peter
+* 1 Andrew Ash
+* 1 Pietro Zambelli
+* 1 staubda
+
diff --git a/doc/whats_new/v0.16.rst b/doc/whats_new/v0.16.rst
new file mode 100644
index 0000000000000..33d8cc47e939a
--- /dev/null
+++ b/doc/whats_new/v0.16.rst
@@ -0,0 +1,541 @@
+.. include:: _contributors.rst
+
+.. currentmodule:: sklearn
+
+.. _changes_0_16_1:
+
+Version 0.16.1
+==============
+
+**April 14, 2015**
+
+Changelog
+---------
+
+Bug fixes
+.........
+
+- Allow input data larger than ``block_size`` in
+  :class:`covariance.LedoitWolf` by `Andreas Müller`_.
+
+- Fix a bug in :class:`isotonic.IsotonicRegression` deduplication that
+  caused unstable results in :class:`calibration.CalibratedClassifierCV` by
+  `Jan Hendrik Metzen`_.
+
+- Fix sorting of labels in :func:`preprocessing.label_binarize` by
+  Michael Heilman.
+
+- Fix several stability and convergence issues in
+  :class:`cross_decomposition.CCA` and
+  :class:`cross_decomposition.PLSCanonical` by `Andreas Müller`_.
+
+- Fix a bug in :class:`cluster.KMeans` when ``precompute_distances=False``
+  on Fortran-ordered data.
+
+- Fix a speed regression in :class:`ensemble.RandomForestClassifier`'s
+  ``predict`` and ``predict_proba`` by `Andreas Müller`_.
+
+- Fix a regression where ``utils.shuffle`` converted lists and dataframes
+  to arrays. By `Olivier Grisel`_.
+
+.. _changes_0_16:
+
+Version 0.16
+============
+
+**March 26, 2015**
+
+Highlights
+----------
+
+- Speed improvements (notably in :class:`cluster.DBSCAN`), reduced memory
+  requirements, bug fixes and better default settings.
+
+- Multinomial logistic regression and a path algorithm in
+  :class:`linear_model.LogisticRegressionCV`.
+
+- Out-of-core learning of PCA via :class:`decomposition.IncrementalPCA`.
+
+- Probability calibration of classifiers using
+  :class:`calibration.CalibratedClassifierCV`.
+
+- :class:`cluster.Birch` clustering method for large-scale datasets.
+
+- Scalable approximate nearest neighbors search with locality-sensitive
+  hashing forests in :class:`neighbors.LSHForest`.
+
+- Improved error messages and better validation when using malformed
+  input data.
+
+- More robust integration with pandas dataframes.
+
+Changelog
+---------
+
+New features
+............
+
+- The new :class:`neighbors.LSHForest` implements locality-sensitive hashing
+  for approximate nearest neighbors search.
+  By :user:`Maheshakya Wijewardena`.
+
+- Added :class:`svm.LinearSVR`. This class uses the liblinear implementation
+  of Support Vector Regression which is much faster for large
+  sample sizes than :class:`svm.SVR` with a linear kernel. By
+  `Fabian Pedregosa`_ and Qiang Luo.
+
+- Incremental fit for :class:`GaussianNB `.
+
+- Added ``sample_weight`` support to :class:`dummy.DummyClassifier` and
+  :class:`dummy.DummyRegressor`. By `Arnaud Joly`_.
+
+- Added the :func:`metrics.label_ranking_average_precision_score` metric.
+  By `Arnaud Joly`_.
+
+- Added the :func:`metrics.coverage_error` metric. By `Arnaud Joly`_.
+
+- Added :class:`linear_model.LogisticRegressionCV`. By
+  `Manoj Kumar`_, `Fabian Pedregosa`_, `Gael Varoquaux`_
+  and `Alexandre Gramfort`_.
+
+- Added a ``warm_start`` constructor parameter to make it possible for any
+  trained forest model to grow additional trees incrementally. By
+  :user:`Laurent Direr`.
+
+- Added ``sample_weight`` support to
+  :class:`ensemble.GradientBoostingClassifier` and
+  :class:`ensemble.GradientBoostingRegressor`. By `Peter Prettenhofer`_.
+
+- Added :class:`decomposition.IncrementalPCA`, an implementation of the PCA
+  algorithm that supports out-of-core learning with a ``partial_fit``
+  method. By `Kyle Kastner`_.
+
+- Averaged SGD for :class:`SGDClassifier `
+  and :class:`SGDRegressor `. By
+  :user:`Danny Sullivan `.
+
+- Added the :func:`cross_val_predict `
+  function which computes cross-validated estimates.
+  By `Luis Pedro Coelho`_.
+
+- Added :class:`linear_model.TheilSenRegressor`, a robust
+  generalized-median-based estimator. By :user:`Florian Wilhelm `.
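+
+  A minimal sketch of the new robust estimator (synthetic data with a few
+  corrupted targets, for illustration)::
+
+    import numpy as np
+    from sklearn.linear_model import TheilSenRegressor
+
+    rng = np.random.RandomState(0)
+    X = rng.randn(100, 1)
+    y = 3 * X.ravel() + 0.1 * rng.randn(100)
+    y[:10] += 20  # a few gross outliers
+    reg = TheilSenRegressor(random_state=0).fit(X, y)
+    print(reg.coef_)  # close to 3 despite the outliers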
+
+- Added :func:`metrics.median_absolute_error`, a robust metric.
+  By `Gael Varoquaux`_ and :user:`Florian Wilhelm `.
+
+- Add :class:`cluster.Birch`, an online clustering algorithm. By
+  `Manoj Kumar`_, `Alexandre Gramfort`_ and `Joel Nothman`_.
+
+- Added shrinkage support to
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis` using two new
+  solvers. By :user:`Clemens Brunner ` and `Martin Billinger`_.
+
+- Added :class:`kernel_ridge.KernelRidge`, an implementation of
+  kernelized ridge regression.
+  By `Mathieu Blondel`_ and `Jan Hendrik Metzen`_.
+
+- All solvers in :class:`linear_model.Ridge` now support ``sample_weight``.
+  By `Mathieu Blondel`_.
+
+- Added :class:`cross_validation.PredefinedSplit` cross-validation
+  for fixed user-provided cross-validation folds.
+  By :user:`Thomas Unterthiner `.
+
+- Added :class:`calibration.CalibratedClassifierCV`, an approach for
+  calibrating the predicted probabilities of a classifier.
+  By `Alexandre Gramfort`_, `Jan Hendrik Metzen`_, `Mathieu Blondel`_
+  and :user:`Balazs Kegl `.
+
+
+Enhancements
+............
+
+- Add option ``return_distance`` in :func:`hierarchical.ward_tree`
+  to return distances between nodes for both structured and unstructured
+  versions of the algorithm. By `Matteo Visconti di Oleggio Castello`_.
+  The same option was added in :func:`hierarchical.linkage_tree`.
+  By `Manoj Kumar`_.
+
+- Add support for sample weights in scorer objects. Metrics with sample
+  weight support will automatically benefit from it. By `Noel Dawe`_ and
+  `Vlad Niculae`_.
+
+- Added ``newton-cg`` and ``lbfgs`` solver support in
+  :class:`linear_model.LogisticRegression`. By `Manoj Kumar`_.
+
+- Add ``selection="random"`` parameter to implement stochastic coordinate
+  descent for :class:`linear_model.Lasso`, :class:`linear_model.ElasticNet`
+  and related estimators. By `Manoj Kumar`_.
+
+- Add ``sample_weight`` parameter to
+  :func:`metrics.jaccard_similarity_score` and :func:`metrics.log_loss`.
+  By :user:`Jatin Shah `.
+
+- Support sparse multilabel indicator representation in
+  :class:`preprocessing.LabelBinarizer` and
+  :class:`multiclass.OneVsRestClassifier` (by :user:`Hamzeh Alsalhi `
+  with thanks to Rohit Sivaprasad), as well as evaluation metrics (by
+  `Joel Nothman`_).
+
+- Add support for multiclass in ``metrics.hinge_loss``. Added
+  ``labels=None`` as an optional parameter. By Saurabh Jha.
+
+- Add ``sample_weight`` parameter to ``metrics.hinge_loss``.
+  By Saurabh Jha.
+
+- Add ``multi_class="multinomial"`` option in
+  :class:`linear_model.LogisticRegression` to implement a logistic
+  regression solver that minimizes the cross-entropy or multinomial loss
+  instead of the default One-vs-Rest setting. Supports ``lbfgs`` and
+  ``newton-cg`` solvers. By `Lars Buitinck`_ and `Manoj Kumar`_. Solver
+  option ``newton-cg`` by Simon Wu.
+
+- ``DictVectorizer`` can now perform ``fit_transform`` on an iterable in a
+  single pass, when given the option ``sort=False``. By :user:`Dan
+  Blanchard `.
+
+- :class:`GridSearchCV` and :class:`RandomizedSearchCV` can now be
+  configured to work with estimators that may fail and raise errors on
+  individual folds. This option is controlled by the ``error_score``
+  parameter. This does not affect errors raised on re-fit. By
+  :user:`Michal Romaniuk `.
+
+- Add ``digits`` parameter to ``metrics.classification_report`` to allow
+  the report to show different precision for floating point numbers. By
+  :user:`Ian Gilmore `.
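+
+  A minimal sketch of the new ``digits`` parameter (labels are arbitrary)::
+
+    from sklearn.metrics import classification_report
+
+    y_true = [0, 1, 1, 0, 1]
+    y_pred = [0, 1, 0, 0, 1]
+    # Report precision/recall/f1 with four decimal places instead of two.
+    print(classification_report(y_true, y_pred, digits=4))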
+
+- Add a quantile prediction strategy to the :class:`dummy.DummyRegressor`.
+  By :user:`Aaron Staple `.
+
+- Add ``handle_unknown`` option to :class:`preprocessing.OneHotEncoder` to
+  handle unknown categorical features more gracefully during transform.
+  By `Manoj Kumar`_.
+
+- Added support for sparse input data to decision trees and their ensembles.
+  By `Fares Hedyati`_ and `Arnaud Joly`_.
+
+- Optimized :class:`cluster.AffinityPropagation` by reducing the number of
+  memory allocations of large temporary data structures. By `Antony Lee`_.
+
+- Parallelization of the computation of feature importances in random
+  forests. By `Olivier Grisel`_ and `Arnaud Joly`_.
+
+- Add ``n_iter_`` attribute to estimators that accept a ``max_iter``
+  attribute in their constructor. By `Manoj Kumar`_.
+
+- Added a decision function for :class:`multiclass.OneVsOneClassifier`.
+  By `Raghav RV`_ and :user:`Kyle Beauchamp `.
+
+- :func:`neighbors.kneighbors_graph` and :func:`radius_neighbors_graph`
+  support non-Euclidean metrics. By `Manoj Kumar`_.
+
+- The ``connectivity`` parameter in :class:`cluster.AgglomerativeClustering`
+  and family now accepts callables that return a connectivity matrix.
+  By `Manoj Kumar`_.
+
+- Sparse support for :func:`paired_distances`. By `Joel Nothman`_.
+
+- :class:`cluster.DBSCAN` now supports sparse input and sample weights and
+  has been optimized: the inner loop has been rewritten in Cython and
+  radius neighbors queries are now computed in batch. By `Joel Nothman`_
+  and `Lars Buitinck`_.
+
+- Add ``class_weight`` parameter to automatically weight samples by class
+  frequency for :class:`ensemble.RandomForestClassifier`,
+  :class:`tree.DecisionTreeClassifier`,
+  :class:`ensemble.ExtraTreesClassifier`
+  and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_.
+
+- :class:`grid_search.RandomizedSearchCV` now does sampling without
+  replacement if all parameters are given as lists. By `Andreas Müller`_.
+
+- Parallelized calculation of :func:`pairwise_distances` is now supported
+  for scipy metrics and custom callables. By `Joel Nothman`_.
+
+- Allow the fitting and scoring of all clustering algorithms in
+  :class:`pipeline.Pipeline`. By `Andreas Müller`_.
+
+- More robust seeding and improved error messages in
+  :class:`cluster.MeanShift` by `Andreas Müller`_.
+
+- Make the stopping criterion for :class:`mixture.GMM`,
+  :class:`mixture.DPGMM` and :class:`mixture.VBGMM` less dependent on the
+  number of samples by thresholding the average log-likelihood change
+  instead of its sum over all samples. By `Hervé Bredin`_.
+
+- The outcome of :func:`manifold.spectral_embedding` was made deterministic
+  by flipping the sign of eigenvectors. By :user:`Hasil Sharma `.
+
+- Significant performance and memory usage improvements in
+  :class:`preprocessing.PolynomialFeatures`. By `Eric Martin`_.
+
+- Numerical stability improvements for :class:`preprocessing.StandardScaler`
+  and :func:`preprocessing.scale`. By `Nicolas Goix`_.
+
+- :class:`svm.SVC` fitted on sparse input now implements
+  ``decision_function``. By `Rob Zinkov`_ and `Andreas Müller`_.
+
+- :func:`cross_validation.train_test_split` now preserves the input type,
+  instead of converting to NumPy arrays.
+
+
+Documentation improvements
+..........................
+
+- Added an example of using :class:`FeatureUnion` for heterogeneous input.
+  By :user:`Matt Terry `.
+
+- Documentation on scorers was improved, to highlight the handling of loss
+  functions. By :user:`Matt Pico `.
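+
+  For illustration of the documented convention (a sketch): error metrics
+  are wrapped with ``greater_is_better=False`` so that model selection
+  still maximizes the resulting (negated) score::
+
+    from sklearn.metrics import make_scorer, mean_squared_error
+
+    # Lower MSE is better, so the scorer negates it internally.
+    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)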
+
+- A discrepancy between liblinear output and scikit-learn's wrappers
+  is now noted. By `Manoj Kumar`_.
+
+- Improved documentation generation: examples referring to a class or
+  function are now shown in a gallery on the class/function's API reference
+  page. By `Joel Nothman`_.
+
+- More explicit documentation of sample generators and of data
+  transformation. By `Joel Nothman`_.
+
+- :class:`sklearn.neighbors.BallTree` and :class:`sklearn.neighbors.KDTree`
+  used to point to empty pages stating that they are aliases of BinaryTree.
+  This has been fixed to show the correct class docs. By `Manoj Kumar`_.
+
+- Added silhouette plots for the analysis of KMeans clustering using
+  :func:`metrics.silhouette_samples` and :func:`metrics.silhouette_score`.
+  See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.
+
+Bug fixes
+.........
+
+- Meta-estimators now support duck typing for the presence of
+  ``decision_function``, ``predict_proba`` and other methods. This fixes
+  the behavior of :class:`grid_search.GridSearchCV`,
+  :class:`grid_search.RandomizedSearchCV`, :class:`pipeline.Pipeline`,
+  :class:`feature_selection.RFE` and :class:`feature_selection.RFECV`
+  when nested. By `Joel Nothman`_.
+
+- The ``scoring`` attribute of grid-search and cross-validation methods is
+  no longer ignored when a :class:`grid_search.GridSearchCV` is given as a
+  base estimator or the base estimator doesn't have ``predict``.
+
+- The function :func:`hierarchical.ward_tree` now returns the children in
+  the same order for both the structured and unstructured versions. By
+  `Matteo Visconti di Oleggio Castello`_.
+
+- :class:`feature_selection.RFECV` now correctly handles cases when
+  ``step`` is not equal to 1. By :user:`Nikolay Mayorov `.
+
+- The :class:`decomposition.PCA` now undoes whitening in its
+  ``inverse_transform``. Also, its ``components_`` now always have unit
+  length. By :user:`Michael Eickenberg `.
+
+- Fix incomplete download of the dataset when
+  :func:`datasets.download_20newsgroups` is called. By `Manoj Kumar`_.
+
+- Various fixes to the Gaussian processes subpackage by Vincent Dubourg
+  and Jan Hendrik Metzen.
+
+- Calling ``partial_fit`` with ``class_weight=='auto'`` now raises an
+  appropriate error message and suggests a workaround.
+  By :user:`Danny Sullivan `.
+
+- :class:`RBFSampler ` with ``gamma=g``
+  formerly approximated :func:`rbf_kernel `
+  with ``gamma=g/2.``; the definition of ``gamma`` is now consistent,
+  which may substantially change your results if you use a fixed value.
+  (If you cross-validated over ``gamma``, it probably doesn't matter
+  too much.) By :user:`Dougal Sutherland `.
+
+- Pipeline objects now delegate the ``classes_`` attribute to the
+  underlying estimator. This allows, for instance, bagging of a pipeline
+  object. By `Arnaud Joly`_.
+
+- :class:`neighbors.NearestCentroid` now uses the median as the centroid
+  when the metric is set to ``manhattan``. It was using the mean before.
+  By `Manoj Kumar`_.
+
+- Fix numerical stability issues in :class:`linear_model.SGDClassifier`
+  and :class:`linear_model.SGDRegressor` by clipping large gradients and
+  ensuring that weight decay rescaling is always positive (for large
+  l2 regularization and large learning rate values).
+  By `Olivier Grisel`_.
+
+- When ``compute_full_tree`` was set to "auto", the full tree was
+  built when ``n_clusters`` was high and stopped early when ``n_clusters``
+  was low, while the behavior should have been vice-versa in
+  :class:`cluster.AgglomerativeClustering` (and friends).
+  This has been fixed by `Manoj Kumar`_.
+
+- Fix lazy centering of data in :func:`linear_model.enet_path` and
+  :func:`linear_model.lasso_path`. It was centered around one. It has
+  been changed to be centered around the origin. By `Manoj Kumar`_.
+
+- Fix handling of precomputed affinity matrices in
+  :class:`cluster.AgglomerativeClustering` when using connectivity
+  constraints. By :user:`Cathy Deng `.
+
+- Correct ``partial_fit`` handling of ``class_prior`` for
+  :class:`sklearn.naive_bayes.MultinomialNB` and
+  :class:`sklearn.naive_bayes.BernoulliNB`. By `Trevor Stephens`_.
+
+- Fixed a crash in :func:`metrics.precision_recall_fscore_support`
+  when using unsorted ``labels`` in the multi-label setting.
+  By `Andreas Müller`_.
+
+- Avoid skipping the first nearest neighbor in the methods
+  ``radius_neighbors``, ``kneighbors``, ``kneighbors_graph`` and
+  ``radius_neighbors_graph`` in :class:`sklearn.neighbors.NearestNeighbors`
+  and family, when the query data is not the same as the fit data.
+  By `Manoj Kumar`_.
+
+- Fix log-density calculation in the :class:`mixture.GMM` with
+  tied covariance. By `Will Dawson`_.
+
+- Fixed a scaling error in :class:`feature_selection.SelectFdr`
+  where a factor ``n_features`` was missing. By `Andrew Tulloch`_.
+
+- Fix zero division in :class:`neighbors.KNeighborsRegressor` and related
+  classes when using distance weighting and having identical data points.
+  By `Garret-R `_.
+
+- Fixed round-off errors with non-positive-definite covariance matrices
+  in GMM. By :user:`Alexis Mignon `.
+
+- Fixed an error in the computation of conditional probabilities in
+  :class:`naive_bayes.BernoulliNB`. By `Hanna Wallach`_.
+
+- Make the method ``radius_neighbors`` of
+  :class:`neighbors.NearestNeighbors` return the samples lying on the
+  boundary for ``algorithm='brute'``. By `Yan Yi`_.
+
+- Flip the sign of ``dual_coef_`` of :class:`svm.SVC`
+  to make it consistent with the documentation and
+  ``decision_function``. By Artem Sobolev.
+
+- Fixed handling of ties in :class:`isotonic.IsotonicRegression`.
+  We now use the weighted average of targets (secondary method). By
+  `Andreas Müller`_ and `Michael Bommarito `_.
+
+API changes summary
+-------------------
+
+- :class:`GridSearchCV ` and
+  :func:`cross_val_score ` and other
+  meta-estimators don't convert pandas DataFrames into arrays any more,
+  allowing DataFrame-specific operations in custom estimators.
+
+- :func:`multiclass.fit_ovr`, :func:`multiclass.predict_ovr`,
+  :func:`predict_proba_ovr`,
+  :func:`multiclass.fit_ovo`, :func:`multiclass.predict_ovo`,
+  :func:`multiclass.fit_ecoc` and :func:`multiclass.predict_ecoc`
+  are deprecated. Use the underlying estimators instead.
+
+- Nearest neighbors estimators used to take arbitrary keyword arguments
+  and pass these to their distance metric. This will no longer be supported
+  in scikit-learn 0.18; use the ``metric_params`` argument instead.
+
+- The ``n_jobs`` parameter of the ``fit`` method was moved to the
+  constructor of the ``LinearRegression`` class.
+
+- The ``predict_proba`` method of :class:`multiclass.OneVsRestClassifier`
+  now returns two probabilities per sample in the multiclass case; this
+  is consistent with other estimators and with the method's documentation,
+  but previous versions accidentally returned only the positive
+  probability. Fixed by Will Lamond and `Lars Buitinck`_.
+
+- Changed the default value of ``precompute`` in :class:`ElasticNet` and
+  :class:`Lasso` to ``False``.
+  Setting ``precompute`` to "auto" was found to be slower when
+  ``n_samples > n_features`` since the computation of the Gram matrix is
+  computationally expensive and outweighs the benefit of fitting the Gram
+  matrix for just one alpha.
+  ``precompute="auto"`` is now deprecated and will be removed in 0.18.
+  By `Manoj Kumar`_.
+
+- Expose the ``positive`` option in :func:`linear_model.enet_path` and
+  :func:`linear_model.lasso_path`, which constrains coefficients to be
+  positive. By `Manoj Kumar`_.
+
+- Users should now supply an explicit ``average`` parameter to
+  :func:`sklearn.metrics.f1_score`, :func:`sklearn.metrics.fbeta_score`,
+  :func:`sklearn.metrics.recall_score` and
+  :func:`sklearn.metrics.precision_score` when performing multiclass
+  or multilabel (i.e. not binary) classification. By `Joel Nothman`_.
+
+- The ``scoring`` parameter for cross-validation now accepts
+  ``'f1_micro'``, ``'f1_macro'`` or ``'f1_weighted'``. ``'f1'`` is now for
+  binary classification only. Similar changes apply to ``'precision'`` and
+  ``'recall'``. By `Joel Nothman`_.
+
+- The ``fit_intercept``, ``normalize`` and ``return_models`` parameters in
+  :func:`linear_model.enet_path` and :func:`linear_model.lasso_path` have
+  been removed. They had been deprecated since 0.14.
+
+- From now onwards, all estimators will uniformly raise ``NotFittedError``
+  (:class:`utils.validation.NotFittedError`) when any of the
+  ``predict``-like methods are called before the model is fit.
+  By `Raghav RV`_.
+
+- Input data validation was refactored for more consistent input
+  validation. The ``check_arrays`` function was replaced by ``check_array``
+  and ``check_X_y``. By `Andreas Müller`_.
+
+- Allow ``X=None`` in the methods ``radius_neighbors``, ``kneighbors``,
+  ``kneighbors_graph`` and ``radius_neighbors_graph`` in
+  :class:`sklearn.neighbors.NearestNeighbors` and family. If set to
+  ``None``, then for every sample this avoids setting the sample itself as
+  the first nearest neighbor. By `Manoj Kumar`_.
+
+- Add parameter ``include_self`` in :func:`neighbors.kneighbors_graph`
+  and :func:`neighbors.radius_neighbors_graph`, which has to be explicitly
+  set by the user. If set to ``True``, then the sample itself is considered
+  as the first nearest neighbor.
+
+- The ``thresh`` parameter is deprecated in favor of the new ``tol``
+  parameter in :class:`GMM`, :class:`DPGMM` and :class:`VBGMM`. See the
+  `Enhancements` section for details. By `Hervé Bredin`_.
+
+- Estimators will treat input with dtype ``object`` as numeric when
+  possible. By `Andreas Müller`_.
+
+- Estimators now raise ``ValueError`` consistently when fitted on empty
+  data (less than 1 sample or less than 1 feature for 2D input).
+  By `Olivier Grisel`_.
+
+- The ``shuffle`` option of :class:`linear_model.SGDClassifier`,
+  :class:`linear_model.SGDRegressor`, :class:`linear_model.Perceptron`,
+  :class:`linear_model.PassiveAggressiveClassifier` and
+  :class:`linear_model.PassiveAggressiveRegressor` now defaults to
+  ``True``.
+
+- :class:`cluster.DBSCAN` now uses a deterministic initialization. The
+  ``random_state`` parameter is deprecated.
+  By :user:`Erich Schubert `.
+
+Code Contributors
+-----------------
+A. Flaxman, Aaron Schumacher, Aaron Staple, abhishek thakur, Akshay,
+akshayah3, Aldrian Obaja, Alexander Fabisch, Alexandre Gramfort, Alexis
+Mignon, Anders Aagaard, Andreas Mueller, Andreas van Cranenburgh, Andrew
+Tulloch, Andrew Walker, Antony Lee, Arnaud Joly, banilo, Barmaley.exe, Ben
+Davies, Benedikt Koehler, bhsu, Boris Feld, Borja Ayerdi, Boyuan Deng, Brent
+Pedersen, Brian Wignall, Brooke Osborn, Calvin Giles, Cathy Deng, Celeo,
+cgohlke, chebee7i, Christian Stade-Schuldt, Christof Angermueller, Chyi-Kwei
+Yau, CJ Carey, Clemens Brunner, Daiki Aminaka, Dan Blanchard, danfrankj,
+Danny Sullivan, David Fletcher, Dmitrijs Milajevs, Dougal J. Sutherland,
+Erich Schubert, Fabian Pedregosa, Florian Wilhelm, floydsoft, Félix-Antoine
+Fortin, Gael Varoquaux, Garrett-R, Gilles Louppe, gpassino, gwulfs, Hampus
+Bengtsson, Hamzeh Alsalhi, Hanna Wallach, Harry Mavroforakis, Hasil Sharma,
+Helder, Herve Bredin, Hsiang-Fu Yu, Hugues SALAMIN, Ian Gilmore,
+Ilambharathi Kanniah, Imran Haque, isms, Jake VanderPlas, Jan Dlabal, Jan
+Hendrik Metzen, Jatin Shah, Javier López Peña, jdcaballero, Jean Kossaifi,
+Jeff Hammerbacher, Joel Nothman, Jonathan Helmus, Joseph, Kaicheng Zhang,
+Kevin Markham, Kyle Beauchamp, Kyle Kastner, Lagacherie Matthieu, Lars
+Buitinck, Laurent Direr, leepei, Loic Esteve, Luis Pedro Coelho, Lukas
+Michelbacher, maheshakya, Manoj Kumar, Manuel, Mario Michael Krell, Martin,
+Martin Billinger, Martin Ku, Mateusz Susik, Mathieu Blondel, Matt Pico, Matt
+Terry, Matteo Visconti dOC, Matti Lyra, Max Linke, Mehdi Cherti, Michael
+Bommarito, Michael Eickenberg, Michal Romaniuk, MLG, mr.Shu, Nelle
+Varoquaux, Nicola Montecchio, Nicolas, Nikolay Mayorov, Noel Dawe, Okal
+Billy, Olivier Grisel, Óscar Nájera, Paolo Puggioni, Peter Prettenhofer,
+Pratap Vardhan, pvnguyen, queqichao, Rafael Carrascosa, Raghav R V, Rahiel
+Kasim, Randall Mason, Rob Zinkov, Robert Bradshaw, Saket Choudhary, Sam
+Nicholls, Samuel Charron, Saurabh Jha, sethdandridge, sinhrks, snuderl,
+Stefan Otte, Stefan van der Walt, Steve Tjoa, swu, Sylvain Zimmer, tejesh95,
+terrycojones, Thomas Delteil, Thomas Unterthiner, Tomas Kazmar,
+trevorstephens, tttthomasssss, Tzu-Ming Kuo, ugurcaliskan, ugurthemaster,
+Vinayak Mehta, Vincent Dubourg, Vjacheslav Murashkin, Vlad Niculae,
+wadawson, Wei Xue, Will Lamond, Wu Jiang, x0l, Xinfan Meng, Yan Yi, Yu-Chin
+
diff --git a/doc/whats_new/v0.17.rst b/doc/whats_new/v0.17.rst
new file mode 100644
index 0000000000000..35e895e5d4188
--- /dev/null
+++ b/doc/whats_new/v0.17.rst
@@ -0,0 +1,511 @@
+.. include:: _contributors.rst
+
+.. currentmodule:: sklearn
+
+.. _changes_0_17_1:
+
+Version 0.17.1
+==============
+
+**February 18, 2016**
+
+Changelog
+---------
+
+Bug fixes
+.........
+
+- Upgrade vendored joblib to version 0.9.4, which fixes an important bug in
+  ``joblib.Parallel`` that can silently yield wrong results when working
+  on datasets larger than 1MB:
+  https://github.com/joblib/joblib/blob/0.9.4/CHANGES.rst
+
+- Fixed reading of Bunch pickles generated with scikit-learn
+  version <= 0.16. This can affect users who have already
+  downloaded a dataset with scikit-learn 0.16 and are loading it
+  with scikit-learn 0.17. See :issue:`6196` for
+  how this affected :func:`datasets.fetch_20newsgroups`. By `Loic
+  Esteve`_.
+
+- Fixed a bug that prevented using the ROC AUC score to perform grid search
+  on several CPUs/cores on large arrays. See :issue:`6147`.
+  By `Olivier Grisel`_.
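+
+  A sketch of the now-working pattern (using the ``sklearn.grid_search``
+  module of this release line; later versions provide the same class in
+  ``sklearn.model_selection``)::
+
+    from sklearn.datasets import make_classification
+    from sklearn.grid_search import GridSearchCV
+    from sklearn.linear_model import LogisticRegression
+
+    X, y = make_classification(n_samples=1000, random_state=0)
+    # ROC AUC scoring combined with multi-core grid search.
+    search = GridSearchCV(LogisticRegression(), {"C": [0.1, 1.0, 10.0]},
+                          scoring="roc_auc", n_jobs=2)
+    search.fit(X, y)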
+
+- Fixed a bug that prevented properly setting the ``presort`` parameter
+  in :class:`ensemble.GradientBoostingRegressor`. See :issue:`5857`.
+  By Andrew McCulloh.
+
+- Fixed a joblib error when evaluating the perplexity of a
+  :class:`decomposition.LatentDirichletAllocation` model. See :issue:`6258`.
+  By Chyi-Kwei Yau.
+
+
+.. _changes_0_17:
+
+Version 0.17
+============
+
+**November 5, 2015**
+
+Changelog
+---------
+
+New features
+............
+
+- All the scaler classes except :class:`preprocessing.RobustScaler` can be
+  fitted online by calling ``partial_fit``.
+  By :user:`Giorgio Patrini `.
+
+- The new class :class:`ensemble.VotingClassifier` implements a
+  "majority rule" / "soft voting" ensemble classifier to combine
+  estimators for classification. By `Sebastian Raschka`_.
+
+- The new class :class:`preprocessing.RobustScaler` provides an
+  alternative to :class:`preprocessing.StandardScaler` for feature-wise
+  centering and range normalization that is robust to outliers.
+  By :user:`Thomas Unterthiner `.
+
+- The new class :class:`preprocessing.MaxAbsScaler` provides an
+  alternative to :class:`preprocessing.MinMaxScaler` for feature-wise
+  range normalization when the data is already centered or sparse.
+  By :user:`Thomas Unterthiner `.
+
+- The new class :class:`preprocessing.FunctionTransformer` turns a Python
+  function into a ``Pipeline``-compatible transformer object.
+  By Joe Jevnik.
+
+- The new classes :class:`cross_validation.LabelKFold` and
+  :class:`cross_validation.LabelShuffleSplit` generate train-test folds,
+  respectively similar to :class:`cross_validation.KFold` and
+  :class:`cross_validation.ShuffleSplit`, except that the folds are
+  conditioned on a label array. By `Brian McFee`_, :user:`Jean
+  Kossaifi ` and `Gilles Louppe`_.
+
+- :class:`decomposition.LatentDirichletAllocation` implements the Latent
+  Dirichlet Allocation topic model with online variational
+  inference. By :user:`Chyi-Kwei Yau `, with code based on an
+  implementation by Matt Hoffman. (:issue:`3659`)
+
+- The new solver ``sag`` implements Stochastic Average Gradient descent
+  and is available in both :class:`linear_model.LogisticRegression` and
+  :class:`linear_model.Ridge`. This solver is very efficient for large
+  datasets. By :user:`Danny Sullivan ` and `Tom Dupre la Tour`_.
+  (:issue:`4738`)
+
+- The new solver ``cd`` implements coordinate descent in
+  :class:`decomposition.NMF`. The previous solver, based on projected
+  gradient, is still available by setting the new parameter ``solver`` to
+  ``pg``, but is deprecated and will be removed in 0.19, along with
+  :class:`decomposition.ProjectedGradientNMF` and the parameters
+  ``sparseness``, ``eta``, ``beta`` and ``nls_max_iter``. New parameters
+  ``alpha`` and ``l1_ratio`` control L1 and L2 regularization, and
+  ``shuffle`` adds a shuffling step in the ``cd`` solver.
+  By `Tom Dupre la Tour`_ and `Mathieu Blondel`_.
+
+Enhancements
+............
+
+- :class:`manifold.TSNE` now supports approximate optimization via the
+  Barnes-Hut method, leading to much faster fitting.
+  By Christopher Erick Moody. (:issue:`4025`)
+
+- :class:`cluster.mean_shift_.MeanShift` now supports parallel execution,
+  as implemented in the ``mean_shift`` function. By :user:`Martino
+  Sorbaro `.
+
+- :class:`naive_bayes.GaussianNB` now supports fitting with
+  ``sample_weight``. By `Jan Hendrik Metzen`_.
+
+- :class:`dummy.DummyClassifier` now supports a prior fitting strategy.
+  By `Arnaud Joly`_.
+
+- Added a ``fit_predict`` method for :class:`mixture.GMM` and subclasses.
+  By :user:`Cory Lorenz `.
+
+- Added the :func:`metrics.label_ranking_loss` metric.
+  By `Arnaud Joly`_.
+
+- Added the :func:`metrics.cohen_kappa_score` metric.
+
+- Added a ``warm_start`` constructor parameter to the bagging ensemble
+  models to increase the size of the ensemble.
+  By :user:`Tim Head `.
+
+- Added the option to use multi-output regression metrics without
+  averaging. By Konstantin Shmelkov and :user:`Michael Eickenberg`.
+
+- Added a ``stratify`` option to :func:`cross_validation.train_test_split`
+  for stratified splitting. By Miroslav Batchkarov.
+
+- The :func:`tree.export_graphviz` function now supports aesthetic
+  improvements for :class:`tree.DecisionTreeClassifier` and
+  :class:`tree.DecisionTreeRegressor`, including options for coloring nodes
+  by their majority class or impurity, showing variable names, and using
+  node proportions instead of raw sample counts. By `Trevor Stephens`_.
+
+- Improved the speed of the ``newton-cg`` solver in
+  :class:`linear_model.LogisticRegression`, by avoiding loss computation.
+  By `Mathieu Blondel`_ and `Tom Dupre la Tour`_.
+
+- The ``class_weight="auto"`` heuristic in classifiers supporting
+  ``class_weight`` was deprecated and replaced by the
+  ``class_weight="balanced"`` option, which has a simpler formula and
+  interpretation. By `Hanna Wallach`_ and `Andreas Müller`_.
+
+- Add ``class_weight`` parameter to automatically weight samples by class
+  frequency for :class:`linear_model.PassiveAggressiveClassifier`. By
+  `Trevor Stephens`_.
+
+- Added backlinks from the API reference pages to the user guide. By
+  `Andreas Müller`_.
+
+- The ``labels`` parameter to :func:`sklearn.metrics.f1_score`,
+  :func:`sklearn.metrics.fbeta_score`,
+  :func:`sklearn.metrics.recall_score` and
+  :func:`sklearn.metrics.precision_score` has been extended.
+  It is now possible to ignore one or more labels, such as where
+  a multiclass problem has a majority class to ignore. By `Joel Nothman`_.
+
+- Add ``sample_weight`` support to :class:`linear_model.RidgeClassifier`.
+  By `Trevor Stephens`_.
+
+- Provide an option for sparse output from
+  :func:`sklearn.metrics.pairwise.cosine_similarity`. By
+  :user:`Jaidev Deshpande `.
+
+- Add :func:`minmax_scale` to provide a function interface for
+  :class:`MinMaxScaler`. By :user:`Thomas Unterthiner `.
+
+- ``dump_svmlight_file`` now handles multi-label datasets.
+  By Chih-Wei Chang.
+
+- RCV1 dataset loader (:func:`sklearn.datasets.fetch_rcv1`).
+  By `Tom Dupre la Tour`_.
+
+- The "Wisconsin Breast Cancer" classical two-class classification dataset
+  is now included in scikit-learn, available with
+  :func:`sklearn.datasets.load_breast_cancer`.
+
+- Upgraded to joblib 0.9.3 to benefit from the new automatic batching of
+  short tasks. This makes it possible for scikit-learn to benefit from
+  parallelism when many very short tasks are executed in parallel, for
+  instance by the :class:`grid_search.GridSearchCV` meta-estimator
+  with ``n_jobs > 1`` used with a large grid of parameters on a small
+  dataset. By `Vlad Niculae`_, `Olivier Grisel`_ and `Loic Esteve`_.
+
+- For more details about changes in joblib 0.9.3 see the release notes:
+  https://github.com/joblib/joblib/blob/master/CHANGES.rst#release-093
+
+- Improved speed (by a factor of 3 per iteration) of
+  :class:`decomposition.DictionaryLearning` with the coordinate descent
+  method from :class:`linear_model.Lasso`.
+  By :user:`Arthur Mensch `.
+
+- Parallel processing (threaded) for queries of nearest neighbors
+  (using the ball-tree) by Nikolay Mayorov.
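+
+  A sketch of a parallel ball-tree query (assuming the ``n_jobs``
+  constructor parameter that accompanies this change; data is random,
+  for illustration)::
+
+    import numpy as np
+    from sklearn.neighbors import NearestNeighbors
+
+    X = np.random.RandomState(0).randn(1000, 3)
+    nn = NearestNeighbors(n_neighbors=5, algorithm="ball_tree",
+                          n_jobs=2).fit(X)
+    # Queries are executed by multiple threads over the ball-tree.
+    distances, indices = nn.kneighbors(X[:10])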
+
+- Allow :func:`datasets.make_multilabel_classification` to output
+  a sparse ``y``. By Kashif Rasul.
+
+- :class:`cluster.DBSCAN` now accepts a sparse matrix of precomputed
+  distances, allowing memory-efficient distance precomputation. By
+  `Joel Nothman`_.
+
+- :class:`tree.DecisionTreeClassifier` now exposes an ``apply`` method
+  for retrieving the leaf indices that samples are predicted as. By
+  :user:`Daniel Galvez ` and `Gilles Louppe`_.
+
+- Speed up decision tree regressors, random forest regressors, extra trees
+  regressors and gradient boosting estimators by computing a proxy
+  of the impurity improvement during the tree growth. The proxy quantity is
+  such that the split that maximizes this value also maximizes the impurity
+  improvement. By `Arnaud Joly`_, :user:`Jacob Schreiber `
+  and `Gilles Louppe`_.
+
+- Speed up tree-based methods by reducing the number of computations needed
+  when computing the impurity measure, taking into account the linear
+  relationship of the computed statistics. The effect is particularly
+  visible with extra trees and on datasets with categorical or sparse
+  features. By `Arnaud Joly`_.
+
+- :class:`ensemble.GradientBoostingRegressor` and
+  :class:`ensemble.GradientBoostingClassifier` now expose an ``apply``
+  method for retrieving the leaf indices each sample ends up in under
+  each tree. By :user:`Jacob Schreiber `.
+
+- Add ``sample_weight`` support to :class:`linear_model.LinearRegression`.
+  By Sonny Hu. (:issue:`4881`)
+
+- Add ``n_iter_without_progress`` to :class:`manifold.TSNE` to control
+  the stopping criterion. By Santi Villalba. (:issue:`5186`)
+
+- Added optional parameter ``random_state`` in :class:`linear_model.Ridge`,
+  to set the seed of the pseudo-random generator used in the ``sag``
+  solver. By `Tom Dupre la Tour`_.
+
+- Added optional parameter ``warm_start`` in
+  :class:`linear_model.LogisticRegression`. If set to ``True``, the solvers
+  ``lbfgs``, ``newton-cg`` and ``sag`` will be initialized with the
+  coefficients computed in the previous fit. By `Tom Dupre la Tour`_.
+
+- Added ``sample_weight`` support to
+  :class:`linear_model.LogisticRegression` for
+  the ``lbfgs``, ``newton-cg``, and ``sag`` solvers.
+  By `Valentin Stolbunov`_.
+  Support added to the ``liblinear`` solver. By `Manoj Kumar`_.
+
+- Added optional parameter ``presort`` to
+  :class:`ensemble.GradientBoostingRegressor`
+  and :class:`ensemble.GradientBoostingClassifier`, keeping default
+  behavior the same. This allows gradient boosters to turn off presorting
+  when building deep trees or using sparse data.
+  By :user:`Jacob Schreiber `.
+
+- Altered :func:`metrics.roc_curve` to drop unnecessary thresholds by
+  default. By :user:`Graham Clenaghan `.
+
+- Added :class:`feature_selection.SelectFromModel` meta-transformer which
+  can be used along with estimators that have a ``coef_`` or
+  ``feature_importances_`` attribute to select important features of the
+  input data (see the sketch below). By
+  :user:`Maheshakya Wijewardena `, `Joel Nothman`_ and
+  `Manoj Kumar`_.
+
+- Added :func:`metrics.pairwise.laplacian_kernel`.
+  By `Clyde Fare `_.
+
+- :class:`covariance.GraphLasso` allows separate control of the convergence
+  criterion for the Elastic-Net subproblem via the ``enet_tol`` parameter.
+
+- Improved verbosity in :class:`decomposition.DictionaryLearning`.
+
+- :class:`ensemble.RandomForestClassifier` and
+  :class:`ensemble.RandomForestRegressor` no longer explicitly store the
+  samples used in bagging, resulting in a much reduced memory footprint for
+  storing random forest models.
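+
+A minimal sketch of the ``SelectFromModel`` meta-transformer added above
+(the base estimator and threshold are arbitrary choices for illustration)::
+
+  from sklearn.datasets import make_classification
+  from sklearn.ensemble import RandomForestClassifier
+  from sklearn.feature_selection import SelectFromModel
+
+  X, y = make_classification(n_samples=100, n_features=20, random_state=0)
+  # Keep only the features whose importance exceeds the median importance.
+  selector = SelectFromModel(RandomForestClassifier(random_state=0),
+                             threshold="median")
+  X_reduced = selector.fit_transform(X, y)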
+
+- Added ``positive`` option to :class:`linear_model.Lars` and
+  :func:`linear_model.lars_path` to force coefficients to be positive.
+  (:issue:`5131`)
+
+- Added the ``X_norm_squared`` parameter to
+  :func:`metrics.pairwise.euclidean_distances` to provide precomputed
+  squared norms for ``X``.
+
+- Added the ``fit_predict`` method to :class:`pipeline.Pipeline`.
+
+- Added the :func:`preprocessing.minmax_scale` function.
+
+Bug fixes
+.........
+
+- Fixed non-determinism in :class:`dummy.DummyClassifier` with sparse
+  multi-label output. By `Andreas Müller`_.
+
+- Fixed the output shape of :class:`linear_model.RANSACRegressor` to
+  ``(n_samples, )``. By `Andreas Müller`_.
+
+- Fixed bug in :class:`decomposition.DictionaryLearning` when
+  ``n_jobs < 0``. By `Andreas Müller`_.
+
+- Fixed bug where :class:`grid_search.RandomizedSearchCV` could consume a
+  lot of memory for large discrete grids. By `Joel Nothman`_.
+
+- Fixed bug in :class:`linear_model.LogisticRegressionCV` where ``penalty``
+  was ignored in the final fit. By `Manoj Kumar`_.
+
+- Fixed bug in :class:`ensemble.forest.ForestClassifier` when computing
+  ``oob_score`` and ``X`` is a ``sparse.csc_matrix``.
+  By :user:`Ankur Ankan `.
+
+- All regressors now consistently handle and warn when given ``y`` that is
+  of shape ``(n_samples, 1)``. By `Andreas Müller`_ and Henry Lin.
+  (:issue:`5431`)
+
+- Fix in :class:`cluster.KMeans` cluster reassignment for sparse input by
+  `Lars Buitinck`_.
+
+- Fixed a bug in :class:`lda.LDA` that could cause asymmetric covariance
+  matrices when using shrinkage. By `Martin Billinger`_.
+
+- Fixed :func:`cross_validation.cross_val_predict` for estimators with
+  sparse predictions. By Buddha Prakash.
+
+- Fixed the ``predict_proba`` method of
+  :class:`linear_model.LogisticRegression` to use soft-max instead of
+  one-vs-rest normalization. By `Manoj Kumar`_. (:issue:`5182`)
+
+- Fixed the ``partial_fit`` method of :class:`linear_model.SGDClassifier`
+  when called with ``average=True``. By :user:`Andrew Lamb `.
+  (:issue:`5282`)
+
+- Dataset fetchers use different filenames under Python 2 and Python 3 to
+  avoid pickling compatibility issues. By `Olivier Grisel`_.
+  (:issue:`5355`)
+
+- Fixed a bug in :class:`naive_bayes.GaussianNB` which caused
+  classification results to depend on scale. By `Jake Vanderplas`_.
+
+- Temporarily fixed :class:`linear_model.Ridge`, which was incorrect
+  when fitting the intercept in the case of sparse data. The fix
+  automatically changes the solver to 'sag' in this case.
+  :issue:`5360` by `Tom Dupre la Tour`_.
+
+- Fixed a performance bug in :class:`decomposition.RandomizedPCA` on data
+  with a large number of features and fewer samples. (:issue:`4478`)
+  By `Andreas Müller`_, `Loic Esteve`_ and :user:`Giorgio Patrini `.
+
+- Fixed bug in :class:`cross_decomposition.PLS` that yielded unstable and
+  platform-dependent output, and failed on ``fit_transform``.
+  By :user:`Arthur Mensch `.
+
+- Fixes to the ``Bunch`` class used to store datasets.
+
+- Fixed :func:`ensemble.plot_partial_dependence` ignoring the
+  ``percentiles`` parameter.
+
+- Providing a ``set`` as vocabulary in ``CountVectorizer`` no longer
+  leads to inconsistent results when pickling.
+
+- Fixed the conditions on when a precomputed Gram matrix needs to
+  be recomputed in :class:`linear_model.LinearRegression`,
+  :class:`linear_model.OrthogonalMatchingPursuit`,
+  :class:`linear_model.Lasso` and :class:`linear_model.ElasticNet`.
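+
+  For context, a sketch of the code path this fix concerns (parameter
+  values are arbitrary): with ``precompute=True`` the solver works from
+  the precomputed Gram matrix ``X.T.dot(X)``, and the corrected conditions
+  govern when that matrix must be recomputed::
+
+    import numpy as np
+    from sklearn.linear_model import Lasso
+
+    X = np.random.RandomState(0).randn(50, 10)
+    y = X.dot(np.arange(10.0))
+    reg = Lasso(alpha=0.1, precompute=True).fit(X, y)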
+
+- Fixed inconsistent memory layout in the coordinate descent solver
+  that affected :class:`linear_model.DictionaryLearning` and
+  :class:`covariance.GraphLasso`. (:issue:`5337`)
+  By `Olivier Grisel`_.
+
+- :class:`manifold.LocallyLinearEmbedding` no longer ignores the ``reg``
+  parameter.
+
+- Nearest neighbor estimators with custom distance metrics can now be
+  pickled. (:issue:`4362`)
+
+- Fixed a bug in :class:`pipeline.FeatureUnion` where
+  ``transformer_weights`` were not properly handled when performing
+  grid searches.
+
+- Fixed a bug in :class:`linear_model.LogisticRegression` and
+  :class:`linear_model.LogisticRegressionCV` when using
+  ``class_weight='balanced'`` or ``class_weight='auto'``.
+  By `Tom Dupre la Tour`_.
+
+- Fixed bug :issue:`5495` when
+  doing OVR(SVC(decision_function_shape="ovr")). Fixed by
+  :user:`Elvis Dohmatob `.
+
+
+API changes summary
+-------------------
+
+- The attributes ``data_min``, ``data_max`` and ``data_range`` in
+  :class:`preprocessing.MinMaxScaler` are deprecated and won't be available
+  from 0.19. Instead, the class now exposes ``data_min_``, ``data_max_``
+  and ``data_range_``. By :user:`Giorgio Patrini `.
+
+- All scaler classes now have a ``scale_`` attribute, the feature-wise
+  rescaling applied by their ``transform`` methods. The old attribute
+  ``std_`` in :class:`preprocessing.StandardScaler` is deprecated and
+  superseded by ``scale_``; it won't be available in 0.19.
+  By :user:`Giorgio Patrini `.
+
+- :class:`svm.SVC` and :class:`svm.NuSVC` now have a
+  ``decision_function_shape`` parameter to make their decision function
+  have shape ``(n_samples, n_classes)`` by setting
+  ``decision_function_shape='ovr'`` (illustrated in the sketch at the end
+  of this list). This will be the default behavior starting in 0.19.
+  By `Andreas Müller`_.
+
+- Passing 1D data arrays as input to estimators is now deprecated as it
+  caused confusion in how the array elements should be interpreted
+  as features or as samples. All data arrays are now expected
+  to be explicitly shaped ``(n_samples, n_features)``.
+  By :user:`Vighnesh Birodkar `.
+
+- :class:`lda.LDA` and :class:`qda.QDA` have been moved to
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis` and
+  :class:`discriminant_analysis.QuadraticDiscriminantAnalysis`.
+
+- The ``store_covariance`` and ``tol`` parameters have been moved from
+  the fit method to the constructor in
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis` and the
+  ``store_covariances`` and ``tol`` parameters have been moved from the
+  fit method to the constructor in
+  :class:`discriminant_analysis.QuadraticDiscriminantAnalysis`.
+
+- Models inheriting from ``_LearntSelectorMixin`` will no longer support
+  the transform methods (i.e., random forests, gradient boosting, logistic
+  regression, decision trees, SVMs and SGD-related models). Wrap these
+  models with the meta-transformer
+  :class:`feature_selection.SelectFromModel` to remove features (according
+  to ``coef_`` or ``feature_importances_``) which are below a certain
+  threshold value instead.
+
+- :class:`cluster.KMeans` re-runs cluster assignments in case of
+  non-convergence, to ensure consistency of ``predict(X)`` and ``labels_``.
+  By :user:`Vighnesh Birodkar `.
+
+- Classifier and regressor models are now tagged as such using the
+  ``_estimator_type`` attribute.
+
+- Cross-validation iterators always provide indices into the training and
+  test sets, not boolean masks.
+
+- The ``decision_function`` on all regressors was deprecated and will be
+  removed in 0.19. Use ``predict`` instead.
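+
+A sketch of the ``decision_function_shape`` parameter mentioned earlier in
+this list (the iris data is just a convenient example)::
+
+  from sklearn.datasets import load_iris
+  from sklearn.svm import SVC
+
+  iris = load_iris()
+  clf = SVC(decision_function_shape="ovr").fit(iris.data, iris.target)
+  # One column per class: (n_samples, n_classes) == (3, 3) here.
+  print(clf.decision_function(iris.data[:3]).shape)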
+
+- :func:`datasets.load_lfw_pairs` is deprecated and will be removed in
+  0.19. Use :func:`datasets.fetch_lfw_pairs` instead.
+
+- The deprecated ``hmm`` module was removed.
+
+- The deprecated ``Bootstrap`` cross-validation iterator was removed.
+
+- The deprecated ``Ward`` and ``WardAgglomeration`` classes have been
+  removed. Use :class:`cluster.AgglomerativeClustering` instead.
+
+- :func:`cross_validation.check_cv` is now a public function.
+
+- The property ``residues_`` of :class:`linear_model.LinearRegression` is
+  deprecated and will be removed in 0.19.
+
+- The deprecated ``n_jobs`` parameter of
+  :class:`linear_model.LinearRegression` has been moved to the constructor.
+
+- Removed the deprecated ``class_weight`` parameter from
+  :class:`linear_model.SGDClassifier`'s ``fit`` method. Use the
+  constructor parameter instead.
+
+- The deprecated support for the sequence of sequences (or list of lists)
+  multilabel format was removed. To convert to and from the supported
+  binary indicator matrix format, use
+  :class:`MultiLabelBinarizer `.
+
+- The behavior of calling the ``inverse_transform`` method of
+  ``pipeline.Pipeline`` will change in 0.19. It will no longer reshape
+  one-dimensional input to two-dimensional input.
+
+- The deprecated attributes ``indicator_matrix_``, ``multilabel_`` and
+  ``classes_`` of :class:`preprocessing.LabelBinarizer` were removed.
+
+- Using ``gamma=0`` in :class:`svm.SVC` and :class:`svm.SVR` to
+  automatically set the gamma to ``1. / n_features`` is deprecated and will
+  be removed in 0.19. Use ``gamma="auto"`` instead.
+
+Code Contributors
+-----------------
+Aaron Schumacher, Adithya Ganesh, akitty, Alexandre Gramfort, Alexey Grigorev,
+Ali Baharev, Allen Riddell, Ando Saabas, Andreas Mueller, Andrew Lamb, Anish
+Shah, Ankur Ankan, Anthony Erlinger, Ari Rouvinen, Arnaud Joly, Arnaud Rachez,
+Arthur Mensch, banilo, Barmaley.exe, benjaminirving, Boyuan Deng, Brett Naul,
+Brian McFee, Buddha Prakash, Chi Zhang, Chih-Wei Chang, Christof Angermueller,
+Christoph Gohlke, Christophe Bourguignat, Christopher Erick Moody, Chyi-Kwei
+Yau, Cindy Sridharan, CJ Carey, Clyde-fare, Cory Lorenz, Dan Blanchard, Daniel
+Galvez, Daniel Kronovet, Danny Sullivan, Data1010, David, David D Lowe, David
+Dotson, djipey, Dmitry Spikhalskiy, Donne Martin, Dougal J. Sutherland, Dougal
+Sutherland, edson duarte, Eduardo Caro, Eric Larson, Eric Martin, Erich
+Schubert, Fernando Carrillo, Frank C.
Eckert, Frank Zalkow, Gael Varoquaux, +Ganiev Ibraim, Gilles Louppe, Giorgio Patrini, giorgiop, Graham Clenaghan, +Gryllos Prokopis, gwulfs, Henry Lin, Hsuan-Tien Lin, Immanuel Bayer, Ishank +Gulati, Jack Martin, Jacob Schreiber, Jaidev Deshpande, Jake Vanderplas, Jan +Hendrik Metzen, Jean Kossaifi, Jeffrey04, Jeremy, jfraj, Jiali Mei, +Joe Jevnik, Joel Nothman, John Kirkham, John Wittenauer, Joseph, Joshua Loyal, +Jungkook Park, KamalakerDadi, Kashif Rasul, Keith Goodman, Kian Ho, Konstantin +Shmelkov, Kyler Brown, Lars Buitinck, Lilian Besson, Loic Esteve, Louis Tiao, +maheshakya, Maheshakya Wijewardena, Manoj Kumar, MarkTab marktab.net, Martin +Ku, Martin Spacek, MartinBpr, martinosorb, MaryanMorel, Masafumi Oyamada, +Mathieu Blondel, Matt Krump, Matti Lyra, Maxim Kolganov, mbillinger, mhg, +Michael Heilman, Michael Patterson, Miroslav Batchkarov, Nelle Varoquaux, +Nicolas, Nikolay Mayorov, Olivier Grisel, Omer Katz, Óscar Nájera, Pauli +Virtanen, Peter Fischer, Peter Prettenhofer, Phil Roth, pianomania, Preston +Parry, Raghav RV, Rob Zinkov, Robert Layton, Rohan Ramanath, Saket Choudhary, +Sam Zhang, santi, saurabh.bansod, scls19fr, Sebastian Raschka, Sebastian +Saeger, Shivan Sornarajah, SimonPL, sinhrks, Skipper Seabold, Sonny Hu, sseg, +Stephen Hoover, Steven De Gryze, Steven Seguin, Theodore Vasiloudis, Thomas +Unterthiner, Tiago Freitas Pereira, Tian Wang, Tim Head, Timothy Hopper, +tokoroten, Tom Dupré la Tour, Trevor Stephens, Valentin Stolbunov, Vighnesh +Birodkar, Vinayak Mehta, Vincent, Vincent Michel, vstolbunov, wangz10, Wei Xue, +Yucheng Low, Yury Zhauniarovich, Zac Stewart, zhai_pro, Zichen Wang + diff --git a/doc/whats_new/v0.18.rst b/doc/whats_new/v0.18.rst new file mode 100644 index 0000000000000..ad240d5782793 --- /dev/null +++ b/doc/whats_new/v0.18.rst @@ -0,0 +1,816 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_18_2: + +Version 0.18.2 +============== + +**June 20, 2017** + +.. topic:: Last release with Python 2.6 support + + Scikit-learn 0.18 is the last major release of scikit-learn to support Python 2.6. + Later versions of scikit-learn will require Python 2.7 or above. + + +Changelog +--------- + +- Fixes for compatibility with NumPy 1.13.0: :issue:`7946` :issue:`8355` by + `Loic Esteve`_. + +- Minor compatibility changes in the examples :issue:`9010` :issue:`8040` + :issue:`9149`. + +Code Contributors +----------------- +Aman Dalmia, Loic Esteve, Nate Guerin, Sergei Lebedev + + +.. _changes_0_18_1: + +Version 0.18.1 +============== + +**November 11, 2016** + +Changelog +--------- + +Enhancements +............ + +- Improved ``sample_without_replacement`` speed by utilizing + numpy.random.permutation for most cases. As a result, + samples may differ in this release for a fixed random state. + Affected estimators: + + - :class:`ensemble.BaggingClassifier` + - :class:`ensemble.BaggingRegressor` + - :class:`linear_model.RANSACRegressor` + - :class:`model_selection.RandomizedSearchCV` + - :class:`random_projection.SparseRandomProjection` + + This also affects the :meth:`datasets.make_classification` + method. + +Bug fixes +......... + +- Fix issue where ``min_grad_norm`` and ``n_iter_without_progress`` + parameters were not being utilised by :class:`manifold.TSNE`. + :issue:`6497` by :user:`Sebastian Säger ` + +- Fix bug for svm's decision values when ``decision_function_shape`` + is ``ovr`` in :class:`svm.SVC`. + :class:`svm.SVC`'s decision_function was incorrect from versions + 0.17.0 through 0.18.0. 
+  :issue:`7724` by `Bing Tian Dai`_
+
+- The ``explained_variance_ratio`` attribute of
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis` now has the
+  same length whether calculated with the SVD or the Eigen solver. :issue:`7632`
+  by :user:`JPFrancoia `
+
+- Fixed an issue in :ref:`univariate_feature_selection` where score
+  functions did not accept multi-label targets. :issue:`7676`
+  by :user:`Mohammed Affan `
+
+- Fixed setting parameters when calling ``fit`` multiple times on
+  :class:`feature_selection.SelectFromModel`. :issue:`7756` by `Andreas Müller`_
+
+- Fixed an issue in the ``partial_fit`` method of
+  :class:`multiclass.OneVsRestClassifier` when the number of classes used in
+  ``partial_fit`` was less than the total number of classes in the
+  data. :issue:`7786` by `Srivatsan Ramesh`_
+
+- Fixed an issue in :class:`calibration.CalibratedClassifierCV` where
+  the sum of probabilities of each class for a sample was not 1, and
+  ``CalibratedClassifierCV`` now handles the case where the training set
+  has fewer classes than the overall data. :issue:`7799` by
+  `Srivatsan Ramesh`_
+
+- Fixed a bug where :class:`sklearn.feature_selection.SelectFdr` did not
+  exactly implement the Benjamini-Hochberg procedure. It formerly may have
+  selected fewer features than it should.
+  :issue:`7490` by :user:`Peng Meng `.
+
+- :class:`sklearn.manifold.LocallyLinearEmbedding` now correctly handles
+  integer inputs. :issue:`6282` by `Jake Vanderplas`_.
+
+- The ``min_weight_fraction_leaf`` parameter of tree-based classifiers and
+  regressors now assumes uniform sample weights by default if the
+  ``sample_weight`` argument is not passed to the ``fit`` function.
+  Previously, the parameter was silently ignored. :issue:`7301`
+  by :user:`Nelson Liu `.
+
+- Fixed a numerical issue with :class:`linear_model.RidgeCV` on centered data
+  when ``n_features > n_samples``. :issue:`6178` by `Bertrand Thirion`_
+
+- Tree splitting criterion classes' cloning/pickling is now memory safe.
+  :issue:`7680` by :user:`Ibraim Ganiev `.
+
+- Fixed a bug where :class:`decomposition.NMF` set its ``n_iters_``
+  attribute in ``transform()``. :issue:`7553` by :user:`Ekaterina
+  Krivich `.
+
+- :class:`sklearn.linear_model.LogisticRegressionCV` now correctly handles
+  string labels. :issue:`5874` by `Raghav RV`_.
+
+- Fixed a bug where :func:`sklearn.model_selection.train_test_split` raised
+  an error when ``stratify`` is a list of string labels. :issue:`7593` by
+  `Raghav RV`_.
+
+- Fixed a bug where :class:`sklearn.model_selection.GridSearchCV` and
+  :class:`sklearn.model_selection.RandomizedSearchCV` were not pickleable
+  because of a pickling bug in ``np.ma.MaskedArray``. :issue:`7594` by
+  `Raghav RV`_.
+
+- All cross-validation utilities in :mod:`sklearn.model_selection` now
+  permit one-time cross-validation splitters for the ``cv`` parameter. Also,
+  non-deterministic cross-validation splitters (where multiple calls to
+  ``split`` produce dissimilar splits) can be used as the ``cv`` parameter.
+  :class:`sklearn.model_selection.GridSearchCV` will cross-validate each
+  parameter setting on the split produced by the first ``split`` call
+  to the cross-validation splitter. :issue:`7660` by `Raghav RV`_.
+
+- Fixed a bug where :meth:`preprocessing.MultiLabelBinarizer.fit_transform`
+  returned an invalid CSR matrix.
+  :issue:`7750` by :user:`CJ Carey `.
+
+- Fixed a bug where :func:`metrics.pairwise.cosine_distances` could return a
+  small negative distance. :issue:`7732` by :user:`Artsion `.
+
+API changes summary
+-------------------
+
+Trees and forests
+
+- The ``min_weight_fraction_leaf`` parameter of tree-based classifiers and
+  regressors now assumes uniform sample weights by default if the
+  ``sample_weight`` argument is not passed to the ``fit`` function.
+  Previously, the parameter was silently ignored. :issue:`7301` by :user:`Nelson
+  Liu `.
+
+- Tree splitting criterion classes' cloning/pickling is now memory safe.
+  :issue:`7680` by :user:`Ibraim Ganiev `.
+
+
+Linear, kernelized and related models
+
+- The length of ``explained_variance_ratio`` of
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis`
+  changed for both the Eigen and SVD solvers. The attribute now has a length
+  of ``min(n_components, n_classes - 1)``. :issue:`7632`
+  by :user:`JPFrancoia `
+
+- Fixed a numerical issue with :class:`linear_model.RidgeCV` on centered data
+  when ``n_features > n_samples``. :issue:`6178` by `Bertrand Thirion`_
+
+.. _changes_0_18:
+
+Version 0.18
+============
+
+**September 28, 2016**
+
+.. topic:: Last release with Python 2.6 support
+
+    Scikit-learn 0.18 will be the last version of scikit-learn to support Python 2.6.
+    Later versions of scikit-learn will require Python 2.7 or above.
+
+.. _model_selection_changes:
+
+Model Selection Enhancements and API Changes
+--------------------------------------------
+
+- **The model_selection module**
+
+  The new module :mod:`sklearn.model_selection`, which groups together the
+  functionality of the former :mod:`sklearn.cross_validation`,
+  :mod:`sklearn.grid_search` and :mod:`sklearn.learning_curve` modules,
+  introduces new possibilities such as nested cross-validation and better
+  manipulation of parameter searches with Pandas.
+
+  Much will stay the same, but there are some key differences. Read
+  below to learn more about the changes.
+
+- **Data-independent CV splitters enabling nested cross-validation**
+
+  The new cross-validation splitters, defined in
+  :mod:`sklearn.model_selection`, are no longer initialized with any
+  data-dependent parameters such as ``y``. Instead they expose a
+  :func:`split` method that takes in the data and yields a generator for the
+  different splits.
+
+  This change makes it possible to use the cross-validation splitters to
+  perform nested cross-validation, facilitated by the
+  :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV` utilities.
+
+- **The enhanced cv_results_ attribute**
+
+  The new ``cv_results_`` attribute (of :class:`model_selection.GridSearchCV`
+  and :class:`model_selection.RandomizedSearchCV`) introduced in lieu of the
+  ``grid_scores_`` attribute is a dict of 1D arrays with elements in each
+  array corresponding to the parameter settings (i.e., search candidates).
+
+  The ``cv_results_`` dict can be easily imported into ``pandas`` as a
+  ``DataFrame`` for exploring the search results.
+
+  The ``cv_results_`` arrays include scores for each cross-validation split
+  (with keys such as ``'split0_test_score'``), as well as their mean
+  (``'mean_test_score'``) and standard deviation (``'std_test_score'``).
+
+  The ranks for the search candidates (based on their mean
+  cross-validation score) are available at ``cv_results_['rank_test_score']``.
+
+  The values for each parameter are stored separately as numpy
+  masked object arrays; the value for a given search candidate is masked if
+  the corresponding parameter is not applicable. Additionally, a list of all
+  the parameter dicts is stored at ``cv_results_['params']``.
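+
+  A minimal sketch of exploring ``cv_results_`` with pandas (the estimator
+  and parameter grid are illustrative)::
+
+      import pandas as pd
+      from sklearn.datasets import load_iris
+      from sklearn.model_selection import GridSearchCV
+      from sklearn.svm import SVC
+
+      X, y = load_iris(return_X_y=True)
+      search = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}).fit(X, y)
+      # one row per search candidate, with per-split and aggregate scores
+      df = pd.DataFrame(search.cv_results_)
+      print(df[['param_C', 'mean_test_score', 'rank_test_score']])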
+
+- **Parameters n_folds and n_iter renamed to n_splits**
+
+  Some parameter names have changed:
+  The ``n_folds`` parameter in the new :class:`model_selection.KFold`,
+  :class:`model_selection.GroupKFold` (see below for the name change),
+  and :class:`model_selection.StratifiedKFold` is now renamed to
+  ``n_splits``. The ``n_iter`` parameter in
+  :class:`model_selection.ShuffleSplit`, the new class
+  :class:`model_selection.GroupShuffleSplit` and
+  :class:`model_selection.StratifiedShuffleSplit` is now renamed to
+  ``n_splits``.
+
+- **Rename of splitter classes which accept group labels along with data**
+
+  The cross-validation splitters ``LabelKFold``,
+  ``LabelShuffleSplit``, ``LeaveOneLabelOut`` and ``LeavePLabelOut`` have
+  been renamed to :class:`model_selection.GroupKFold`,
+  :class:`model_selection.GroupShuffleSplit`,
+  :class:`model_selection.LeaveOneGroupOut` and
+  :class:`model_selection.LeavePGroupsOut` respectively.
+
+  Note the change from singular to plural form in
+  :class:`model_selection.LeavePGroupsOut`.
+
+- **Fit parameter labels renamed to groups**
+
+  The ``labels`` parameter in the :func:`split` method of the newly renamed
+  splitters :class:`model_selection.GroupKFold`,
+  :class:`model_selection.LeaveOneGroupOut`,
+  :class:`model_selection.LeavePGroupsOut` and
+  :class:`model_selection.GroupShuffleSplit` is renamed to ``groups``
+  following the new nomenclature of their class names.
+
+- **Parameter n_labels renamed to n_groups**
+
+  The parameter ``n_labels`` in the newly renamed
+  :class:`model_selection.LeavePGroupsOut` is changed to ``n_groups``.
+
+- **Training scores and timing information**
+
+  ``cv_results_`` also includes the training scores for each
+  cross-validation split (with keys such as ``'split0_train_score'``), as
+  well as their mean (``'mean_train_score'``) and standard deviation
+  (``'std_train_score'``). To avoid the cost of evaluating training score,
+  set ``return_train_score=False``.
+
+  Additionally, the mean and standard deviation of the times taken to split,
+  train and score the model across all the cross-validation splits are
+  available at the keys ``'mean_time'`` and ``'std_time'`` respectively.
+
+Changelog
+---------
+
+New features
+............
+
+Classifiers and Regressors
+
+- The Gaussian Process module has been reimplemented and now offers classification
+  and regression estimators through :class:`gaussian_process.GaussianProcessClassifier`
+  and :class:`gaussian_process.GaussianProcessRegressor`. Among other things, the new
+  implementation supports kernel engineering, gradient-based hyperparameter optimization and
+  sampling of functions from the GP prior and posterior. Extensive documentation and
+  examples are provided. By `Jan Hendrik Metzen`_.
+
+- Added a new supervised learning algorithm: :ref:`Multi-layer Perceptron `
+  :issue:`3204` by :user:`Issam H. Laradji `
+
+- Added :class:`linear_model.HuberRegressor`, a linear model robust to outliers.
+  :issue:`5291` by `Manoj Kumar`_.
+
+- Added the :class:`multioutput.MultiOutputRegressor` meta-estimator. It
+  converts single-output regressors to multi-output regressors by fitting
+  one regressor per output. By :user:`Tim Head `.
+
+Other estimators
+
+- The new :class:`mixture.GaussianMixture` and :class:`mixture.BayesianGaussianMixture`
+  replace the former mixture models, employing faster inference
+  for sounder results. :issue:`7295` by :user:`Wei Xue ` and
+  :user:`Thierry Guillemot `.
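+
+A minimal usage sketch of the new mixture API (the data and parameters are
+illustrative)::
+
+    from sklearn.datasets import make_blobs
+    from sklearn.mixture import GaussianMixture
+
+    X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
+    # fit a 3-component Gaussian mixture and get hard cluster assignments
+    gm = GaussianMixture(n_components=3, random_state=0).fit(X)
+    labels = gm.predict(X)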
+
+- :class:`decomposition.RandomizedPCA` is now factored into :class:`decomposition.PCA`
+  and is available by calling it with the parameter ``svd_solver='randomized'``.
+  The default value of ``n_iter`` for ``'randomized'`` has changed to 4. The old
+  behavior of PCA is recovered by ``svd_solver='full'``. An additional solver
+  calls ``arpack`` and performs a truncated (non-randomized) SVD. By default,
+  the best solver is selected depending on the size of the input and the
+  number of components requested. :issue:`5299` by :user:`Giorgio Patrini `.
+
+- Added two functions for mutual information estimation:
+  :func:`feature_selection.mutual_info_classif` and
+  :func:`feature_selection.mutual_info_regression`. These functions can be
+  used in :class:`feature_selection.SelectKBest` and
+  :class:`feature_selection.SelectPercentile` as score functions.
+  By :user:`Andrea Bravi ` and :user:`Nikolay Mayorov `.
+
+- Added the :class:`ensemble.IsolationForest` class for anomaly detection based on
+  random forests. By `Nicolas Goix`_.
+
+- Added ``algorithm="elkan"`` to :class:`cluster.KMeans` implementing
+  Elkan's fast K-Means algorithm. By `Andreas Müller`_.
+
+Model selection and evaluation
+
+- Added :func:`metrics.cluster.fowlkes_mallows_score`, the Fowlkes-Mallows
+  Index, which measures the similarity of two clusterings of a set of points.
+  By :user:`Arnaud Fouchet ` and :user:`Thierry Guillemot `.
+
+- Added :func:`metrics.calinski_harabaz_score`, which computes the Calinski
+  and Harabaz score to evaluate the resulting clustering of a set of points.
+  By :user:`Arnaud Fouchet ` and :user:`Thierry Guillemot `.
+
+- Added the new cross-validation splitter
+  :class:`model_selection.TimeSeriesSplit` to handle time series data.
+  :issue:`6586` by :user:`YenChen Lin `
+
+- The cross-validation iterators are replaced by cross-validation splitters
+  available from :mod:`sklearn.model_selection`, allowing for nested
+  cross-validation. See :ref:`model_selection_changes` for more information.
+  :issue:`4294` by `Raghav RV`_.
+
+Enhancements
+............
+
+Trees and ensembles
+
+- Added a new splitting criterion for :class:`tree.DecisionTreeRegressor`,
+  the mean absolute error. This criterion can also be used in
+  :class:`ensemble.ExtraTreesRegressor`,
+  :class:`ensemble.RandomForestRegressor`, and the gradient boosting
+  estimators. :issue:`6667` by :user:`Nelson Liu `.
+
+- Added a weighted impurity-based early stopping criterion for decision tree
+  growth. :issue:`6954` by :user:`Nelson Liu `
+
+- The random forest, extra tree and decision tree estimators now have a
+  method ``decision_path`` which returns the decision path of samples in
+  the tree. By `Arnaud Joly`_.
+
+- A new example has been added unveiling the decision tree structure.
+  By `Arnaud Joly`_.
+
+- Random forest, extra tree, decision tree and gradient boosting estimators
+  accept the parameters ``min_samples_split`` and ``min_samples_leaf``
+  provided as a percentage of the training samples. By :user:`yelite ` and `Arnaud Joly`_.
+
+- Gradient boosting estimators accept the parameter ``criterion`` to specify
+  the splitting criterion used when building decision trees.
+  :issue:`6667` by :user:`Nelson Liu `.
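+
+A minimal sketch of the fractional ``min_samples_split`` described above
+(the value 0.1 is illustrative)::
+
+    from sklearn.datasets import make_classification
+    from sklearn.ensemble import RandomForestClassifier
+
+    X, y = make_classification(random_state=0)
+    # a float in (0, 1] is interpreted as a fraction of the training samples
+    clf = RandomForestClassifier(min_samples_split=0.1,
+                                 random_state=0).fit(X, y)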
+
+- The memory footprint is reduced (sometimes greatly) for
+  :class:`ensemble.bagging.BaseBagging` and classes that inherit from it,
+  i.e., :class:`ensemble.BaggingClassifier`,
+  :class:`ensemble.BaggingRegressor`, and :class:`ensemble.IsolationForest`,
+  by dynamically generating the attribute ``estimators_samples_`` only when it is
+  needed. By :user:`David Staub `.
+
+- Added ``n_jobs`` and ``sample_weight`` parameters for
+  :class:`ensemble.VotingClassifier` to fit underlying estimators in parallel.
+  :issue:`5805` by :user:`Ibraim Ganiev `.
+
+Linear, kernelized and related models
+
+- In :class:`linear_model.LogisticRegression`, the SAG solver is now
+  available in the multinomial case. :issue:`5251` by `Tom Dupre la Tour`_.
+
+- :class:`linear_model.RANSACRegressor`, :class:`svm.LinearSVC` and
+  :class:`svm.LinearSVR` now support ``sample_weight``.
+  By :user:`Imaculate `.
+
+- Added the parameter ``loss`` to :class:`linear_model.RANSACRegressor` to measure the
+  error on the samples for every trial. By `Manoj Kumar`_.
+
+- Prediction of out-of-sample events with Isotonic Regression
+  (:class:`isotonic.IsotonicRegression`) is now much faster (over 1000x in tests with synthetic
+  data). By :user:`Jonathan Arfa `.
+
+- Isotonic regression (:class:`isotonic.IsotonicRegression`) now uses a better algorithm to avoid
+  `O(n^2)` behavior in pathological cases, and is also generally faster
+  (:issue:`6691`). By `Antony Lee`_.
+
+- :class:`naive_bayes.GaussianNB` now accepts data-independent class priors
+  through the parameter ``priors``. By :user:`Guillaume Lemaitre `.
+
+- :class:`linear_model.ElasticNet` and :class:`linear_model.Lasso`
+  now work with ``np.float32`` input data without converting it
+  into ``np.float64``, reducing memory
+  consumption. :issue:`6913` by :user:`YenChen Lin `.
+
+- :class:`semi_supervised.LabelPropagation` and :class:`semi_supervised.LabelSpreading`
+  now accept arbitrary kernel functions in addition to the strings ``knn`` and ``rbf``.
+  :issue:`5762` by :user:`Utkarsh Upadhyay `.
+
+Decomposition, manifold learning and clustering
+
+- Added an ``inverse_transform`` function to :class:`decomposition.NMF` to compute
+  the data matrix of the original shape. By :user:`Anish Shah `.
+
+- :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` now work
+  with ``np.float32`` and ``np.float64`` input data without converting it,
+  so memory consumption can be reduced by using ``np.float32``.
+  :issue:`6846` by :user:`Sebastian Säger ` and
+  :user:`YenChen Lin `.
+
+Preprocessing and feature selection
+
+- :class:`preprocessing.RobustScaler` now accepts a ``quantile_range`` parameter.
+  :issue:`5929` by :user:`Konstantin Podshumok `.
+
+- :class:`feature_extraction.FeatureHasher` now accepts string values.
+  :issue:`6173` by :user:`Ryad Zenine ` and
+  :user:`Devashish Deshpande `.
+
+- Keyword arguments can now be supplied to ``func`` in
+  :class:`preprocessing.FunctionTransformer` by means of the ``kw_args``
+  parameter. By `Brian McFee`_.
+
+- :class:`feature_selection.SelectKBest` and :class:`feature_selection.SelectPercentile`
+  now accept score functions that take ``X``, ``y`` as input and return only the scores.
+  By :user:`Nikolay Mayorov `.
+
+Model evaluation and meta-estimators
+
+- :class:`multiclass.OneVsOneClassifier` and :class:`multiclass.OneVsRestClassifier`
+  now support ``partial_fit``. By :user:`Asish Panda ` and
+  :user:`Philipp Dowling `.
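+
+A minimal sketch of the new ``partial_fit`` support (the data and base
+estimator are illustrative)::
+
+    import numpy as np
+    from sklearn.linear_model import SGDClassifier
+    from sklearn.multiclass import OneVsRestClassifier
+
+    X = np.array([[0.], [1.], [2.], [3.]])
+    y = np.array([0, 1, 2, 0])
+    ovr = OneVsRestClassifier(SGDClassifier(random_state=0))
+    # all classes must be announced on the first call to partial_fit
+    ovr.partial_fit(X[:2], y[:2], classes=np.arange(3))
+    ovr.partial_fit(X[2:], y[2:])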
+
+- Added support for substituting or disabling :class:`pipeline.Pipeline`
+  and :class:`pipeline.FeatureUnion` components using the ``set_params``
+  interface that powers :mod:`sklearn.grid_search`.
+  See :ref:`sphx_glr_auto_examples_plot_compare_reduction.py`.
+  By `Joel Nothman`_ and :user:`Robert McGibbon `.
+
+- The new ``cv_results_`` attribute of :class:`model_selection.GridSearchCV`
+  (and :class:`model_selection.RandomizedSearchCV`) can be easily imported
+  into pandas as a ``DataFrame``. See :ref:`model_selection_changes` for
+  more information. :issue:`6697` by `Raghav RV`_.
+
+- Generalization of :func:`model_selection.cross_val_predict`:
+  one can pass method names such as ``predict_proba`` to be used in the cross
+  validation framework instead of the default ``predict``.
+  By :user:`Ori Ziv ` and :user:`Sears Merritt `.
+
+- The training scores and the time taken for training followed by scoring for
+  each search candidate are now available in the ``cv_results_`` dict.
+  See :ref:`model_selection_changes` for more information.
+  :issue:`7325` by :user:`Eugene Chen ` and `Raghav RV`_.
+
+Metrics
+
+- Added a ``labels`` flag to :func:`metrics.log_loss` to explicitly provide
+  the labels when the number of classes in ``y_true`` and ``y_pred`` differ.
+  :issue:`7239` by :user:`Hong Guangguo ` with help from
+  :user:`Mads Jensen ` and :user:`Nelson Liu `.
+
+- Support sparse contingency matrices in cluster evaluation
+  (:mod:`metrics.cluster.supervised`) to scale to a large number of
+  clusters.
+  :issue:`7419` by :user:`Gregory Stupp ` and `Joel Nothman`_.
+
+- Added a ``sample_weight`` parameter to :func:`metrics.matthews_corrcoef`.
+  By :user:`Jatin Shah ` and `Raghav RV`_.
+
+- Sped up :func:`metrics.silhouette_score` by using vectorized operations.
+  By `Manoj Kumar`_.
+
+- Added a ``sample_weight`` parameter to :func:`metrics.confusion_matrix`.
+  By :user:`Bernardo Stein `.
+
+Miscellaneous
+
+- Added an ``n_jobs`` parameter to :class:`feature_selection.RFECV` to compute
+  the score on the test folds in parallel. By `Manoj Kumar`_.
+
+- The codebase no longer contains C/C++ files generated by Cython: they are
+  generated during the build. Distribution packages will still contain the
+  generated C/C++ files. By :user:`Arthur Mensch `.
+
+- Reduced the memory usage for 32-bit float input arrays of
+  :func:`utils.sparse_func.mean_variance_axis` and
+  :func:`utils.sparse_func.incr_mean_variance_axis` by supporting Cython
+  fused types. By :user:`YenChen Lin `.
+
+- :func:`ignore_warnings` now accepts a ``category`` argument to ignore only
+  warnings of a specified type. By :user:`Thierry Guillemot `.
+
+- Added a ``return_X_y`` parameter, which makes the loader return a
+  ``(data, target)`` tuple, to
+  :func:`load_iris` (:issue:`7049`),
+  :func:`load_breast_cancer` (:issue:`7152`),
+  :func:`load_digits`,
+  :func:`load_diabetes`,
+  :func:`load_linnerud` and
+  :func:`load_boston` (:issue:`7154`) by
+  :user:`Manvendra Singh`.
+
+- Simplification of the ``clone`` function, deprecating support for estimators
+  that modify parameters in ``__init__``. :issue:`5540` by `Andreas Müller`_.
+
+- When unpickling a scikit-learn estimator in a different version than the one
+  the estimator was trained with, a ``UserWarning`` is raised, see :ref:`the documentation
+  on model persistence ` for more details. (:issue:`7248`)
+  By `Andreas Müller`_.
+
+Bug fixes
+.........
+
+Trees and ensembles
+
+- Random forest, extra trees, decision trees and gradient boosting
+  no longer accept ``min_samples_split=1``, as at least 2 samples
+  are required to split a decision tree node. By `Arnaud Joly`_
+
+- :class:`ensemble.VotingClassifier` now raises ``NotFittedError`` if ``predict``,
+  ``transform`` or ``predict_proba`` are called on the non-fitted estimator.
+  By `Sebastian Raschka`_.
+
+- Fix bug where :class:`ensemble.AdaBoostClassifier` and
+  :class:`ensemble.AdaBoostRegressor` would perform poorly if the
+  ``random_state`` was fixed
+  (:issue:`7411`). By `Joel Nothman`_.
+
+- Fix bug in ensembles with randomization where the ensemble would not
+  set ``random_state`` on base estimators in a pipeline or similar nesting.
+  (:issue:`7411`). Note that results for :class:`ensemble.BaggingClassifier`,
+  :class:`ensemble.BaggingRegressor`, :class:`ensemble.AdaBoostClassifier`
+  and :class:`ensemble.AdaBoostRegressor` will now differ from previous
+  versions. By `Joel Nothman`_.
+
+Linear, kernelized and related models
+
+- Fixed incorrect gradient computation for ``loss='squared_epsilon_insensitive'`` in
+  :class:`linear_model.SGDClassifier` and :class:`linear_model.SGDRegressor`
+  (:issue:`6764`). By :user:`Wenhua Yang `.
+
+- Fix bug in :class:`linear_model.LogisticRegressionCV` where
+  ``solver='liblinear'`` did not accept ``class_weight='balanced'``.
+  (:issue:`6817`). By `Tom Dupre la Tour`_.
+
+- Fix bug in :class:`neighbors.RadiusNeighborsClassifier` where an error
+  occurred when there were outliers being labelled and a weight function
+  specified (:issue:`6902`). By
+  `LeonieBorne `_.
+
+- Fix :class:`linear_model.ElasticNet` sparse decision function to match
+  output with dense in the multioutput case.
+
+Decomposition, manifold learning and clustering
+
+- The default number of ``iterated_power`` iterations in
+  :class:`decomposition.RandomizedPCA` is now 4 instead of 3.
+  :issue:`5141` by :user:`Giorgio Patrini `.
+
+- :func:`utils.extmath.randomized_svd` performs 4 power iterations by default, instead of 0.
+  In practice this is enough for obtaining a good approximation of the
+  true eigenvalues/vectors in the presence of noise. When `n_components` is
+  small (``< .1 * min(X.shape)``), `n_iter` is set to 7, unless the user specifies
+  a higher number. This improves precision with few components.
+  :issue:`5299` by :user:`Giorgio Patrini`.
+
+- The whiten/non-whiten inconsistency between components of :class:`decomposition.PCA`
+  and :class:`decomposition.RandomizedPCA` (now factored into PCA, see the
+  New features) is fixed. ``components_`` are stored with no whitening.
+  :issue:`5299` by :user:`Giorgio Patrini `.
+
+- Fixed bug in :func:`manifold.spectral_embedding` where the diagonal of the unnormalized
+  Laplacian matrix was incorrectly set to 1. :issue:`4995` by :user:`Peter Fischer `.
+
+- Fixed incorrect initialization of :func:`utils.arpack.eigsh` on all
+  occurrences. Affects :class:`cluster.bicluster.SpectralBiclustering`,
+  :class:`decomposition.KernelPCA`, :class:`manifold.LocallyLinearEmbedding`,
+  and :class:`manifold.SpectralEmbedding` (:issue:`5012`). By
+  :user:`Peter Fischer `.
+
+- The attribute ``explained_variance_ratio_`` calculated with the SVD solver
+  of :class:`discriminant_analysis.LinearDiscriminantAnalysis` now returns
+  correct results. By :user:`JPFrancoia `
+
+Preprocessing and feature selection
+
+- :func:`preprocessing.data._transform_selected` now always passes a copy
+  of ``X`` to the transform function when ``copy=True`` (:issue:`7194`). By `Caio
+  Oliveira `_.
+
+Model evaluation and meta-estimators
+
+- :class:`model_selection.StratifiedKFold` now raises an error if the number
+  of labels for any individual class is less than ``n_folds``.
+  :issue:`6182` by :user:`Devashish Deshpande `.
+
+- Fixed bug in :class:`model_selection.StratifiedShuffleSplit`
+  where train and test samples could overlap in some edge cases,
+  see :issue:`6121` for
+  more details. By `Loic Esteve`_.
+
+- Fix in :class:`sklearn.model_selection.StratifiedShuffleSplit` to
+  return splits of size ``train_size`` and ``test_size`` in all cases
+  (:issue:`6472`). By `Andreas Müller`_.
+
+- Cross-validation of :class:`OneVsOneClassifier` and
+  :class:`OneVsRestClassifier` now works with precomputed kernels.
+  :issue:`7350` by :user:`Russell Smith `.
+
+- Fix incomplete ``predict_proba`` method delegation from
+  :class:`model_selection.GridSearchCV` to
+  :class:`linear_model.SGDClassifier` (:issue:`7159`)
+  by `Yichuan Liu `_.
+
+Metrics
+
+- Fix bug in :func:`metrics.silhouette_score` in which clusters of
+  size 1 were incorrectly scored. They should get a score of 0.
+  By `Joel Nothman`_.
+
+- Fix bug in :func:`metrics.silhouette_samples` so that it now works with
+  arbitrary labels, not just those ranging from 0 to n_clusters - 1.
+
+- Fix bug where expected and adjusted mutual information were incorrect if
+  cluster contingency cells exceeded ``2**16``. By `Joel Nothman`_.
+
+- :func:`metrics.pairwise.pairwise_distances` now converts arrays to
+  boolean arrays when required in ``scipy.spatial.distance``.
+  :issue:`5460` by `Tom Dupre la Tour`_.
+
+- Fix sparse input support in :func:`metrics.silhouette_score` as well as
+  the example examples/text/document_clustering.py. By :user:`YenChen Lin `.
+
+- :func:`metrics.roc_curve` and :func:`metrics.precision_recall_curve` no
+  longer round ``y_score`` values when creating ROC curves; this was causing
+  problems for users with very small differences in scores (:issue:`7353`).
+
+Miscellaneous
+
+- :func:`model_selection.tests._search._check_param_grid` now works correctly with all types
+  that extend/implement ``Sequence`` (except strings), including range (Python 3.x) and xrange
+  (Python 2.x). :issue:`7323` by Viacheslav Kovalevskyi.
+
+- :func:`utils.extmath.randomized_range_finder` is more numerically stable when many
+  power iterations are requested, since it applies LU normalization by default.
+  If ``n_iter < 2``, numerical issues are unlikely, so no normalization is applied.
+  Other normalization options are available: ``'none'``, ``'LU'`` and ``'QR'``.
+  :issue:`5141` by :user:`Giorgio Patrini `.
+
+- Fix a bug where some formats of ``scipy.sparse`` matrix, and estimators
+  with them as parameters, could not be passed to :func:`base.clone`.
+  By `Loic Esteve`_.
+
+- :func:`datasets.load_svmlight_file` is now able to read long int QID values.
+  :issue:`7101` by :user:`Ibraim Ganiev `.
+
+
+API changes summary
+-------------------
+
+Linear, kernelized and related models
+
+- ``residual_metric`` has been deprecated in :class:`linear_model.RANSACRegressor`.
+  Use ``loss`` instead. By `Manoj Kumar`_.
+
+- Access to public attributes ``.X_`` and ``.y_`` has been deprecated in
+  :class:`isotonic.IsotonicRegression`. By :user:`Jonathan Arfa `.
+
+Decomposition, manifold learning and clustering
+
+- The old :class:`mixture.DPGMM` is deprecated in favor of the new
+  :class:`mixture.BayesianGaussianMixture` (with the parameter
+  ``weight_concentration_prior_type='dirichlet_process'``).
+  The new class solves the computational
+  problems of the old class and computes the Gaussian mixture with a
+  Dirichlet process prior faster than before.
+  :issue:`7295` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.
+
+- The old :class:`mixture.VBGMM` is deprecated in favor of the new
+  :class:`mixture.BayesianGaussianMixture` (with the parameter
+  ``weight_concentration_prior_type='dirichlet_distribution'``).
+  The new class solves the computational
+  problems of the old class and computes the Variational Bayesian Gaussian
+  mixture faster than before.
+  :issue:`6651` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.
+
+- The old :class:`mixture.GMM` is deprecated in favor of the new
+  :class:`mixture.GaussianMixture`. The new class computes the Gaussian mixture
+  faster than before, and some of the computational problems have been solved.
+  :issue:`6666` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.
+
+Model evaluation and meta-estimators
+
+- The :mod:`sklearn.cross_validation`, :mod:`sklearn.grid_search` and
+  :mod:`sklearn.learning_curve` modules have been deprecated and the classes and
+  functions have been reorganized into the :mod:`sklearn.model_selection`
+  module. See :ref:`model_selection_changes` for more information.
+  :issue:`4294` by `Raghav RV`_.
+
+- The ``grid_scores_`` attribute of :class:`model_selection.GridSearchCV`
+  and :class:`model_selection.RandomizedSearchCV` is deprecated in favor of
+  the attribute ``cv_results_``.
+  See :ref:`model_selection_changes` for more information.
+  :issue:`6697` by `Raghav RV`_.
+
+- The parameters ``n_iter`` or ``n_folds`` in old CV splitters are replaced
+  by the new parameter ``n_splits`` since it can provide a consistent
+  and unambiguous interface to represent the number of train-test splits.
+  :issue:`7187` by :user:`YenChen Lin `.
+
+- The ``classes`` parameter was renamed to ``labels`` in
+  :func:`metrics.hamming_loss`. :issue:`7260` by :user:`Sebastián Vanrell `.
+
+- The splitter classes ``LabelKFold``, ``LabelShuffleSplit``,
+  ``LeaveOneLabelOut`` and ``LeavePLabelOut`` are renamed to
+  :class:`model_selection.GroupKFold`,
+  :class:`model_selection.GroupShuffleSplit`,
+  :class:`model_selection.LeaveOneGroupOut`
+  and :class:`model_selection.LeavePGroupsOut` respectively.
+  Also the parameter ``labels`` in the :func:`split` method of the newly
+  renamed splitters :class:`model_selection.LeaveOneGroupOut` and
+  :class:`model_selection.LeavePGroupsOut` is renamed to
+  ``groups``. Additionally in :class:`model_selection.LeavePGroupsOut`,
+  the parameter ``n_labels`` is renamed to ``n_groups``.
+  :issue:`6660` by `Raghav RV`_.
+
+- Error and loss names for ``scoring`` parameters are now prefixed by
+  ``'neg_'``, such as ``neg_mean_squared_error``. The unprefixed versions
+  are deprecated and will be removed in version 0.20.
+  :issue:`7261` by :user:`Tim Head `.
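+
+A minimal sketch of the new scorer names (the estimator and dataset are
+illustrative)::
+
+    from sklearn.datasets import load_boston
+    from sklearn.linear_model import Ridge
+    from sklearn.model_selection import cross_val_score
+
+    X, y = load_boston(return_X_y=True)
+    # greater (less negative) is better, so all scorers can be maximized
+    scores = cross_val_score(Ridge(), X, y,
+                             scoring='neg_mean_squared_error')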
+ +Code Contributors +----------------- +Aditya Joshi, Alejandro, Alexander Fabisch, Alexander Loginov, Alexander +Minyushkin, Alexander Rudy, Alexandre Abadie, Alexandre Abraham, Alexandre +Gramfort, Alexandre Saint, alexfields, Alvaro Ulloa, alyssaq, Amlan Kar, +Andreas Mueller, andrew giessel, Andrew Jackson, Andrew McCulloh, Andrew +Murray, Anish Shah, Arafat, Archit Sharma, Ariel Rokem, Arnaud Joly, Arnaud +Rachez, Arthur Mensch, Ash Hoover, asnt, b0noI, Behzad Tabibian, Bernardo, +Bernhard Kratzwald, Bhargav Mangipudi, blakeflei, Boyuan Deng, Brandon Carter, +Brett Naul, Brian McFee, Caio Oliveira, Camilo Lamus, Carol Willing, Cass, +CeShine Lee, Charles Truong, Chyi-Kwei Yau, CJ Carey, codevig, Colin Ni, Dan +Shiebler, Daniel, Daniel Hnyk, David Ellis, David Nicholson, David Staub, David +Thaler, David Warshaw, Davide Lasagna, Deborah, definitelyuncertain, Didi +Bar-Zev, djipey, dsquareindia, edwinENSAE, Elias Kuthe, Elvis DOHMATOB, Ethan +White, Fabian Pedregosa, Fabio Ticconi, fisache, Florian Wilhelm, Francis, +Francis O'Donovan, Gael Varoquaux, Ganiev Ibraim, ghg, Gilles Louppe, Giorgio +Patrini, Giovanni Cherubin, Giovanni Lanzani, Glenn Qian, Gordon +Mohr, govin-vatsan, Graham Clenaghan, Greg Reda, Greg Stupp, Guillaume +Lemaitre, Gustav Mörtberg, halwai, Harizo Rajaona, Harry Mavroforakis, +hashcode55, hdmetor, Henry Lin, Hobson Lane, Hugo Bowne-Anderson, +Igor Andriushchenko, Imaculate, Inki Hwang, Isaac Sijaranamual, +Ishank Gulati, Issam Laradji, Iver Jordal, jackmartin, Jacob Schreiber, Jake +Vanderplas, James Fiedler, James Routley, Jan Zikes, Janna Brettingen, jarfa, Jason +Laska, jblackburne, jeff levesque, Jeffrey Blackburne, Jeffrey04, Jeremy Hintz, +jeremynixon, Jeroen, Jessica Yung, Jill-Jênn Vie, Jimmy Jia, Jiyuan Qian, Joel +Nothman, johannah, John, John Boersma, John Kirkham, John Moeller, +jonathan.striebel, joncrall, Jordi, Joseph Munoz, Joshua Cook, JPFrancoia, +jrfiedler, JulianKahnert, juliathebrave, kaichogami, KamalakerDadi, Kenneth +Lyons, Kevin Wang, kingjr, kjell, Konstantin Podshumok, Kornel Kielczewski, +Krishna Kalyan, krishnakalyan3, Kvle Putnam, Kyle Jackson, Lars Buitinck, +ldavid, LeiG, LeightonZhang, Leland McInnes, Liang-Chi Hsieh, Lilian Besson, +lizsz, Loic Esteve, Louis Tiao, Léonie Borne, Mads Jensen, Maniteja Nandana, +Manoj Kumar, Manvendra Singh, Marco, Mario Krell, Mark Bao, Mark Szepieniec, +Martin Madsen, MartinBpr, MaryanMorel, Massil, Matheus, Mathieu Blondel, +Mathieu Dubois, Matteo, Matthias Ekman, Max Moroz, Michael Scherer, michiaki +ariga, Mikhail Korobov, Moussa Taifi, mrandrewandrade, Mridul Seth, nadya-p, +Naoya Kanai, Nate George, Nelle Varoquaux, Nelson Liu, Nick James, +NickleDave, Nico, Nicolas Goix, Nikolay Mayorov, ningchi, nlathia, +okbalefthanded, Okhlopkov, Olivier Grisel, Panos Louridas, Paul Strickland, +Perrine Letellier, pestrickland, Peter Fischer, Pieter, Ping-Yao, Chang, +practicalswift, Preston Parry, Qimu Zheng, Rachit Kansal, Raghav RV, +Ralf Gommers, Ramana.S, Rammig, Randy Olson, Rob Alexander, Robert Lutz, +Robin Schucker, Rohan Jain, Ruifeng Zheng, Ryan Yu, Rémy Léone, saihttam, +Saiwing Yeung, Sam Shleifer, Samuel St-Jean, Sartaj Singh, Sasank Chilamkurthy, +saurabh.bansod, Scott Andrews, Scott Lowe, seales, Sebastian Raschka, Sebastian +Saeger, Sebastián Vanrell, Sergei Lebedev, shagun Sodhani, shanmuga cv, +Shashank Shekhar, shawpan, shengxiduan, Shota, shuckle16, Skipper Seabold, +sklearn-ci, SmedbergM, srvanrell, Sébastien Lerique, Taranjeet, themrmax, +Thierry, Thierry Guillemot, Thomas, Thomas 
Hallock, Thomas Moreau, Tim Head, +tKammy, toastedcornflakes, Tom, TomDLT, Toshihiro Kamishima, tracer0tong, Trent +Hauck, trevorstephens, Tue Vo, Varun, Varun Jewalikar, Viacheslav, Vighnesh +Birodkar, Vikram, Villu Ruusmann, Vinayak Mehta, walter, waterponey, Wenhua +Yang, Wenjian Huang, Will Welch, wyseguy7, xyguo, yanlend, Yaroslav Halchenko, +yelite, Yen, YenChenLin, Yichuan Liu, Yoav Ram, Yoshiki, Zheng RuiFeng, zivori, Óscar Nájera + diff --git a/doc/whats_new/v0.19.rst b/doc/whats_new/v0.19.rst new file mode 100644 index 0000000000000..eb29ab1599b31 --- /dev/null +++ b/doc/whats_new/v0.19.rst @@ -0,0 +1,923 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_19: + +Version 0.19 +============ + +**Release Candidate (0.19b2) July 17, 2017** + +Highlights +---------- + +We are excited to release a number of great new features including +:class:`neighbors.LocalOutlierFactor` for anomaly detection, +:class:`preprocessing.QuantileTransformer` for robust feature transformation, +and the :class:`multioutput.ClassifierChain` meta-estimator to simply account +for dependencies between classes in multilabel problems. We have some new +algorithms in existing estimators, such as multiplicative update in +:class:`decomposition.NMF` and multinomial +:class:`linear_model.LogisticRegression` with L1 loss (use ``solver='saga'``). + +Cross validation is now able to return the results from multiple metric +evaluations. The new :func:`model_selection.cross_validate` can return many +scores on the test data as well as training set performance and timings, and we +have extended the ``scoring`` and ``refit`` parameters for grid/randomized +search :ref:`to handle multiple metrics `. + +You can also learn faster. For instance, the :ref:`new option to cache +transformations ` in :class:`pipeline.Pipeline` makes grid +search over pipelines including slow transformations much more efficient. And +you can predict faster: if you're sure you know what you're doing, you can turn +off validating that the input is finite using :func:`config_context`. + +We've made some important fixes too. We've fixed a longstanding implementation +error in :func:`metrics.average_precision_score`, so please be cautious with +prior results reported from that function. A number of errors in the +:class:`manifold.TSNE` implementation have been fixed, particularly in the +default Barnes-Hut approximation. :class:`semi_supervised.LabelSpreading` and +:class:`semi_supervised.LabelPropagation` have had substantial fixes. +LabelPropagation was previously broken. LabelSpreading should now correctly +respect its alpha parameter. + +Changed models +-------------- + +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. 
+
+- :class:`cluster.KMeans` with sparse X and initial centroids given (bug fix)
+- :class:`cross_decomposition.PLSRegression`
+  with ``scale=True`` (bug fix)
+- :class:`ensemble.GradientBoostingClassifier` and
+  :class:`ensemble.GradientBoostingRegressor` where ``min_impurity_split`` is used (bug fix)
+- gradient boosting ``loss='quantile'`` (bug fix)
+- :class:`ensemble.IsolationForest` (bug fix)
+- :class:`feature_selection.SelectFdr` (bug fix)
+- :class:`linear_model.RANSACRegressor` (bug fix)
+- :class:`linear_model.LassoLars` (bug fix)
+- :class:`linear_model.LassoLarsIC` (bug fix)
+- :class:`manifold.TSNE` (bug fix)
+- :class:`neighbors.NearestCentroid` (bug fix)
+- :class:`semi_supervised.LabelSpreading` (bug fix)
+- :class:`semi_supervised.LabelPropagation` (bug fix)
+- tree based models where ``min_weight_fraction_leaf`` is used (enhancement)
+
+Details are listed in the changelog below.
+
+(While we are trying to better inform users by providing this information, we
+cannot assure that this list is complete.)
+
+Changelog
+---------
+
+New features
+............
+
+Classifiers and regressors
+
+- Added :class:`multioutput.ClassifierChain` for multi-label
+  classification. By `Adam Kleczewski `_.
+
+- Added solver ``'saga'``, which implements an improved version of Stochastic
+  Average Gradient, in :class:`linear_model.LogisticRegression` and
+  :class:`linear_model.Ridge`. It allows the use of the L1 penalty with
+  multinomial logistic loss, and behaves marginally better than 'sag'
+  during the first epochs of ridge and logistic regression.
+  :issue:`8446` by `Arthur Mensch`_.
+
+Other estimators
+
+- Added the :class:`neighbors.LocalOutlierFactor` class for anomaly
+  detection based on nearest neighbors.
+  :issue:`5279` by `Nicolas Goix`_ and `Alexandre Gramfort`_.
+
+- Added the :class:`preprocessing.QuantileTransformer` class and
+  :func:`preprocessing.quantile_transform` function for feature
+  normalization based on quantiles.
+  :issue:`8363` by :user:`Denis Engemann `,
+  :user:`Guillaume Lemaitre `, `Olivier Grisel`_, `Raghav RV`_,
+  :user:`Thierry Guillemot `, and `Gael Varoquaux`_.
+
+- The new solver ``'mu'`` implements a Multiplicative Update in
+  :class:`decomposition.NMF`, allowing the optimization of all
+  beta-divergences, including the Frobenius norm, the generalized
+  Kullback-Leibler divergence and the Itakura-Saito divergence.
+  :issue:`5295` by `Tom Dupre la Tour`_.
+
+Model selection and evaluation
+
+- :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV` now support simultaneous
+  evaluation of multiple metrics. Refer to the
+  :ref:`multimetric_grid_search` section of the user guide for more
+  information. :issue:`7388` by `Raghav RV`_
+
+- Added :func:`model_selection.cross_validate`, which allows evaluation
+  of multiple metrics. This function returns a dict with more useful
+  information from cross-validation such as the train scores, fit times and
+  score times.
+  Refer to the :ref:`multimetric_cross_validation` section of the user guide
+  for more information. :issue:`7388` by `Raghav RV`_
+
+- Added :func:`metrics.mean_squared_log_error`, which computes
+  the mean squared error of the logarithmic transformation of targets,
+  particularly useful for targets with an exponential trend.
+  :issue:`7655` by :user:`Karan Desai `.
+
+- Added :func:`metrics.dcg_score` and :func:`metrics.ndcg_score`, which
+  compute Discounted cumulative gain (DCG) and Normalized discounted
+  cumulative gain (NDCG).
+  :issue:`7739` by :user:`David Gasquez `.
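+
+A minimal sketch of multiple-metric evaluation with the new
+:func:`model_selection.cross_validate` (the metric choice is illustrative)::
+
+    from sklearn.datasets import load_iris
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.model_selection import cross_validate
+
+    X, y = load_iris(return_X_y=True)
+    res = cross_validate(LogisticRegression(), X, y,
+                         scoring=['accuracy', 'neg_log_loss'])
+    # a dict of arrays: test scores per metric, plus fit and score times
+    print(sorted(res.keys()))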
+
+- Added the :class:`model_selection.RepeatedKFold` and
+  :class:`model_selection.RepeatedStratifiedKFold`.
+  :issue:`8120` by `Neeraj Gangwar`_.
+
+Miscellaneous
+
+- Validation that input data contains no NaN or inf can now be suppressed
+  using :func:`config_context`, at your own risk. This will save on runtime,
+  and may be particularly useful for prediction time. :issue:`7548` by
+  `Joel Nothman`_.
+
+- Added a test to ensure parameter listings in docstrings match the
+  function/class signatures. :issue:`9206` by `Alexandre Gramfort`_ and
+  `Raghav RV`_.
+
+Enhancements
+............
+
+Trees and ensembles
+
+- The ``min_weight_fraction_leaf`` constraint in tree construction is now
+  more efficient, taking a fast path to declare a node a leaf if its weight
+  is less than 2 * the minimum. Note that the constructed tree will be
+  different from previous versions where ``min_weight_fraction_leaf`` is
+  used. :issue:`7441` by :user:`Nelson Liu `.
+
+- :class:`ensemble.GradientBoostingClassifier` and :class:`ensemble.GradientBoostingRegressor`
+  now support sparse input for prediction.
+  :issue:`6101` by :user:`Ibraim Ganiev `.
+
+- :class:`ensemble.VotingClassifier` now allows changing estimators by using
+  :meth:`ensemble.VotingClassifier.set_params`. An estimator can also be
+  removed by setting it to ``None``.
+  :issue:`7674` by :user:`Yichuan Liu `.
+
+- :func:`tree.export_graphviz` now shows a configurable number of decimal
+  places. :issue:`8698` by :user:`Guillaume Lemaitre `.
+
+- Added a ``flatten_transform`` parameter to :class:`ensemble.VotingClassifier`
+  to change the output shape of the ``transform`` method to 2-dimensional.
+  :issue:`7794` by :user:`Ibraim Ganiev ` and
+  :user:`Herilalaina Rakotoarison `.
+
+Linear, kernelized and related models
+
+- :class:`linear_model.SGDClassifier`, :class:`linear_model.SGDRegressor`,
+  :class:`linear_model.PassiveAggressiveClassifier`,
+  :class:`linear_model.PassiveAggressiveRegressor` and
+  :class:`linear_model.Perceptron` now expose ``max_iter`` and
+  ``tol`` parameters, to handle convergence more precisely. The
+  ``n_iter`` parameter is deprecated, and the fitted estimator exposes
+  an ``n_iter_`` attribute, with the actual number of iterations before
+  convergence. :issue:`5036` by `Tom Dupre la Tour`_.
+
+- Added an ``average`` parameter to perform weight averaging in
+  :class:`linear_model.PassiveAggressiveClassifier`. :issue:`4939`
+  by :user:`Andrea Esuli `.
+
+- :class:`linear_model.RANSACRegressor` no longer throws an error
+  when calling ``fit`` if no inliers are found in its first iteration.
+  Furthermore, causes of skipped iterations are tracked in newly added
+  attributes, ``n_skips_*``.
+  :issue:`7914` by :user:`Michael Horrell `.
+
+- In :class:`gaussian_process.GaussianProcessRegressor`, the method ``predict``
+  is a lot faster with ``return_std=True``. :issue:`8591` by
+  :user:`Hadrien Bertrand `.
+
+- Added ``return_std`` to the ``predict`` method of
+  :class:`linear_model.ARDRegression` and
+  :class:`linear_model.BayesianRidge`.
+  :issue:`7838` by :user:`Sergey Feldman `.
+
+- Memory usage enhancements: Prevent cast from float32 to float64 in:
+  :class:`linear_model.MultiTaskElasticNet`;
+  :class:`linear_model.LogisticRegression` when using the newton-cg solver; and
+  :class:`linear_model.Ridge` when using the svd, sparse_cg, cholesky or lsqr
+  solvers. :issue:`8835`, :issue:`8061` by :user:`Joan Massich ` and :user:`Nicolas
+  Cordier ` and :user:`Thierry Guillemot `.
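+
+A minimal sketch of the ``max_iter`` / ``tol`` convergence controls noted
+above (the values are illustrative)::
+
+    from sklearn.datasets import make_classification
+    from sklearn.linear_model import SGDClassifier
+
+    X, y = make_classification(random_state=0)
+    clf = SGDClassifier(max_iter=1000, tol=1e-3,
+                        random_state=0).fit(X, y)
+    # the fitted estimator records the number of epochs actually run
+    print(clf.n_iter_)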
+
+Other predictors
+
+- Custom metrics for the :mod:`neighbors` binary trees now have
+  fewer constraints: they must take two 1d-arrays and return a float.
+  :issue:`6288` by `Jake Vanderplas`_.
+
+- ``algorithm='auto'`` in :mod:`neighbors` estimators now chooses the most
+  appropriate algorithm for all input types and metrics. :issue:`9145` by
+  :user:`Herilalaina Rakotoarison ` and :user:`Reddy Chinthala
+  `.
+
+Decomposition, manifold learning and clustering
+
+- :class:`cluster.MiniBatchKMeans` and :class:`cluster.KMeans`
+  now use significantly less memory when assigning data points to their
+  nearest cluster center. :issue:`7721` by :user:`Jon Crall `.
+
+- :class:`decomposition.PCA`, :class:`decomposition.IncrementalPCA` and
+  :class:`decomposition.TruncatedSVD` now expose the singular values
+  from the underlying SVD. They are stored in the attribute
+  ``singular_values_``, like in :class:`decomposition.IncrementalPCA`.
+  :issue:`7685` by :user:`Tommy Löfstedt `
+
+- :class:`decomposition.NMF` is now faster when ``beta_loss=0``.
+  :issue:`9277` by :user:`hongkahjun`.
+
+- Memory improvements for method ``barnes_hut`` in :class:`manifold.TSNE`.
+  :issue:`7089` by :user:`Thomas Moreau ` and `Olivier Grisel`_.
+
+- Optimization schedule improvements for Barnes-Hut :class:`manifold.TSNE`,
+  so the results are closer to those of the reference implementation
+  `lvdmaaten/bhtsne `_ by :user:`Thomas
+  Moreau ` and `Olivier Grisel`_.
+
+- Memory usage enhancements: Prevent cast from float32 to float64 in
+  :class:`decomposition.PCA` and
+  :func:`decomposition.randomized_svd_low_rank`.
+  :issue:`9067` by `Raghav RV`_.
+
+Preprocessing and feature selection
+
+- Added a ``norm_order`` parameter to :class:`feature_selection.SelectFromModel`
+  to enable selection of the norm order when ``coef_`` is more than 1D.
+  :issue:`6181` by :user:`Antoine Wendlinger `.
+
+- Added the ability to use sparse matrices in :func:`feature_selection.f_regression`
+  with ``center=True``. :issue:`8065` by :user:`Daniel LeJeune `.
+
+- Small performance improvement to n-gram creation in
+  :mod:`feature_extraction.text` by binding methods for loops and
+  special-casing unigrams. :issue:`7567` by :user:`Jaye Doepke `
+
+- Relaxed the assumption on the data for the
+  :class:`kernel_approximation.SkewedChi2Sampler`. Since the Skewed-Chi2
+  kernel is defined on the open interval :math:`(-skewedness; +\infty)^d`,
+  the transform function should not check whether ``X < 0`` but whether ``X <
+  -self.skewedness``. :issue:`7573` by :user:`Romain Brault `.
+
+- Made default kernel parameters kernel-dependent in
+  :class:`kernel_approximation.Nystroem`.
+  :issue:`5229` by :user:`Saurabh Bansod ` and `Andreas Müller`_.
+
+Model evaluation and meta-estimators
+
+- :class:`pipeline.Pipeline` is now able to cache transformers
+  within a pipeline by using the ``memory`` constructor parameter.
+  :issue:`7990` by :user:`Guillaume Lemaitre `.
+
+- :class:`pipeline.Pipeline` steps can now be accessed as attributes of its
+  ``named_steps`` attribute. :issue:`8586` by :user:`Herilalaina
+  Rakotoarison `.
+
+- Added a ``sample_weight`` parameter to :meth:`pipeline.Pipeline.score`.
+  :issue:`7723` by :user:`Mikhail Korobov `.
+
+- Added the ability to set the ``n_jobs`` parameter on :func:`pipeline.make_union`.
+  A ``TypeError`` will be raised for any other kwargs. :issue:`8028`
+  by :user:`Alexander Booth `.
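+
+A minimal sketch of transformer caching with the ``memory`` parameter noted
+above (the pipeline steps are illustrative)::
+
+    from tempfile import mkdtemp
+    from sklearn.decomposition import PCA
+    from sklearn.pipeline import Pipeline
+    from sklearn.svm import SVC
+
+    # fitted transformers are cached in this directory and reused on refit,
+    # which speeds up grid search over the downstream estimator
+    pipe = Pipeline([('reduce', PCA()), ('clf', SVC())], memory=mkdtemp())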
+
+- :class:`model_selection.GridSearchCV`,
+  :class:`model_selection.RandomizedSearchCV` and
+  :func:`model_selection.cross_val_score` now allow estimators with callable
+  kernels, which were previously prohibited.
+  :issue:`8005` by `Andreas Müller`_.
+
+- :func:`model_selection.cross_val_predict` now returns output of the
+  correct shape for all values of the argument ``method``.
+  :issue:`7863` by :user:`Aman Dalmia `.
+
+- Added ``shuffle`` and ``random_state`` parameters to shuffle training
+  data before taking prefixes of it based on training sizes in
+  :func:`model_selection.learning_curve`.
+  :issue:`7506` by :user:`Narine Kokhlikyan `.
+
+- :class:`model_selection.StratifiedShuffleSplit` now works with multioutput
+  multiclass (or multilabel) data. :issue:`9044` by `Vlad Niculae`_.
+
+- Speed improvements to :class:`model_selection.StratifiedShuffleSplit`.
+  :issue:`5991` by :user:`Arthur Mensch ` and `Joel Nothman`_.
+
+- Added a ``shuffle`` parameter to :func:`model_selection.train_test_split`.
+  :issue:`8845` by :user:`themrmax `.
+
+- :class:`multioutput.MultiOutputRegressor` and :class:`multioutput.MultiOutputClassifier`
+  now support online learning using ``partial_fit``.
+  :issue:`8053` by :user:`Peng Yu `.
+
+- Added a ``max_train_size`` parameter to :class:`model_selection.TimeSeriesSplit`.
+  :issue:`8282` by :user:`Aman Dalmia `.
+
+- More clustering metrics are now available through :func:`metrics.get_scorer`
+  and ``scoring`` parameters. :issue:`8117` by `Raghav RV`_.
+
+- A scorer based on :func:`metrics.explained_variance_score` is also available.
+  :issue:`9259` by :user:`Hanmin Qin `.
+
+Metrics
+
+- :func:`metrics.matthews_corrcoef` now supports multiclass classification.
+  :issue:`8094` by :user:`Jon Crall `.
+
+- Added a ``sample_weight`` parameter to :func:`metrics.cohen_kappa_score`.
+  :issue:`8335` by :user:`Victor Poughon `.
+
+Miscellaneous
+
+- :func:`utils.check_estimator` now attempts to ensure that methods
+  transform, predict, etc. do not set attributes on the estimator.
+  :issue:`7533` by :user:`Ekaterina Krivich `.
+
+- Added type checking to the ``accept_sparse`` parameter in
+  :mod:`utils.validation` methods. This parameter now accepts only booleans,
+  strings, or lists/tuples of strings. ``accept_sparse=None`` is deprecated and
+  should be replaced by ``accept_sparse=False``.
+  :issue:`7880` by :user:`Josh Karnofsky `.
+
+- Make it possible to load a chunk of an svmlight formatted file by
+  passing a range of bytes to :func:`datasets.load_svmlight_file`.
+  :issue:`935` by :user:`Olivier Grisel `.
+
+- :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor`
+  now accept non-finite features. :issue:`8931` by :user:`Attractadore`.
+
+Bug fixes
+.........
+
+Trees and ensembles
+
+- Fixed a memory leak in trees when using trees with ``criterion='mae'``.
+  :issue:`8002` by `Raghav RV`_.
+
+- Fixed a bug where :class:`ensemble.IsolationForest` used an
+  incorrect formula for the average path length.
+  :issue:`8549` by `Peter Wang `_.
+
+- Fixed a bug where :class:`ensemble.AdaBoostClassifier` throws
+  ``ZeroDivisionError`` while fitting data with single class labels.
+  :issue:`7501` by :user:`Dominik Krzeminski `.
+
+- Fixed a bug in :class:`ensemble.GradientBoostingClassifier` and
+  :class:`ensemble.GradientBoostingRegressor` where a float being compared
+  to ``0.0`` using ``==`` caused a divide by zero error. :issue:`7970` by
+  :user:`He Chen `.
+
+- Fix a bug where :class:`ensemble.GradientBoostingClassifier` and
+  :class:`ensemble.GradientBoostingRegressor` ignored the
+  ``min_impurity_split`` parameter.
+  :issue:`8006` by :user:`Sebastian Pölsterl `.
+
+- Fixed ``oob_score`` in :class:`ensemble.BaggingClassifier`.
+  :issue:`8936` by :user:`Michael Lewis `.
+
+- Fixed excessive memory usage in prediction for random forest estimators.
+  :issue:`8672` by :user:`Mike Benfield `.
+
+- Fixed a bug where ``sample_weight`` as a list broke random forests in Python 2.
+  :issue:`8068` by :user:`xor`.
+
+- Fixed a bug where :class:`ensemble.IsolationForest` fails when
+  ``max_features`` is less than 1.
+  :issue:`5732` by :user:`Ishank Gulati `.
+
+- Fix a bug where gradient boosting with ``loss='quantile'`` computed
+  negative errors for negative values of ``ytrue - ypred`` leading to wrong
+  values when calling ``__call__``.
+  :issue:`8087` by :user:`Alexis Mignon `
+
+- Fix a bug where :class:`ensemble.VotingClassifier` raises an error
+  when a numpy array is passed in for weights. :issue:`7983` by
+  :user:`Vincent Pham `.
+
+- Fixed a bug where :func:`tree.export_graphviz` raised an error
+  when the length of ``feature_names`` does not match ``n_features`` in the
+  decision tree. :issue:`8512` by :user:`Li Li `.
+
+Linear, kernelized and related models
+
+- Fixed a bug where :func:`linear_model.RANSACRegressor.fit` may run until
+  ``max_iter`` if it finds a large inlier group early. :issue:`8251` by
+  :user:`aivision2020`.
+
+- Fixed a bug where :class:`naive_bayes.MultinomialNB` and
+  :class:`naive_bayes.BernoulliNB` failed when ``alpha=0``. :issue:`5814` by
+  :user:`Yichuan Liu ` and :user:`Herilalaina Rakotoarison
+  `.
+
+- Fixed a bug where :class:`linear_model.LassoLars` does not give
+  the same result as the LassoLars implementation available
+  in R (lars library). :issue:`7849` by :user:`Jair Montoya Martinez `.
+
+- Fixed a bug in :class:`linear_model.RandomizedLasso`,
+  :class:`linear_model.Lars`, :class:`linear_model.LassoLars`,
+  :class:`linear_model.LarsCV` and :class:`linear_model.LassoLarsCV`,
+  where the parameter ``precompute`` was not used consistently across
+  classes, and some values proposed in the docstring could raise errors.
+  :issue:`5359` by `Tom Dupre la Tour`_.
+
+- Fix inconsistent results between :class:`linear_model.RidgeCV` and
+  :class:`linear_model.Ridge` when using ``normalize=True``. :issue:`9302`
+  by `Alexandre Gramfort`_.
+
+- Fix a bug where :func:`linear_model.LassoLars.fit` sometimes
+  left ``coef_`` as a list, rather than an ndarray.
+  :issue:`8160` by :user:`CJ Carey `.
+
+- Fix :func:`linear_model.BayesianRidge.fit` to return the
+  ridge parameters ``alpha_`` and ``lambda_`` consistent with the calculated
+  coefficients ``coef_`` and ``intercept_``.
+  :issue:`8224` by :user:`Peter Gedeck `.
+
+- Fixed a bug in :class:`svm.OneClassSVM` where it returned floats instead of
+  integer classes. :issue:`8676` by :user:`Vathsala Achar `.
+
+- Fix AIC/BIC criterion computation in :class:`linear_model.LassoLarsIC`.
+  :issue:`9022` by `Alexandre Gramfort`_ and :user:`Mehmet Basbug `.
+
+- Fixed a memory leak in our LibLinear implementation. :issue:`9024` by
+  :user:`Sergei Lebedev `.
+
+- Fix a bug where stratified CV splitters did not work with
+  :class:`linear_model.LassoCV`. :issue:`8973` by
+  :user:`Paulo Haddad `.
+
+- Fixed a bug in :class:`gaussian_process.GaussianProcessRegressor`
+  where predicting the standard deviation or covariance without fitting
+  would fail with an uninformative error by default.
+
+Other predictors
+
+- Fix :class:`semi_supervised.BaseLabelPropagation` to correctly implement
+  ``LabelPropagation`` and ``LabelSpreading`` as done in the referenced
+  papers. :issue:`9239`
+  by :user:`Andre Ambrosio Boechat `, :user:`Utkarsh Upadhyay
+  `, and `Joel Nothman`_.
+
+Decomposition, manifold learning and clustering
+
+- Fixed the implementation of :class:`manifold.TSNE`:
+
+  - The ``early_exaggeration`` parameter had no effect and is now used for
+    the first 250 optimization iterations.
+
+  - Fixed the ``AssertionError: Tree consistency failed`` exception
+    reported in :issue:`8992`.
+
+  - Improved the learning schedule to match the one from the reference
+    implementation `lvdmaaten/bhtsne `_.
+
+  By :user:`Thomas Moreau ` and `Olivier Grisel`_.
+
+- Fix a bug in :class:`decomposition.LatentDirichletAllocation`
+  where the ``perplexity`` method was returning incorrect results because
+  the ``transform`` method returns normalized document topic distributions
+  as of version 0.18. :issue:`7954` by :user:`Gary Foreman `.
+
+- Fix output shape and bugs with ``n_jobs > 1`` in
+  :class:`decomposition.SparseCoder` transform and
+  :func:`decomposition.sparse_encode`
+  for one-dimensional data and one component.
+  This also impacts the output shape of :class:`decomposition.DictionaryLearning`.
+  :issue:`8086` by `Andreas Müller`_.
+
+- Fixed the implementation of ``explained_variance_``
+  in :class:`decomposition.PCA`,
+  :class:`decomposition.RandomizedPCA` and
+  :class:`decomposition.IncrementalPCA`.
+  :issue:`9105` by `Hanmin Qin `_.
+
+- Fixed the implementation of ``noise_variance_`` in :class:`decomposition.PCA`.
+  :issue:`9108` by `Hanmin Qin `_.
+
+- Fixed a bug where :class:`cluster.DBSCAN` gave an incorrect
+  result when the input was a precomputed sparse matrix with initial
+  rows all zero. :issue:`8306` by :user:`Akshay Gupta `.
+
+- Fix a bug regarding fitting :class:`cluster.KMeans` with a sparse
+  array X and initial centroids, where X's means were unnecessarily being
+  subtracted from the centroids. :issue:`7872` by :user:`Josh Karnofsky `.
+
+- Fixes to the input validation in :class:`covariance.EllipticEnvelope`.
+  :issue:`8086` by `Andreas Müller`_.
+
+- Fixed a bug in :class:`covariance.MinCovDet` where inputting data
+  that produced a singular covariance matrix would cause the helper method
+  ``_c_step`` to throw an exception.
+  :issue:`3367` by :user:`Jeremy Steward `.
+
+- Fixed a bug in :class:`manifold.TSNE` affecting convergence of the
+  gradient descent. :issue:`8768` by :user:`David DeTomaso `.
+
+- Fixed a bug in :class:`manifold.TSNE` where it stored the incorrect
+  ``kl_divergence_``. :issue:`6507` by :user:`Sebastian Saeger `.
+
+- Fixed improper scaling in :class:`cross_decomposition.PLSRegression`
+  with ``scale=True``. :issue:`7819` by :user:`jayzed82 `.
+
+- The ``fit`` methods of :class:`cluster.bicluster.SpectralCoclustering` and
+  :class:`cluster.bicluster.SpectralBiclustering` now conform to the API by
+  accepting ``y`` and returning the object. :issue:`6126`,
+  :issue:`7814` by :user:`Laurent Direr ` and :user:`Maniteja
+  Nandana `.
+
+- Fix bug where :mod:`mixture` ``sample`` methods did not return as many
+  samples as requested (see the sketch below). :issue:`7702` by
+  :user:`Levi John Wolf `.
+
+- Fixed the shrinkage implementation in :class:`neighbors.NearestCentroid`.
+  :issue:`9219` by `Hanmin Qin `_.
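+
+A minimal sketch of the corrected :mod:`mixture` sampling behaviour
+referenced above (toy data invented for this illustration)::
+
+    import numpy as np
+    from sklearn.mixture import GaussianMixture
+
+    X = np.random.RandomState(0).randn(100, 2)
+    gm = GaussianMixture(n_components=2, random_state=0).fit(X)
+
+    # sample() now returns exactly as many samples as requested.
+    X_new, y_new = gm.sample(n_samples=7)
+    assert X_new.shape == (7, 2)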
+
+Preprocessing and feature selection
+
+- For sparse matrices, :func:`preprocessing.normalize` with ``return_norm=True``
+  will now raise a ``NotImplementedError`` with 'l1' or 'l2' norm, and with
+  norm 'max' the norms returned will be the same as for dense matrices.
+  :issue:`7771` by `Ang Lu `_.
+
+- Fix a bug where :class:`feature_selection.SelectFdr` did not
+  exactly implement the Benjamini-Hochberg procedure. It formerly may have
+  selected fewer features than it should.
+  :issue:`7490` by :user:`Peng Meng `.
+
+- Fixed a bug where :class:`linear_model.RandomizedLasso` and
+  :class:`linear_model.RandomizedLogisticRegression` broke for
+  sparse input. :issue:`8259` by :user:`Aman Dalmia `.
+
+- Fix a bug where :class:`feature_extraction.FeatureHasher`
+  mandatorily applied a sparse random projection to the hashed features,
+  preventing the use of
+  :class:`feature_extraction.text.HashingVectorizer` in a
+  pipeline with :class:`feature_extraction.text.TfidfTransformer`.
+  :issue:`7565` by :user:`Roman Yurchak `.
+
+- Fix a bug where :func:`feature_selection.mutual_info_regression` did not
+  correctly use ``n_neighbors``. :issue:`8181` by :user:`Guillaume Lemaitre
+  `.
+
+Model evaluation and meta-estimators
+
+- Fixed a bug where :func:`model_selection.BaseSearchCV.inverse_transform`
+  returned ``self.best_estimator_.transform()`` instead of
+  ``self.best_estimator_.inverse_transform()``.
+  :issue:`8344` by :user:`Akshay Gupta ` and :user:`Rasmus Eriksson `.
+
+- Added a ``classes_`` attribute to :class:`model_selection.GridSearchCV`,
+  :class:`model_selection.RandomizedSearchCV`, :class:`grid_search.GridSearchCV`,
+  and :class:`grid_search.RandomizedSearchCV` that matches the ``classes_``
+  attribute of ``best_estimator_``. :issue:`7661` and :issue:`8295`
+  by :user:`Alyssa Batula `, :user:`Dylan Werner-Meier `,
+  and :user:`Stephen Hoover `.
+
+- Fixed a bug where :func:`model_selection.validation_curve`
+  reused the same estimator for each parameter value.
+  :issue:`7365` by :user:`Aleksandr Sandrovskii `.
+
+- :func:`model_selection.permutation_test_score` now works with Pandas
+  types. :issue:`5697` by :user:`Stijn Tonk `.
+
+- Several fixes to input validation in
+  :class:`multiclass.OutputCodeClassifier`.
+  :issue:`8086` by `Andreas Müller`_.
+
+- :class:`multiclass.OneVsOneClassifier`'s ``partial_fit`` now ensures all
+  classes are provided up-front. :issue:`6250` by
+  :user:`Asish Panda `.
+
+- Fix :func:`multioutput.MultiOutputClassifier.predict_proba` to return a
+  list of 2d arrays, rather than a 3d array. In the case where different
+  target columns had different numbers of classes, a ``ValueError`` would be
+  raised on trying to stack matrices with different dimensions.
+  :issue:`8093` by :user:`Peter Bull `.
+
+- Cross validation now works with Pandas datatypes that have a
+  read-only index. :issue:`9507` by `Loic Esteve`_.
+
+Metrics
+
+- :func:`metrics.average_precision_score` no longer linearly
+  interpolates between operating points, and instead weighs precisions
+  by the change in recall since the last operating point, as per the
+  `Wikipedia entry `_ (see the sketch below).
+  (`#7356 `_). By
+  :user:`Nick Dingwall ` and `Gael Varoquaux`_.
+
+- Fix a bug in :func:`metrics.classification._check_targets`
+  which would return ``'binary'`` if ``y_true`` and ``y_pred`` were
+  both ``'binary'`` but the union of ``y_true`` and ``y_pred`` was
+  ``'multiclass'``. :issue:`8377` by `Loic Esteve`_.
+
+- Fixed an integer overflow bug in :func:`metrics.confusion_matrix` and
+  hence :func:`metrics.cohen_kappa_score`. :issue:`8354`, :issue:`7929`
+  by `Joel Nothman`_ and :user:`Jon Crall `.
+
+- Fixed passing of the ``gamma`` parameter to the ``chi2`` kernel in
+  :func:`metrics.pairwise.pairwise_kernels`. :issue:`5211` by
+  :user:`Nick Rhinehart `,
+  :user:`Saurabh Bansod ` and `Andreas Müller`_.
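+
+A minimal sketch of the revised, non-interpolated average precision (toy
+scores invented for this illustration)::
+
+    import numpy as np
+    from sklearn.metrics import average_precision_score
+
+    y_true = np.array([0, 0, 1, 1])
+    y_score = np.array([0.1, 0.4, 0.35, 0.8])
+
+    # AP is now the sum over thresholds of
+    # (recall_n - recall_{n-1}) * precision_n, with no interpolation.
+    average_precision_score(y_true, y_score)  # ~0.83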
+
+Miscellaneous
+
+- Fixed a bug where :func:`datasets.make_classification` failed
+  when generating more than 30 features. :issue:`8159` by
+  :user:`Herilalaina Rakotoarison `.
+
+- Fixed a bug where :func:`datasets.make_moons` gave an
+  incorrect result when ``n_samples`` is odd.
+  :issue:`8198` by :user:`Josh Levy `.
+
+- Some ``fetch_`` functions in :mod:`datasets` were ignoring the
+  ``download_if_missing`` keyword. :issue:`7944` by :user:`Ralf Gommers `.
+
+- Fix estimators to accept a ``sample_weight`` parameter of type
+  ``pandas.Series`` in their ``fit`` function. :issue:`7825` by
+  `Kathleen Chen`_.
+
+- Fix a bug in cases where ``numpy.cumsum`` may be numerically unstable,
+  raising an exception if instability is identified. :issue:`7376` and
+  :issue:`7331` by `Joel Nothman`_ and :user:`yangarbiter`.
+
+- Fix a bug where :meth:`base.BaseEstimator.__getstate__`
+  obstructed pickling customizations of child classes, when used in a
+  multiple inheritance context.
+  :issue:`8316` by :user:`Holger Peters `.
+
+- Update Sphinx-Gallery from 0.1.4 to 0.1.7 for resolving links in
+  documentation build with Sphinx > 1.5. :issue:`8010`, :issue:`7986` by
+  :user:`Oscar Najera `.
+
+- Add ``data_home`` parameter to :func:`sklearn.datasets.fetch_kddcup99`.
+  :issue:`9289` by `Loic Esteve`_.
+
+- Fix dataset loaders using the Python 3 version of makedirs to also work in
+  Python 2. :issue:`9284` by :user:`Sebastin Santy `.
+
+- Several minor issues were fixed with thanks to the alerts of
+  `lgtm.com <http://lgtm.com>`_. :issue:`9278` by :user:`Jean Helie `,
+  among others.
+
+API changes summary
+-------------------
+
+Trees and ensembles
+
+- Gradient boosting base models are no longer estimators. By `Andreas Müller`_.
+
+- All tree based estimators now accept a ``min_impurity_decrease``
+  parameter in lieu of ``min_impurity_split``, which is now deprecated.
+  A node is split only when the split decreases the weighted impurity of
+  the node by at least ``min_impurity_decrease`` (see the sketch below).
+  :issue:`8449` by `Raghav RV`_.
+
+Linear, kernelized and related models
+
+- The ``n_iter`` parameter is deprecated in :class:`linear_model.SGDClassifier`,
+  :class:`linear_model.SGDRegressor`,
+  :class:`linear_model.PassiveAggressiveClassifier`,
+  :class:`linear_model.PassiveAggressiveRegressor` and
+  :class:`linear_model.Perceptron`. By `Tom Dupre la Tour`_.
+
+Other predictors
+
+- :class:`neighbors.LSHForest` has been deprecated and will be
+  removed in 0.21 due to poor performance.
+  :issue:`9078` by :user:`Laurent Direr `.
+
+- :class:`neighbors.NearestCentroid` no longer purports to support
+  ``metric='precomputed'``, which now raises an error. :issue:`8515` by
+  :user:`Sergul Aydore `.
+
+- The ``alpha`` parameter of :class:`semi_supervised.LabelPropagation` now
+  has no effect and is deprecated, to be removed in 0.21. :issue:`9239`
+  by :user:`Andre Ambrosio Boechat `, :user:`Utkarsh Upadhyay
+  `, and `Joel Nothman`_.
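+
+A minimal sketch of the replacement parameter (illustrative only; any
+tree-based estimator accepts it)::
+
+    from sklearn.datasets import load_iris
+    from sklearn.tree import DecisionTreeClassifier
+
+    X, y = load_iris(return_X_y=True)
+
+    # Deprecated spelling: DecisionTreeClassifier(min_impurity_split=...).
+    # A node is now split only if doing so decreases the weighted impurity
+    # by at least the given amount.
+    tree = DecisionTreeClassifier(min_impurity_decrease=0.01).fit(X, y)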
+
+Decomposition, manifold learning and clustering
+
+- Deprecate the ``doc_topic_distr`` argument of the ``perplexity`` method
+  in :class:`decomposition.LatentDirichletAllocation` because the
+  user no longer has access to the unnormalized document topic distribution
+  needed for the perplexity calculation. :issue:`7954` by
+  :user:`Gary Foreman `.
+
+- The ``n_topics`` parameter of :class:`decomposition.LatentDirichletAllocation`
+  has been renamed to ``n_components`` and will be removed in version 0.21.
+  :issue:`8922` by :user:`Attractadore`.
+
+- :meth:`decomposition.SparsePCA.transform`'s ``ridge_alpha`` parameter is
+  deprecated in preference to the class parameter of the same name.
+  :issue:`8137` by :user:`Naoya Kanai `.
+
+- :class:`cluster.DBSCAN` now has a ``metric_params`` parameter.
+  :issue:`8139` by :user:`Naoya Kanai `.
+
+Preprocessing and feature selection
+
+- :class:`feature_selection.SelectFromModel` now has a ``partial_fit``
+  method only if the underlying estimator does. By `Andreas Müller`_.
+
+- :class:`feature_selection.SelectFromModel` now validates the ``threshold``
+  parameter and sets the ``threshold_`` attribute during the call to
+  ``fit``, and no longer during the call to ``transform``. By `Andreas
+  Müller`_.
+
+- The ``non_negative`` parameter in :class:`feature_extraction.FeatureHasher`
+  has been deprecated, and replaced with a more principled alternative,
+  ``alternate_sign``.
+  :issue:`7565` by :user:`Roman Yurchak `.
+
+- :class:`linear_model.RandomizedLogisticRegression`
+  and :class:`linear_model.RandomizedLasso` have been deprecated and will
+  be removed in version 0.21.
+  :issue:`8995` by :user:`Ramana.S `.
+
+Model evaluation and meta-estimators
+
+- Deprecate the ``fit_params`` constructor input to
+  :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV` in favor
+  of passing keyword parameters to the ``fit`` methods
+  of those classes. Data-dependent parameters needed for model
+  training should be passed as keyword arguments to ``fit``,
+  and conforming to this convention will allow the hyperparameter
+  selection classes to be used with tools such as
+  :func:`model_selection.cross_val_predict`.
+  :issue:`2879` by :user:`Stephen Hoover `.
+
+- In version 0.21, the default behavior of splitters that use the
+  ``test_size`` and ``train_size`` parameters will change, such that
+  specifying ``train_size`` alone will cause ``test_size`` to be the
+  remainder. :issue:`7459` by :user:`Nelson Liu `.
+
+- :class:`multiclass.OneVsRestClassifier` now has ``partial_fit``,
+  ``decision_function`` and ``predict_proba`` methods only when the
+  underlying estimator does. :issue:`7812` by `Andreas Müller`_ and
+  :user:`Mikhail Korobov `.
+
+- :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method
+  only if the underlying estimator does. By `Andreas Müller`_.
+
+- The ``decision_function`` output shape for binary classification in
+  :class:`multiclass.OneVsRestClassifier` and
+  :class:`multiclass.OneVsOneClassifier` is now ``(n_samples,)`` to conform
+  to scikit-learn conventions. :issue:`9100` by `Andreas Müller`_.
+
+- The :func:`multioutput.MultiOutputClassifier.predict_proba`
+  function used to return a 3d array (``n_samples``, ``n_classes``,
+  ``n_outputs``). In the case where different target columns had different
+  numbers of classes, a ``ValueError`` would be raised on trying to stack
+  matrices with different dimensions. This function now returns a list of
+  arrays where the length of the list is ``n_outputs``, and each array is
+  (``n_samples``, ``n_classes``) for that particular output (see the sketch
+  below). :issue:`8093` by :user:`Peter Bull `.
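+
+A minimal sketch of the new return type (toy multi-output targets invented
+for this illustration)::
+
+    import numpy as np
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.multioutput import MultiOutputClassifier
+
+    rng = np.random.RandomState(0)
+    X = rng.randn(20, 3)
+    Y = np.column_stack([rng.randint(0, 2, 20),   # binary output
+                         rng.randint(0, 3, 20)])  # three-class output
+
+    clf = MultiOutputClassifier(LogisticRegression()).fit(X, Y)
+    probas = clf.predict_proba(X)
+    # A list of length n_outputs; shapes (20, 2) and (20, 3) here.
+    [p.shape for p in probas]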
+
+- The ``named_steps`` attribute of :class:`pipeline.Pipeline` is now a
+  :class:`utils.Bunch` rather than a plain ``dict``, to enable tab
+  completion in interactive environments. In the case of a conflict between
+  a step name and an existing ``dict`` attribute, the ``dict`` behavior
+  is prioritized.
+  :issue:`8481` by :user:`Herilalaina Rakotoarison `.
+
+Miscellaneous
+
+- Deprecate the ``y`` parameter in ``transform`` and ``inverse_transform``.
+  These methods should not accept a ``y`` parameter, as they are used at
+  prediction time.
+  :issue:`8174` by :user:`Tahar Zanouda `, `Alexandre Gramfort`_
+  and `Raghav RV`_.
+
+- SciPy >= 0.13.3 and NumPy >= 1.8.2 are now the minimum supported versions
+  for scikit-learn. The following backported functions in
+  :mod:`utils` have been removed or deprecated accordingly.
+  :issue:`8854` and :issue:`8874` by :user:`Naoya Kanai `.
+
+  Removed in 0.19:
+
+  - ``utils.fixes.argpartition``
+  - ``utils.fixes.array_equal``
+  - ``utils.fixes.astype``
+  - ``utils.fixes.bincount``
+  - ``utils.fixes.expit``
+  - ``utils.fixes.frombuffer_empty``
+  - ``utils.fixes.in1d``
+  - ``utils.fixes.norm``
+  - ``utils.fixes.rankdata``
+  - ``utils.fixes.safe_copy``
+
+  Deprecated in 0.19, to be removed in 0.21:
+
+  - ``utils.arpack.eigs``
+  - ``utils.arpack.eigsh``
+  - ``utils.arpack.svds``
+  - ``utils.extmath.fast_dot``
+  - ``utils.extmath.logsumexp``
+  - ``utils.extmath.norm``
+  - ``utils.extmath.pinvh``
+  - ``utils.graph.graph_laplacian``
+  - ``utils.random.choice``
+  - ``utils.sparsetools.connected_components``
+  - ``utils.stats.rankdata``
+
+- The ``store_covariances`` parameter and ``covariances_`` attribute of
+  :class:`discriminant_analysis.QuadraticDiscriminantAnalysis`
+  have been renamed to ``store_covariance`` and ``covariance_`` to be
+  consistent with the corresponding names in
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis`. They will be
+  removed in version 0.21. :issue:`7998` by :user:`Jiacheng `.
+
+- Estimators with both methods ``decision_function`` and ``predict_proba``
+  are now required to have a monotonic relation between them. The
+  method ``check_decision_proba_consistency`` has been added in
+  ``utils.estimator_checks`` to check their consistency.
+  :issue:`7578` by :user:`Shubham Bhardwaj `.
+
+- All checks in ``utils.estimator_checks``, in particular
+  :func:`utils.estimator_checks.check_estimator`, now accept estimator
+  instances (see the sketch below). Most other checks do not accept
+  estimator classes any more. :issue:`9019` by `Andreas Müller`_.
+
+- Ensure that estimators' attributes ending with ``_`` are not set
+  in the constructor but only in the ``fit`` method. Most notably,
+  ensemble estimators (deriving from :class:`ensemble.BaseEnsemble`)
+  now only have ``self.estimators_`` available after ``fit``.
+  :issue:`7464` by `Lars Buitinck`_ and `Loic Esteve`_.
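+
+A minimal sketch of the instance-based checks (``LogisticRegression`` is an
+arbitrary stand-in; any estimator instance works)::
+
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.utils.estimator_checks import check_estimator
+
+    # Previously only classes were accepted; passing an instance now works
+    # too, letting non-default constructor arguments be exercised.
+    check_estimator(LogisticRegression())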
+ + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.18, including: + +Joel Nothman, Loic Esteve, Andreas Mueller, Guillaume Lemaitre, Olivier Grisel, +Hanmin Qin, Raghav RV, Alexandre Gramfort, themrmax, Aman Dalmia, Gael +Varoquaux, Naoya Kanai, Tom Dupré la Tour, Rishikesh, Nelson Liu, Taehoon Lee, +Nelle Varoquaux, Aashil, Mikhail Korobov, Sebastin Santy, Joan Massich, Roman +Yurchak, RAKOTOARISON Herilalaina, Thierry Guillemot, Alexandre Abadie, Carol +Willing, Balakumaran Manoharan, Josh Karnofsky, Vlad Niculae, Utkarsh Upadhyay, +Dmitry Petrov, Minghui Liu, Srivatsan, Vincent Pham, Albert Thomas, Jake +VanderPlas, Attractadore, JC Liu, alexandercbooth, chkoar, Óscar Nájera, +Aarshay Jain, Kyle Gilliam, Ramana Subramanyam, CJ Carey, Clement Joudet, David +Robles, He Chen, Joris Van den Bossche, Karan Desai, Katie Luangkote, Leland +McInnes, Maniteja Nandana, Michele Lacchia, Sergei Lebedev, Shubham Bhardwaj, +akshay0724, omtcyfz, rickiepark, waterponey, Vathsala Achar, jbDelafosse, Ralf +Gommers, Ekaterina Krivich, Vivek Kumar, Ishank Gulati, Dave Elliott, ldirer, +Reiichiro Nakano, Levi John Wolf, Mathieu Blondel, Sid Kapur, Dougal J. +Sutherland, midinas, mikebenfield, Sourav Singh, Aseem Bansal, Ibraim Ganiev, +Stephen Hoover, AishwaryaRK, Steven C. Howell, Gary Foreman, Neeraj Gangwar, +Tahar, Jon Crall, dokato, Kathy Chen, ferria, Thomas Moreau, Charlie Brummitt, +Nicolas Goix, Adam Kleczewski, Sam Shleifer, Nikita Singh, Basil Beirouti, +Giorgio Patrini, Manoj Kumar, Rafael Possas, James Bourbeau, James A. Bednar, +Janine Harper, Jaye, Jean Helie, Jeremy Steward, Artsiom, John Wei, Jonathan +LIgo, Jonathan Rahn, seanpwilliams, Arthur Mensch, Josh Levy, Julian Kuhlmann, +Julien Aubert, Jörn Hees, Kai, shivamgargsya, Kat Hempstalk, Kaushik +Lakshmikanth, Kennedy, Kenneth Lyons, Kenneth Myers, Kevin Yap, Kirill Bobyrev, +Konstantin Podshumok, Arthur Imbert, Lee Murray, toastedcornflakes, Lera, Li +Li, Arthur Douillard, Mainak Jas, tobycheese, Manraj Singh, Manvendra Singh, +Marc Meketon, MarcoFalke, Matthew Brett, Matthias Gilch, Mehul Ahuja, Melanie +Goetz, Meng, Peng, Michael Dezube, Michal Baumgartner, vibrantabhi19, Artem +Golubin, Milen Paskov, Antonin Carette, Morikko, MrMjauh, NALEPA Emmanuel, +Namiya, Antoine Wendlinger, Narine Kokhlikyan, NarineK, Nate Guerin, Angus +Williams, Ang Lu, Nicole Vavrova, Nitish Pandey, Okhlopkov Daniil Olegovich, +Andy Craze, Om Prakash, Parminder Singh, Patrick Carlson, Patrick Pei, Paul +Ganssle, Paulo Haddad, Paweł Lorek, Peng Yu, Pete Bachant, Peter Bull, Peter +Csizsek, Peter Wang, Pieter Arthur de Jong, Ping-Yao, Chang, Preston Parry, +Puneet Mathur, Quentin Hibon, Andrew Smith, Andrew Jackson, 1kastner, Rameshwar +Bhaskaran, Rebecca Bilbro, Remi Rampin, Andrea Esuli, Rob Hall, Robert +Bradshaw, Romain Brault, Aman Pratik, Ruifeng Zheng, Russell Smith, Sachin +Agarwal, Sailesh Choyal, Samson Tan, Samuël Weber, Sarah Brown, Sebastian +Pölsterl, Sebastian Raschka, Sebastian Saeger, Alyssa Batula, Abhyuday Pratap +Singh, Sergey Feldman, Sergul Aydore, Sharan Yalburgi, willduan, Siddharth +Gupta, Sri Krishna, Almer, Stijn Tonk, Allen Riddell, Theofilos Papapanagiotou, +Alison, Alexis Mignon, Tommy Boucher, Tommy Löfstedt, Toshihiro Kamishima, +Tyler Folkman, Tyler Lanigan, Alexander Junge, Varun Shenoy, Victor Poughon, +Vilhelm von Ehrenheim, Aleksandr Sandrovskii, Alan Yee, Vlasios Vasileiou, +Warut 
Vijitbenjaronk, Yang Zhang, Yaroslav Halchenko, Yichuan Liu, Yuichi
+Fujikawa, affanv14, aivision2020, xor, andreh7, brady salz, campustrampus,
+Agamemnon Krasoulis, ditenberg, elena-sharova, filipj8, fukatani, gedeck,
+guiniol, guoci, hakaa1, hongkahjun, i-am-xhy, jakirkham, jaroslaw-weber,
+jayzed82, jeroko, jmontoyam, jonathan.striebel, josephsalmon, jschendel,
+leereeves, martin-hahn, mathurinm, mehak-sachdeva, mlewis1729, mlliou112,
+mthorrell, ndingwall, nuffe, yangarbiter, plagree, pldtc325, Breno Freitas,
+Brett Olsen, Brian A. Alfano, Brian Burns, polmauri, Brandon Carter, Charlton
+Austin, Chayant T15h, Chinmaya Pancholi, Christian Danielsen, Chung Yen,
+Chyi-Kwei Yau, pravarmahajan, DOHMATOB Elvis, Daniel LeJeune, Daniel Hnyk,
+Darius Morawiec, David DeTomaso, David Gasquez, David Haberthür, David
+Heryanto, David Kirkby, David Nicholson, rashchedrin, Deborah Gertrude Digges,
+Denis Engemann, Devansh D, Dickson, Bob Baxley, Don86, E. Lynch-Klarup, Ed
+Rogers, Elizabeth Ferriss, Ellen-Co2, Fabian Egli, Fang-Chieh Chou, Bing Tian
+Dai, Greg Stupp, Grzegorz Szpak, Bertrand Thirion, Hadrien Bertrand, Harizo
+Rajaona, zxcvbnius, Henry Lin, Holger Peters, Icyblade Dai, Igor
+Andriushchenko, Ilya, Isaac Laughlin, Iván Vallés, Aurélien Bellet, JPFrancoia,
+Jacob Schreiber, Asish Mahapatra
+
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
new file mode 100644
index 0000000000000..06bcc9a4e6cf8
--- /dev/null
+++ b/doc/whats_new/v0.20.rst
@@ -0,0 +1,113 @@
+.. include:: _contributors.rst
+
+.. currentmodule:: sklearn
+
+.. _changes_0_20:
+
+Version 0.20 (under development)
+================================
+
+Changed models
+--------------
+
+The following estimators and functions, when fit with the same data and
+parameters, may produce different models from the previous version. This often
+occurs due to changes in the modelling logic (bug fixes or enhancements), or in
+random sampling procedures.
+
+- :class:`decomposition.IncrementalPCA` in Python 2 (bug fix)
+- :class:`isotonic.IsotonicRegression` (bug fix)
+
+Details are listed in the changelog below.
+
+(While we are trying to better inform users by providing this information, we
+cannot assure that this list is complete.)
+
+Changelog
+---------
+
+New features
+............
+
+Classifiers and regressors
+
+- :class:`ensemble.GradientBoostingClassifier` and
+  :class:`ensemble.GradientBoostingRegressor` now support early stopping
+  via ``n_iter_no_change``, ``validation_fraction`` and ``tol`` (see the
+  sketch below). :issue:`7071` by `Raghav RV`_.
+
+- Added :class:`naive_bayes.ComplementNB`, which implements the Complement
+  Naive Bayes classifier described in Rennie et al. (2003).
+  By :user:`Michael A. Alcorn `.
+
+Enhancements
+............
+
+Classifiers and regressors
+
+- In :class:`gaussian_process.GaussianProcessRegressor`, the ``predict``
+  method is faster when using ``return_std=True``, in particular when called
+  several times in a row. :issue:`9234` by :user:`andrewww `
+  and :user:`Minghui Liu `.
+
+- Added the ``named_estimators_`` attribute in
+  :class:`ensemble.VotingClassifier` to access fitted
+  estimators. :issue:`9157` by :user:`Herilalaina Rakotoarison `.
+
+Model evaluation and meta-estimators
+
+- A scorer based on :func:`metrics.brier_score_loss` is also available.
+  :issue:`9521` by :user:`Hanmin Qin `.
+
+Linear, kernelized and related models
+
+- Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the
+  underlying implementation is not random.
+  :issue:`9497` by :user:`Albert Thomas `.
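+
+A minimal sketch of the early-stopping configuration referenced above (toy
+data invented for this illustration)::
+
+    from sklearn.datasets import make_classification
+    from sklearn.ensemble import GradientBoostingClassifier
+
+    X, y = make_classification(n_samples=1000, random_state=0)
+
+    # Training stops once the score on a held-out validation_fraction of
+    # the training set fails to improve by more than tol for
+    # n_iter_no_change consecutive iterations.
+    gbc = GradientBoostingClassifier(n_estimators=1000,
+                                     validation_fraction=0.1,
+                                     n_iter_no_change=5, tol=1e-4,
+                                     random_state=0).fit(X, y)
+    gbc.n_estimators_  # typically far fewer than 1000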
+
+Bug fixes
+.........
+
+Classifiers and regressors
+
+- Fixed a bug in :class:`isotonic.IsotonicRegression` which incorrectly
+  combined weights when fitting a model to data involving points with
+  identical X values.
+  :issue:`9432` by :user:`Dallas Card `.
+
+Decomposition, manifold learning and clustering
+
+- Fix for uninformative error in :class:`decomposition.IncrementalPCA`:
+  now an error is raised if the number of components is larger than the
+  chosen batch size. The ``n_components=None`` case was adapted accordingly.
+  :issue:`6452`. By :user:`Wally Gauze `.
+
+- Fixed a bug where the ``partial_fit`` method of
+  :class:`decomposition.IncrementalPCA` used integer division instead of float
+  division on Python 2 versions. :issue:`9492` by
+  :user:`James Bourbeau `.
+
+- Fixed a bug where the ``fit`` method of
+  :class:`cluster.affinity_propagation_.AffinityPropagation` stored cluster
+  centers as a 3d array instead of a 2d array in case of non-convergence. For
+  the same class, fixed undefined and arbitrary behavior in case of training
+  data where all samples had equal similarity (see the sketch below).
+  :issue:`9612`. By :user:`Jonatan Samoocha `.
+
+- In :class:`decomposition.PCA`, selecting an ``n_components`` parameter
+  greater than the number of samples now raises an error.
+  Similarly, the ``n_components=None`` case now selects the minimum of
+  ``n_samples`` and ``n_features``. :issue:`8484`. By :user:`Wally Gauze `.
+
+- Fixed a bug in :func:`datasets.fetch_kddcup99`, where data were not properly
+  shuffled. :issue:`9731` by `Nicolas Goix`_.
+
+API changes summary
+-------------------
+
+Linear, kernelized and related models
+
+- Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the
+  underlying implementation is not random.
+  :issue:`9497` by :user:`Albert Thomas `.
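+
+A minimal sketch of the now-defined non-convergence behaviour of
+:class:`cluster.AffinityPropagation` (mirroring the tests added in this
+patch; the data and the ``max_iter=1`` forcing are invented for
+illustration)::
+
+    import numpy as np
+    from sklearn.cluster import AffinityPropagation
+
+    X = np.array([[0, 0], [1, 1], [-2, -2]])
+
+    # Force non-convergence by allowing only a single iteration; a
+    # ConvergenceWarning is emitted, cluster_centers_ is empty, and every
+    # sample (including at predict time) is labelled -1.
+    af = AffinityPropagation(preference=-10, max_iter=1).fit(X)
+    af.cluster_centers_.shape       # (0, 2)
+    af.labels_                      # array([-1, -1, -1])
+    af.predict(np.array([[2, 2]]))  # array([-1])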
""" -print(__doc__) +from __future__ import print_function # Author: Gael Varoquaux gael.varoquaux@normalesup.org # License: BSD 3 clause +import sys from datetime import datetime import numpy as np @@ -73,9 +74,8 @@ from six.moves.urllib.parse import urlencode from sklearn import cluster, covariance, manifold +print(__doc__) -# ############################################################################# -# Retrieve the data from Internet def retry(f, n_attempts=3): "Wrapper function to retry function calls in case of exceptions" @@ -83,7 +83,7 @@ def wrapper(*args, **kwargs): for i in range(n_attempts): try: return f(*args, **kwargs) - except Exception as e: + except Exception: if i == n_attempts - 1: raise return wrapper @@ -120,15 +120,33 @@ def quotes_historical_google(symbol, date1, date2): 'formats': ['object', 'f4', 'f4', 'f4', 'f4', 'f4'] } converters = {0: lambda s: datetime.strptime(s.decode(), '%d-%b-%y')} - return np.genfromtxt(response, delimiter=',', skip_header=1, + data = np.genfromtxt(response, delimiter=',', skip_header=1, dtype=dtype, converters=converters, missing_values='-', filling_values=-1) + expected_len_data = 1258 + len_data = len(data) + min_date = data['date'].min() + max_date = data['date'].max() + if (len_data != expected_len_data or min_date != d1 or max_date != d2): + message = ( + 'Got wrong data for symbol {}, url {}\n' + ' - min_date should be {}, got {}\n' + ' - max_date should be {}, got {}\n' + ' - len(data) should be {}, got {}'.format( + symbol, url, + d1.date(), min_date.date(), + d2.date(), max_date.date(), + expected_len_data, len_data)) + raise ValueError(message) + return data +# ############################################################################# +# Retrieve the data from Internet # Choose a time period reasonably calm (not too long ago so that we get # high-tech firms, and before the 2008 crash) -d1 = datetime(2003, 1, 1) -d2 = datetime(2008, 1, 1) +d1 = datetime(2003, 1, 2) +d2 = datetime(2007, 12, 31) symbol_dict = { 'TOT': 'Total', @@ -170,7 +188,7 @@ def quotes_historical_google(symbol, date1, date2): 'BAC': 'Bank of America', 'GS': 'Goldman Sachs', 'AAPL': 'Apple', - 'SAP': 'SAP', + 'NYSE:SAP': 'SAP', 'CSCO': 'Cisco', 'TXN': 'Texas Instruments', 'XRX': 'Xerox', @@ -188,13 +206,15 @@ def quotes_historical_google(symbol, date1, date2): 'CAT': 'Caterpillar', 'DD': 'DuPont de Nemours'} -symbols, names = np.array(list(symbol_dict.items())).T +symbols, names = np.array(sorted(symbol_dict.items())).T # retry is used because quotes_historical_google can temporarily fail # for various reasons (e.g. empty result from Google API). 
-quotes = [ - retry(quotes_historical_google)(symbol, d1, d2) for symbol in symbols -] +quotes = [] + +for symbol in symbols: + print('Fetching quote history for %r' % symbol, file=sys.stderr) + quotes.append(retry(quotes_historical_google)(symbol, d1, d2)) close_prices = np.vstack([q['close'] for q in quotes]) open_prices = np.vstack([q['open'] for q in quotes]) diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py index 0bda5c66ce4a3..7ef4ad6353654 100644 --- a/examples/cluster/plot_color_quantization.py +++ b/examples/cluster/plot_color_quantization.py @@ -84,21 +84,18 @@ def recreate_image(codebook, labels, w, h): # Display all results, alongside original image plt.figure(1) plt.clf() -ax = plt.axes([0, 0, 1, 1]) plt.axis('off') plt.title('Original image (96,615 colors)') plt.imshow(china) plt.figure(2) plt.clf() -ax = plt.axes([0, 0, 1, 1]) plt.axis('off') plt.title('Quantized image (64 colors, K-Means)') plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h)) plt.figure(3) plt.clf() -ax = plt.axes([0, 0, 1, 1]) plt.axis('off') plt.title('Quantized image (64 colors, Random)') plt.imshow(recreate_image(codebook_random, labels_random, w, h)) diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py index ac2fde3e2cc6a..6d33f01e6a7cb 100644 --- a/examples/cluster/plot_dict_face_patches.py +++ b/examples/cluster/plot_dict_face_patches.py @@ -41,7 +41,6 @@ patch_size = (20, 20) buffer = [] -index = 1 t0 = time.time() # The online learning part: cycle over the whole dataset 6 times diff --git a/examples/cluster/plot_kmeans_stability_low_dim_dense.py b/examples/cluster/plot_kmeans_stability_low_dim_dense.py index b5d4326c5c713..109d2097b6be9 100644 --- a/examples/cluster/plot_kmeans_stability_low_dim_dense.py +++ b/examples/cluster/plot_kmeans_stability_low_dim_dense.py @@ -69,7 +69,7 @@ def make_data(random_state, n_samples_per_center, grid_size, scale): # Part 1: Quantitative evaluation of various init methods -fig = plt.figure() +plt.figure() plots = [] legends = [] @@ -105,7 +105,7 @@ def make_data(random_state, n_samples_per_center, grid_size, scale): km = MiniBatchKMeans(n_clusters=n_clusters, init='random', n_init=1, random_state=random_state).fit(X) -fig = plt.figure() +plt.figure() for k in range(n_clusters): my_members = km.labels_ == k color = cm.spectral(float(k) / n_clusters, 1) diff --git a/examples/decomposition/plot_pca_3d.py b/examples/decomposition/plot_pca_3d.py index d9db17ffaec39..58494f7ef816d 100644 --- a/examples/decomposition/plot_pca_3d.py +++ b/examples/decomposition/plot_pca_3d.py @@ -73,8 +73,6 @@ def plot_figs(fig_num, elev, azim): pca_score = pca.explained_variance_ratio_ V = pca.components_ - x_pca_axis, y_pca_axis, z_pca_axis = V.T * pca_score / pca_score.min() - x_pca_axis, y_pca_axis, z_pca_axis = 3 * V.T x_pca_plane = np.r_[x_pca_axis[:2], - x_pca_axis[1::-1]] y_pca_plane = np.r_[y_pca_axis[:2], - y_pca_axis[1::-1]] diff --git a/examples/ensemble/plot_bias_variance.py b/examples/ensemble/plot_bias_variance.py index 8d88f99df1668..0f0a2478472c3 100644 --- a/examples/ensemble/plot_bias_variance.py +++ b/examples/ensemble/plot_bias_variance.py @@ -88,12 +88,14 @@ n_estimators = len(estimators) + # Generate data def f(x): x = x.ravel() return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2) + def generate(n_samples, noise, n_repeat=1): X = np.random.rand(n_samples) * 10 - 5 X = np.sort(X) @@ -110,6 +112,7 @@ def generate(n_samples, noise, n_repeat=1): return X, y 
+ X_train = [] y_train = [] @@ -120,6 +123,8 @@ def generate(n_samples, noise, n_repeat=1): X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat) +plt.figure(figsize=(10, 8)) + # Loop over estimators to compare for n, (name, estimator) in enumerate(estimators): # Compute predictions @@ -166,8 +171,8 @@ def generate(n_samples, noise, n_repeat=1): plt.xlim([-5, 5]) plt.title(name) - if n == 0: - plt.legend(loc="upper left", prop={"size": 11}) + if n == n_estimators - 1: + plt.legend(loc=(1.1, .5)) plt.subplot(2, n_estimators, n_estimators + n + 1) plt.plot(X_test, y_error, "r", label="$error(x)$") @@ -178,7 +183,9 @@ def generate(n_samples, noise, n_repeat=1): plt.xlim([-5, 5]) plt.ylim([0, 0.1]) - if n == 0: - plt.legend(loc="upper left", prop={"size": 11}) + if n == n_estimators - 1: + + plt.legend(loc=(1.1, .5)) +plt.subplots_adjust(right=.75) plt.show() diff --git a/examples/ensemble/plot_forest_iris.py b/examples/ensemble/plot_forest_iris.py index f0fd5dc7d003e..73db88d829b1f 100644 --- a/examples/ensemble/plot_forest_iris.py +++ b/examples/ensemble/plot_forest_iris.py @@ -46,7 +46,6 @@ import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap -from sklearn import clone from sklearn.datasets import load_iris from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier) @@ -90,10 +89,9 @@ X = (X - mean) / std # Train - clf = clone(model) - clf = model.fit(X, y) + model.fit(X, y) - scores = clf.score(X, y) + scores = model.score(X, y) # Create a title for each column and the console by using str() and # slicing away useless parts of the string model_title = str(type(model)).split( diff --git a/examples/ensemble/plot_gradient_boosting_early_stopping.py b/examples/ensemble/plot_gradient_boosting_early_stopping.py index 323aa67bd5040..366d9e0b148d6 100644 --- a/examples/ensemble/plot_gradient_boosting_early_stopping.py +++ b/examples/ensemble/plot_gradient_boosting_early_stopping.py @@ -102,8 +102,6 @@ bar2 = plt.bar(index + bar_width, score_gbes, bar_width, label='With early stopping', color='coral') -max_y = np.amax(np.maximum(score_gb, score_gbes)) - plt.xticks(index + bar_width, names) plt.yticks(np.arange(0, 1.3, 0.1)) diff --git a/examples/gaussian_process/plot_gpc_isoprobability.py b/examples/gaussian_process/plot_gpc_isoprobability.py index 2a27434cf148f..0639a65a384a4 100644 --- a/examples/gaussian_process/plot_gpc_isoprobability.py +++ b/examples/gaussian_process/plot_gpc_isoprobability.py @@ -85,7 +85,7 @@ def g(x): plt.plot(X[y > 0, 0], X[y > 0, 1], 'b.', markersize=12) -cs = plt.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot') +plt.contour(x1, x2, y_true, [0.], colors='k', linestyles='dashdot') cs = plt.contour(x1, x2, y_prob, [0.666], colors='b', linestyles='solid') diff --git a/examples/gaussian_process/plot_gpr_noisy_targets.py b/examples/gaussian_process/plot_gpr_noisy_targets.py index e90b5e57ad257..8841f04a3987f 100644 --- a/examples/gaussian_process/plot_gpr_noisy_targets.py +++ b/examples/gaussian_process/plot_gpr_noisy_targets.py @@ -61,7 +61,7 @@ def f(x): # Plot the function, the prediction and the 95% confidence interval based on # the MSE -fig = plt.figure() +plt.figure() plt.plot(x, f(x), 'r:', label=u'$f(x) = x\,\sin(x)$') plt.plot(X, y, 'r.', markersize=10, label=u'Observations') plt.plot(x, y_pred, 'b-', label=u'Prediction') @@ -97,7 +97,7 @@ def f(x): # Plot the function, the prediction and the 95% confidence interval based on # the MSE -fig = plt.figure() +plt.figure() 
plt.plot(x, f(x), 'r:', label=u'$f(x) = x\,\sin(x)$') plt.errorbar(X.ravel(), y, dy, fmt='r.', markersize=10, label=u'Observations') plt.plot(x, y_pred, 'b-', label=u'Prediction') diff --git a/examples/linear_model/plot_lasso_coordinate_descent_path.py b/examples/linear_model/plot_lasso_coordinate_descent_path.py index 7b6d2a52cae87..3cd96d6692e8d 100644 --- a/examples/linear_model/plot_lasso_coordinate_descent_path.py +++ b/examples/linear_model/plot_lasso_coordinate_descent_path.py @@ -47,8 +47,6 @@ # Display results plt.figure(1) -ax = plt.gca() - colors = cycle(['b', 'r', 'g', 'c', 'k']) neg_log_alphas_lasso = -np.log10(alphas_lasso) neg_log_alphas_enet = -np.log10(alphas_enet) @@ -64,7 +62,6 @@ plt.figure(2) -ax = plt.gca() neg_log_alphas_positive_lasso = -np.log10(alphas_positive_lasso) for coef_l, coef_pl, c in zip(coefs_lasso, coefs_positive_lasso, colors): l1 = plt.plot(neg_log_alphas_lasso, coef_l, c=c) @@ -78,7 +75,6 @@ plt.figure(3) -ax = plt.gca() neg_log_alphas_positive_enet = -np.log10(alphas_positive_enet) for (coef_e, coef_pe, c) in zip(coefs_enet, coefs_positive_enet, colors): l1 = plt.plot(neg_log_alphas_enet, coef_e, c=c) diff --git a/examples/neighbors/plot_digits_kde_sampling.py b/examples/neighbors/plot_digits_kde_sampling.py index ba59fb5ece537..8367d16b955fe 100644 --- a/examples/neighbors/plot_digits_kde_sampling.py +++ b/examples/neighbors/plot_digits_kde_sampling.py @@ -20,7 +20,6 @@ # load the data digits = load_digits() -data = digits.data # project the 64-dimensional data to a lower dimension pca = PCA(n_components=15, whiten=False) diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py index 5c8543937beba..f46b7ece7cd78 100644 --- a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py +++ b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py @@ -65,7 +65,8 @@ print("Iteration %i %s" % (i, 70 * "_")) print("Label Spreading model: %d labeled & %d unlabeled (%d total)" - % (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)) + % (n_labeled_points, n_total_samples - n_labeled_points, + n_total_samples)) print(classification_report(true_labels, predicted_labels)) @@ -95,7 +96,7 @@ # for more than 5 iterations, visualize the gain only on the first 5 if i < 5: sub = f.add_subplot(5, 5, index + 1 + (5 * i)) - sub.imshow(image, cmap=plt.cm.gray_r) + sub.imshow(image, cmap=plt.cm.gray_r, interpolation='none') sub.set_title("predict: %i\ntrue: %i" % ( lp_model.transduction_[image_index], y[image_index]), size=10) sub.axis('off') @@ -108,6 +109,7 @@ n_labeled_points += len(uncertainty_index) f.suptitle("Active learning with Label Propagation.\nRows show 5 most " - "uncertain labels to learn with the next model.") -plt.subplots_adjust(0.12, 0.03, 0.9, 0.8, 0.2, 0.45) + "uncertain labels to learn with the next model.", y=1.15) +plt.subplots_adjust(left=0.2, bottom=0.03, right=0.9, top=0.9, wspace=0.2, + hspace=0.85) plt.show() diff --git a/examples/tree/plot_tree_regression_multioutput.py b/examples/tree/plot_tree_regression_multioutput.py index 005f73683921b..b47bfcd80e49a 100644 --- a/examples/tree/plot_tree_regression_multioutput.py +++ b/examples/tree/plot_tree_regression_multioutput.py @@ -42,7 +42,6 @@ # Plot the results plt.figure() -s = 50 s = 25 plt.scatter(y[:, 0], y[:, 1], c="navy", s=s, edgecolor="black", label="data") diff --git a/sklearn/_isotonic.pyx b/sklearn/_isotonic.pyx index 
1cec075fc6fc7..ff18e3cad7312 100644 --- a/sklearn/_isotonic.pyx +++ b/sklearn/_isotonic.pyx @@ -100,7 +100,7 @@ def _make_unique(np.ndarray[dtype=np.float64_t] X, if x != current_x: # next unique value x_out[i] = current_x - weights_out[i] = current_weight / current_count + weights_out[i] = current_weight y_out[i] = current_y / current_weight i += 1 current_x = x @@ -113,6 +113,6 @@ def _make_unique(np.ndarray[dtype=np.float64_t] X, current_count += 1 x_out[i] = current_x - weights_out[i] = current_weight / current_count + weights_out[i] = current_weight y_out[i] = current_y / current_weight return x_out, y_out, weights_out diff --git a/sklearn/base.py b/sklearn/base.py index aa4f9f9ce17c1..d97fe92ccdd47 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -225,21 +225,7 @@ def get_params(self, deep=True): """ out = dict() for key in self._get_param_names(): - # We need deprecation warnings to always be on in order to - # catch deprecated param values. - # This is set in utils/__init__.py but it gets overwritten - # when running under python3 somehow. - warnings.simplefilter("always", DeprecationWarning) - try: - with warnings.catch_warnings(record=True) as w: - value = getattr(self, key, None) - if len(w) and w[0].category == DeprecationWarning: - # if the parameter is deprecated, don't show it - continue - finally: - warnings.filters.pop(0) - - # XXX: should we rather test if instance of estimator? + value = getattr(self, key, None) if deep and hasattr(value, 'get_params'): deep_items = value.get_params().items() out.update((key + '__' + k, val) for k, val in deep_items) @@ -316,7 +302,6 @@ def __setstate__(self, state): self.__dict__.update(state) - ############################################################################### class ClassifierMixin(object): """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py index 47ed14f826f33..d3bbe529b7c25 100644 --- a/sklearn/cluster/affinity_propagation_.py +++ b/sklearn/cluster/affinity_propagation_.py @@ -6,7 +6,9 @@ # License: BSD 3 clause import numpy as np +import warnings +from sklearn.exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin from ..utils import as_float_array, check_array from ..utils.validation import check_is_fitted @@ -14,6 +16,20 @@ from ..metrics import pairwise_distances_argmin +def _equal_similarities_and_preferences(S, preference): + def all_equal_preferences(): + return np.all(preference == preference.flat[0]) + + def all_equal_similarities(): + # Create mask to ignore diagonal of S + mask = np.ones(S.shape, dtype=bool) + np.fill_diagonal(mask, 0) + + return np.all(S[mask].flat == S[mask].flat[0]) + + return all_equal_preferences() and all_equal_similarities() + + def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, damping=0.5, copy=True, verbose=False, return_n_iter=False): @@ -74,6 +90,16 @@ def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, For an example, see :ref:`examples/cluster/plot_affinity_propagation.py `. + When the algorithm does not converge, it returns an empty array as + ``cluster_center_indices`` and ``-1`` as label for each training sample. + + When all training samples have equal similarities and equal preferences, + the assignment of cluster centers and labels depends on the preference. + If the preference is smaller than the similarities, a single cluster center + and label ``0`` for every sample will be returned. 
Otherwise, every + training sample becomes its own cluster center and is assigned a unique + label. + References ---------- Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages @@ -90,6 +116,23 @@ def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, if damping < 0.5 or damping >= 1: raise ValueError('damping must be >= 0.5 and < 1') + preference = np.array(preference) + + if (n_samples == 1 or + _equal_similarities_and_preferences(S, preference)): + # It makes no sense to run the algorithm in this case, so return 1 or + # n_samples clusters, depending on preferences + warnings.warn("All samples have mutually equal similarities. " + "Returning arbitrary cluster center(s).") + if preference.flat[0] >= S.flat[n_samples - 1]: + return ((np.arange(n_samples), np.arange(n_samples), 0) + if return_n_iter + else (np.arange(n_samples), np.arange(n_samples))) + else: + return ((np.array([0]), np.array([0] * n_samples), 0) + if return_n_iter + else (np.array([0]), np.array([0] * n_samples))) + random_state = np.random.RandomState(0) # Place preference on the diagonal of S @@ -177,9 +220,10 @@ def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, cluster_centers_indices = np.unique(labels) labels = np.searchsorted(cluster_centers_indices, labels) else: - labels = np.empty((n_samples, 1)) - cluster_centers_indices = None - labels.fill(np.nan) + warnings.warn("Affinity propagation did not converge, this model " + "will not have any cluster centers.", ConvergenceWarning) + labels = np.array([-1] * n_samples) + cluster_centers_indices = [] if return_n_iter: return cluster_centers_indices, labels, it + 1 @@ -254,6 +298,17 @@ class AffinityPropagation(BaseEstimator, ClusterMixin): The algorithmic complexity of affinity propagation is quadratic in the number of points. + When ``fit`` does not converge, ``cluster_centers_`` becomes an empty + array and all training samples will be labelled as ``-1``. In addition, + ``predict`` will then label every sample as ``-1``. + + When all training samples have equal similarities and equal preferences, + the assignment of cluster centers and labels depends on the preference. + If the preference is smaller than the similarities, ``fit`` will result in + a single cluster center and label ``0`` for every sample. Otherwise, every + training sample becomes its own cluster center and is assigned a unique + label. + References ---------- @@ -287,6 +342,9 @@ def fit(self, X, y=None): X : array-like, shape (n_samples, n_features) or (n_samples, n_samples) Data matrix or, if affinity is ``precomputed``, matrix of similarities / affinities. + + y : Ignored + """ X = check_array(X, accept_sparse='csr') if self.affinity == "precomputed": @@ -327,4 +385,10 @@ def predict(self, X): raise ValueError("Predict method is not supported when " "affinity='precomputed'.") - return pairwise_distances_argmin(X, self.cluster_centers_) + if self.cluster_centers_.size > 0: + return pairwise_distances_argmin(X, self.cluster_centers_) + else: + warnings.warn("This model does not have any cluster centers " + "because affinity propagation did not converge. 
" + "Labeling every sample as '-1'.") + return np.array([-1] * X.shape[0]) diff --git a/sklearn/cluster/bicluster.py b/sklearn/cluster/bicluster.py index 38319a5d8c88b..6c61d6b983bbe 100644 --- a/sklearn/cluster/bicluster.py +++ b/sklearn/cluster/bicluster.py @@ -117,6 +117,8 @@ def fit(self, X, y=None): ---------- X : array-like, shape (n_samples, n_features) + y : Ignored + """ X = check_array(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() diff --git a/sklearn/cluster/birch.py b/sklearn/cluster/birch.py index 04d7726743b06..d2dcd8d9a016f 100644 --- a/sklearn/cluster/birch.py +++ b/sklearn/cluster/birch.py @@ -441,6 +441,9 @@ def fit(self, X, y=None): ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Input data. + + y : Ignored + """ self.fit_, self.partial_fit_ = True, False return self._fit(X) @@ -521,6 +524,9 @@ def partial_fit(self, X=None, y=None): X : {array-like, sparse matrix}, shape (n_samples, n_features), None Input data. If X is not provided, only the global clustering step is done. + + y : Ignored + """ self.partial_fit_, self.fit_ = True, False if X is None: diff --git a/sklearn/cluster/dbscan_.py b/sklearn/cluster/dbscan_.py index 115e534b448cb..45bedb26e76b1 100644 --- a/sklearn/cluster/dbscan_.py +++ b/sklearn/cluster/dbscan_.py @@ -275,6 +275,9 @@ def fit(self, X, y=None, sample_weight=None): ``min_samples`` is by itself a core sample; a sample with negative weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. + + y : Ignored + """ X = check_array(X, accept_sparse='csr') clust = dbscan(X, sample_weight=sample_weight, @@ -303,6 +306,8 @@ def fit_predict(self, X, y=None, sample_weight=None): weight may inhibit its eps-neighbor from being core. Note that weights are absolute, and default to 1. + y : Ignored + Returns ------- y : ndarray, shape (n_samples,) diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index 966ed5e2cc121..c8ead243192b0 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -685,7 +685,10 @@ def fit(self, X, y=None): Parameters ---------- X : array-like, shape = [n_samples, n_features] - The samples a.k.a. observations. + Training data. Shape [n_samples, n_features], or [n_samples, + n_samples] if affinity=='precomputed'. + + y : Ignored Returns ------- @@ -834,6 +837,8 @@ def fit(self, X, y=None, **params): X : array-like, shape = [n_samples, n_features] The data + y : Ignored + Returns ------- self diff --git a/sklearn/cluster/k_means_.py b/sklearn/cluster/k_means_.py index af2fc67e083db..06f26b52aa0e6 100644 --- a/sklearn/cluster/k_means_.py +++ b/sklearn/cluster/k_means_.py @@ -879,6 +879,9 @@ def fit(self, X, y=None): ---------- X : array-like or sparse matrix, shape=(n_samples, n_features) Training instances to cluster. + + y : Ignored + """ random_state = check_random_state(self.random_state) X = self._check_fit_data(X) @@ -904,6 +907,8 @@ def fit_predict(self, X, y=None): X : {array-like, sparse matrix}, shape = [n_samples, n_features] New data to transform. + u : Ignored + Returns ------- labels : array, shape [n_samples,] @@ -921,6 +926,8 @@ def fit_transform(self, X, y=None): X : {array-like, sparse matrix}, shape = [n_samples, n_features] New data to transform. + y : Ignored + Returns ------- X_new : array, shape [n_samples, k] @@ -990,6 +997,8 @@ def score(self, X, y=None): X : {array-like, sparse matrix}, shape = [n_samples, n_features] New data. 
+ y : Ignored + Returns ------- score : float @@ -1336,6 +1345,9 @@ def fit(self, X, y=None): ---------- X : array-like or sparse matrix, shape=(n_samples, n_features) Training instances to cluster. + + y : Ignored + """ random_state = check_random_state(self.random_state) X = check_array(X, accept_sparse="csr", order='C', @@ -1498,6 +1510,9 @@ def partial_fit(self, X, y=None): ---------- X : array-like, shape = [n_samples, n_features] Coordinates of the data points to cluster. + + y : Ignored + """ X = check_array(X, accept_sparse="csr") diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py index b1680fea3f2e7..37c31777a5a1f 100644 --- a/sklearn/cluster/mean_shift_.py +++ b/sklearn/cluster/mean_shift_.py @@ -389,6 +389,9 @@ def fit(self, X, y=None): ----------- X : array-like, shape=[n_samples, n_features] Samples to cluster. + + y : Ignored + """ X = check_array(X) self.cluster_centers_, self.labels_ = \ diff --git a/sklearn/cluster/spectral.py b/sklearn/cluster/spectral.py index 5f5f0a4e9d452..8532110acb6c4 100644 --- a/sklearn/cluster/spectral.py +++ b/sklearn/cluster/spectral.py @@ -432,6 +432,9 @@ def fit(self, X, y=None): X : array-like or sparse matrix, shape (n_samples, n_features) OR, if affinity==`precomputed`, a precomputed affinity matrix of shape (n_samples, n_samples) + + y : Ignored + """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64) diff --git a/sklearn/cluster/tests/test_affinity_propagation.py b/sklearn/cluster/tests/test_affinity_propagation.py index e0e4091d4d2de..408783cd98ff0 100644 --- a/sklearn/cluster/tests/test_affinity_propagation.py +++ b/sklearn/cluster/tests/test_affinity_propagation.py @@ -5,11 +5,15 @@ import numpy as np -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_raises +from sklearn.exceptions import ConvergenceWarning +from sklearn.utils.testing import ( + assert_equal, assert_false, assert_true, assert_array_equal, assert_raises, + assert_warns, assert_warns_message, assert_no_warnings) from sklearn.cluster.affinity_propagation_ import AffinityPropagation +from sklearn.cluster.affinity_propagation_ import ( + _equal_similarities_and_preferences +) from sklearn.cluster.affinity_propagation_ import affinity_propagation from sklearn.datasets.samples_generator import make_blobs from sklearn.metrics import euclidean_distances @@ -78,3 +82,81 @@ def test_affinity_propagation_predict_error(): af = AffinityPropagation(affinity="precomputed") af.fit(S) assert_raises(ValueError, af.predict, X) + + +def test_affinity_propagation_fit_non_convergence(): + # In case of non-convergence of affinity_propagation(), the cluster + # centers should be an empty array and training samples should be labelled + # as noise (-1) + X = np.array([[0, 0], [1, 1], [-2, -2]]) + + # Force non-convergence by allowing only a single iteration + af = AffinityPropagation(preference=-10, max_iter=1) + + assert_warns(ConvergenceWarning, af.fit, X) + assert_array_equal(np.empty((0, 2)), af.cluster_centers_) + assert_array_equal(np.array([-1, -1, -1]), af.labels_) + + +def test_affinity_propagation_equal_mutual_similarities(): + X = np.array([[-1, 1], [1, -1]]) + S = -euclidean_distances(X, squared=True) + + # setting preference > similarity + cluster_center_indices, labels = assert_warns_message( + UserWarning, "mutually equal", affinity_propagation, S, preference=0) + + # expect every sample to become an exemplar + assert_array_equal([0, 1], 
cluster_center_indices) + assert_array_equal([0, 1], labels) + + # setting preference < similarity + cluster_center_indices, labels = assert_warns_message( + UserWarning, "mutually equal", affinity_propagation, S, preference=-10) + + # expect one cluster, with arbitrary (first) sample as exemplar + assert_array_equal([0], cluster_center_indices) + assert_array_equal([0, 0], labels) + + # setting different preferences + cluster_center_indices, labels = assert_no_warnings( + affinity_propagation, S, preference=[-20, -10]) + + # expect one cluster, with highest-preference sample as exemplar + assert_array_equal([1], cluster_center_indices) + assert_array_equal([0, 0], labels) + + +def test_affinity_propagation_predict_non_convergence(): + # In case of non-convergence of affinity_propagation(), the cluster + # centers should be an empty array + X = np.array([[0, 0], [1, 1], [-2, -2]]) + + # Force non-convergence by allowing only a single iteration + af = AffinityPropagation(preference=-10, max_iter=1).fit(X) + + # At prediction time, consider new samples as noise since there are no + # clusters + assert_array_equal(np.array([-1, -1, -1]), + af.predict(np.array([[2, 2], [3, 3], [4, 4]]))) + + +def test_equal_similarities_and_preferences(): + # Unequal distances + X = np.array([[0, 0], [1, 1], [-2, -2]]) + S = -euclidean_distances(X, squared=True) + + assert_false(_equal_similarities_and_preferences(S, np.array(0))) + assert_false(_equal_similarities_and_preferences(S, np.array([0, 0]))) + assert_false(_equal_similarities_and_preferences(S, np.array([0, 1]))) + + # Equal distances + X = np.array([[0, 0], [1, 1]]) + S = -euclidean_distances(X, squared=True) + + # Different preferences + assert_false(_equal_similarities_and_preferences(S, np.array([0, 1]))) + + # Same preferences + assert_true(_equal_similarities_and_preferences(S, np.array([0, 0]))) + assert_true(_equal_similarities_and_preferences(S, np.array(0))) diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py index 5bef7255e37da..4b7b769d7017d 100644 --- a/sklearn/datasets/kddcup99.py +++ b/sklearn/datasets/kddcup99.py @@ -140,7 +140,9 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False, Whether to shuffle dataset. random_state : int, RandomState instance or None, optional (default=None) - Random state for shuffling the dataset. + Random state for shuffling the dataset. If subset='SA', this random + state is also used to randomly select the small proportion of abnormal + samples. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used @@ -175,7 +177,7 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False, """ data_home = get_data_home(data_home=data_home) - kddcup99 = _fetch_brute_kddcup99(data_home=data_home, shuffle=shuffle, + kddcup99 = _fetch_brute_kddcup99(data_home=data_home, percent10=percent10, download_if_missing=download_if_missing) @@ -225,12 +227,15 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False, if subset == 'SF': data = np.c_[data[:, 0], data[:, 2], data[:, 4], data[:, 5]] + if shuffle: + data, target = shuffle_method(data, target, random_state=random_state) + return Bunch(data=data, target=target) def _fetch_brute_kddcup99(data_home=None, download_if_missing=True, random_state=None, - shuffle=False, percent10=True): + percent10=True): """Load the kddcup99 dataset, downloading it if necessary. 
@@ -251,9 +256,6 @@ def _fetch_brute_kddcup99(data_home=None, If None, the random number generator is the RandomState instance used by `np.random`. - shuffle : bool, default=False - Whether to shuffle dataset. - percent10 : bool, default=True Whether to load only 10 percent of the data. @@ -372,9 +374,6 @@ def _fetch_brute_kddcup99(data_home=None, X = joblib.load(samples_path) y = joblib.load(targets_path) - if shuffle: - X, y = shuffle_method(X, y, random_state=random_state) - return Bunch(data=X, target=y, DESCR=__doc__) diff --git a/sklearn/datasets/tests/test_base.py b/sklearn/datasets/tests/test_base.py index a7cf278e37e44..04fa79f4160f4 100644 --- a/sklearn/datasets/tests/test_base.py +++ b/sklearn/datasets/tests/test_base.py @@ -27,7 +27,6 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import with_setup DATA_HOME = tempfile.mkdtemp(prefix="scikit_learn_data_home_test_") @@ -85,33 +84,42 @@ def test_default_empty_load_files(): assert_equal(res.DESCR, None) -@with_setup(setup_load_files, teardown_load_files) def test_default_load_files(): - res = load_files(LOAD_FILES_ROOT) - assert_equal(len(res.filenames), 1) - assert_equal(len(res.target_names), 2) - assert_equal(res.DESCR, None) - assert_equal(res.data, [b("Hello World!\n")]) + try: + setup_load_files() + res = load_files(LOAD_FILES_ROOT) + assert_equal(len(res.filenames), 1) + assert_equal(len(res.target_names), 2) + assert_equal(res.DESCR, None) + assert_equal(res.data, [b("Hello World!\n")]) + finally: + teardown_load_files() -@with_setup(setup_load_files, teardown_load_files) def test_load_files_w_categories_desc_and_encoding(): - category = os.path.abspath(TEST_CATEGORY_DIR1).split('/').pop() - res = load_files(LOAD_FILES_ROOT, description="test", - categories=category, encoding="utf-8") - assert_equal(len(res.filenames), 1) - assert_equal(len(res.target_names), 1) - assert_equal(res.DESCR, "test") - assert_equal(res.data, [u("Hello World!\n")]) + try: + setup_load_files() + category = os.path.abspath(TEST_CATEGORY_DIR1).split('/').pop() + res = load_files(LOAD_FILES_ROOT, description="test", + categories=category, encoding="utf-8") + assert_equal(len(res.filenames), 1) + assert_equal(len(res.target_names), 1) + assert_equal(res.DESCR, "test") + assert_equal(res.data, [u("Hello World!\n")]) + finally: + teardown_load_files() -@with_setup(setup_load_files, teardown_load_files) def test_load_files_wo_load_content(): - res = load_files(LOAD_FILES_ROOT, load_content=False) - assert_equal(len(res.filenames), 1) - assert_equal(len(res.target_names), 2) - assert_equal(res.DESCR, None) - assert_equal(res.get('data'), None) + try: + setup_load_files() + res = load_files(LOAD_FILES_ROOT, load_content=False) + assert_equal(len(res.filenames), 1) + assert_equal(len(res.target_names), 2) + assert_equal(res.DESCR, None) + assert_equal(res.get('data'), None) + finally: + teardown_load_files() def test_load_sample_images(): diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py index 498b98f4e67ed..77dc2be185b02 100644 --- a/sklearn/datasets/tests/test_kddcup99.py +++ b/sklearn/datasets/tests/test_kddcup99.py @@ -37,3 +37,13 @@ def test_percent10(): data = fetch_kddcup99('smtp') assert_equal(data.data.shape, (9571, 3)) assert_equal(data.target.shape, (9571,)) + + +def test_shuffle(): + try: + dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True, + 
percent10=True, download_if_missing=False) + except IOError: + raise SkipTest("kddcup99 dataset can not be loaded.") + + assert(any(dataset.target[-100:] == b'normal.')) diff --git a/sklearn/datasets/tests/test_lfw.py b/sklearn/datasets/tests/test_lfw.py index 3e5875a060be1..ac6395c4958be 100644 --- a/sklearn/datasets/tests/test_lfw.py +++ b/sklearn/datasets/tests/test_lfw.py @@ -28,7 +28,7 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import SkipTest -from sklearn.utils.testing import raises +from sklearn.utils.testing import assert_raises SCIKIT_LEARN_DATA = tempfile.mkdtemp(prefix="scikit_learn_lfw_test_") @@ -110,10 +110,9 @@ def teardown_module(): shutil.rmtree(SCIKIT_LEARN_EMPTY_DATA) -@raises(IOError) def test_load_empty_lfw_people(): - fetch_lfw_people(data_home=SCIKIT_LEARN_EMPTY_DATA, - download_if_missing=False) + assert_raises(IOError, fetch_lfw_people, data_home=SCIKIT_LEARN_EMPTY_DATA, + download_if_missing=False) def test_load_fake_lfw_people(): @@ -148,16 +147,15 @@ def test_load_fake_lfw_people(): 'Chen Dupont', 'John Lee', 'Lin Bauman', 'Onur Lopez']) -@raises(ValueError) def test_load_fake_lfw_people_too_restrictive(): - fetch_lfw_people(data_home=SCIKIT_LEARN_DATA, min_faces_per_person=100, - download_if_missing=False) + assert_raises(ValueError, fetch_lfw_people, data_home=SCIKIT_LEARN_DATA, + min_faces_per_person=100, download_if_missing=False) -@raises(IOError) def test_load_empty_lfw_pairs(): - fetch_lfw_pairs(data_home=SCIKIT_LEARN_EMPTY_DATA, - download_if_missing=False) + assert_raises(IOError, fetch_lfw_pairs, + data_home=SCIKIT_LEARN_EMPTY_DATA, + download_if_missing=False) def test_load_fake_lfw_pairs(): diff --git a/sklearn/datasets/tests/test_mldata.py b/sklearn/datasets/tests/test_mldata.py index 1ce22079bdd11..7405b8e025c0f 100644 --- a/sklearn/datasets/tests/test_mldata.py +++ b/sklearn/datasets/tests/test_mldata.py @@ -13,7 +13,6 @@ from sklearn.utils.testing import mock_mldata_urlopen from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import with_setup from sklearn.utils.testing import assert_array_equal @@ -43,10 +42,9 @@ def test_mldata_filename(): assert_equal(mldata_filename(name), desired) -@with_setup(setup_tmpdata, teardown_tmpdata) def test_download(): """Test that fetch_mldata is able to download and cache a data set.""" - + setup_tmpdata() _urlopen_ref = datasets.mldata.urlopen datasets.mldata.urlopen = mock_mldata_urlopen({ 'mock': { @@ -66,10 +64,11 @@ def test_download(): fetch_mldata, 'not_existing_name') finally: datasets.mldata.urlopen = _urlopen_ref + teardown_tmpdata() -@with_setup(setup_tmpdata, teardown_tmpdata) def test_fetch_one_column(): + setup_tmpdata() _urlopen_ref = datasets.mldata.urlopen try: dataname = 'onecol' @@ -90,10 +89,11 @@ def test_fetch_one_column(): assert_equal(dset.data.shape, (3, 2)) finally: datasets.mldata.urlopen = _urlopen_ref + teardown_tmpdata() -@with_setup(setup_tmpdata, teardown_tmpdata) def test_fetch_multiple_column(): + setup_tmpdata() _urlopen_ref = datasets.mldata.urlopen try: # create fake data set in cache @@ -167,3 +167,4 @@ def test_fetch_multiple_column(): finally: datasets.mldata.urlopen = _urlopen_ref + teardown_tmpdata() diff --git a/sklearn/datasets/tests/test_svmlight_format.py b/sklearn/datasets/tests/test_svmlight_format.py index d688dc798237b..2e3b7982476b0 100644 --- a/sklearn/datasets/tests/test_svmlight_format.py +++ 
b/sklearn/datasets/tests/test_svmlight_format.py @@ -15,7 +15,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex -from sklearn.utils.testing import raises from sklearn.utils.testing import assert_in from sklearn.utils.fixes import sp_version @@ -138,20 +137,17 @@ def test_load_compressed(): assert_array_equal(y, ybz) -@raises(ValueError) def test_load_invalid_file(): - load_svmlight_file(invalidfile) + assert_raises(ValueError, load_svmlight_file, invalidfile) -@raises(ValueError) def test_load_invalid_order_file(): - load_svmlight_file(invalidfile2) + assert_raises(ValueError, load_svmlight_file, invalidfile2) -@raises(ValueError) def test_load_zero_based(): f = BytesIO(b("-1 4:1.\n1 0:1\n")) - load_svmlight_file(f, zero_based=False) + assert_raises(ValueError, load_svmlight_file, f, zero_based=False) def test_load_zero_based_auto(): @@ -186,21 +182,19 @@ def test_load_with_qid(): assert_array_equal(X.toarray(), [[.53, .12], [.13, .1], [.87, .12]]) -@raises(ValueError) def test_load_invalid_file2(): - load_svmlight_files([datafile, invalidfile, datafile]) + assert_raises(ValueError, load_svmlight_files, + [datafile, invalidfile, datafile]) -@raises(TypeError) def test_not_a_filename(): # in python 3 integers are valid file opening arguments (taken as unix # file descriptors) - load_svmlight_file(.42) + assert_raises(TypeError, load_svmlight_file, .42) -@raises(IOError) def test_invalid_filename(): - load_svmlight_file("trou pic nic douille") + assert_raises(IOError, load_svmlight_file, "trou pic nic douille") def test_dump(): diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 62cd2cd2aa101..e4b36d120773a 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -824,7 +824,6 @@ def transform(self, X): check_is_fitted(self, 'components_') X = check_array(X) - n_samples, n_features = X.shape code = sparse_encode( X, self.components_, algorithm=self.transform_algorithm, @@ -927,9 +926,9 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples in the number of samples - and n_features is the number of features. + X : Ignored + + y : Ignored Returns ------- @@ -1081,6 +1080,8 @@ def fit(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored + Returns ------- self : object @@ -1251,6 +1252,8 @@ def fit(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored + Returns ------- self : object @@ -1284,6 +1287,8 @@ def partial_fit(self, X, y=None, iter_offset=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored + iter_offset : integer, optional The number of iteration on data batches that has been performed before this call to partial_fit. This is optional: diff --git a/sklearn/decomposition/factor_analysis.py b/sklearn/decomposition/factor_analysis.py index 4440ee90bd84a..481a5e2322e3f 100644 --- a/sklearn/decomposition/factor_analysis.py +++ b/sklearn/decomposition/factor_analysis.py @@ -149,6 +149,8 @@ def fit(self, X, y=None): X : array-like, shape (n_samples, n_features) Training data. 
+ y : Ignored + Returns ------- self @@ -324,7 +326,6 @@ def score_samples(self, X): Xr = X - self.mean_ precision = self.get_precision() n_features = X.shape[1] - log_like = np.zeros(X.shape[0]) log_like = -.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1) log_like -= .5 * (n_features * log(2. * np.pi) - fast_logdet(precision)) @@ -338,6 +339,8 @@ def score(self, X, y=None): X : array, shape (n_samples, n_features) The data + y : Ignored + Returns ------- ll : float diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index fcc11ff643a5e..6cb58a250be78 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -509,6 +509,8 @@ def fit_transform(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. + y : Ignored + Returns ------- X_new : array-like, shape (n_samples, n_components) @@ -524,6 +526,8 @@ def fit(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. + y : Ignored + Returns ------- self diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index f0604001fab53..13e51090dd82e 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -158,7 +158,7 @@ def fit(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. - y : Passthrough for ``Pipeline`` compatibility. + y : Ignored Returns ------- @@ -199,6 +199,8 @@ def partial_fit(self, X, y=None, check_input=True): check_input : bool Run check_array on X. + y : Ignored + Returns ------- self : object diff --git a/sklearn/decomposition/nmf.py b/sklearn/decomposition/nmf.py index 153731cb83651..8b3830470921b 100644 --- a/sklearn/decomposition/nmf.py +++ b/sklearn/decomposition/nmf.py @@ -1211,6 +1211,8 @@ def fit_transform(self, X, y=None, W=None, H=None): X : {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be decomposed + y : Ignored + W : array-like, shape (n_samples, n_components) If init='custom', it is used as initial guess for the solution. @@ -1249,6 +1251,8 @@ def fit(self, X, y=None, **params): X : {array-like, sparse matrix}, shape (n_samples, n_features) Data matrix to be decomposed + y : Ignored + Returns ------- self diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py index e9743c69422fb..01b521cb7a76f 100644 --- a/sklearn/decomposition/online_lda.py +++ b/sklearn/decomposition/online_lda.py @@ -473,6 +473,8 @@ def partial_fit(self, X, y=None): X : array-like or sparse matrix, shape=(n_samples, n_features) Document word matrix. + y : Ignored + Returns ------- self @@ -515,6 +517,8 @@ def fit(self, X, y=None): X : array-like or sparse matrix, shape=(n_samples, n_features) Document word matrix. + y : Ignored + Returns ------- self @@ -714,6 +718,8 @@ def score(self, X, y=None): X : array-like or sparse matrix, shape=(n_samples, n_features) Document word matrix. 
+ y : Ignored + Returns ------- score : float diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 171774321cec0..cbd688f3d748d 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -134,8 +134,12 @@ class PCA(_BasePCA): to guess the dimension if ``0 < n_components < 1`` and svd_solver == 'full', select the number of components such that the amount of variance that needs to be - explained is greater than the percentage specified by n_components - n_components cannot be equal to n_features for svd_solver == 'arpack'. + explained is greater than the percentage specified by n_components. + If svd_solver == 'arpack', the number of components must be strictly + less than the minimum of n_features and n_samples. + Hence, the None case results in: + + n_components == min(n_samples, n_features) - 1 copy : bool (default True) If False, data passed to fit are overwritten and running @@ -166,7 +170,7 @@ class PCA(_BasePCA): arpack : run SVD truncated to n_components calling ARPACK solver via `scipy.sparse.linalg.svds`. It requires strictly - 0 < n_components < X.shape[1] + 0 < n_components < min(X.shape) randomized : run randomized SVD by the method of Halko et al. @@ -210,7 +214,7 @@ class PCA(_BasePCA): Percentage of variance explained by each of the selected components. If ``n_components`` is not set then all components are stored and the - sum of explained variances is equal to 1.0. + sum of the ratios is equal to 1.0. singular_values_ : array, shape (n_components,) The singular values corresponding to each of the selected components. @@ -226,7 +230,8 @@ class PCA(_BasePCA): The estimated number of components. When n_components is set to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this number is estimated from input data. Otherwise it equals the parameter - n_components, or n_features if n_components is None. + n_components, or the lesser value of n_features and n_samples + if n_components is None. noise_variance_ : float The estimated noise covariance following the Probabilistic PCA model @@ -319,6 +324,8 @@ def fit(self, X, y=None): Training data, where n_samples in the number of samples and n_features is the number of features. + y : Ignored + Returns ------- self : object @@ -336,6 +343,8 @@ def fit_transform(self, X, y=None): Training data, where n_samples is the number of samples and n_features is the number of features. 
+ y : Ignored + Returns ------- X_new : array-like, shape (n_samples, n_components) @@ -367,7 +376,10 @@ def _fit(self, X): # Handle n_components==None if self.n_components is None: - n_components = X.shape[1] + if self.svd_solver != 'arpack': + n_components = min(X.shape) + else: + n_components = min(X.shape) - 1 else: n_components = self.n_components @@ -400,10 +412,11 @@ def _fit_full(self, X, n_components): if n_samples < n_features: raise ValueError("n_components='mle' is only supported " "if n_samples >= n_features") - elif not 0 <= n_components <= n_features: + elif not 0 <= n_components <= min(n_samples, n_features): raise ValueError("n_components=%r must be between 0 and " - "n_features=%r with svd_solver='full'" - % (n_components, n_features)) + "min(n_samples, n_features)=%r with " + "svd_solver='full'" + % (n_components, min(n_samples, n_features))) # Center data self.mean_ = np.mean(X, axis=0) @@ -458,14 +471,19 @@ def _fit_truncated(self, X, n_components, svd_solver): raise ValueError("n_components=%r cannot be a string " "with svd_solver='%s'" % (n_components, svd_solver)) - elif not 1 <= n_components <= n_features: + elif not 1 <= n_components <= min(n_samples, n_features): raise ValueError("n_components=%r must be between 1 and " - "n_features=%r with svd_solver='%s'" - % (n_components, n_features, svd_solver)) - elif svd_solver == 'arpack' and n_components == n_features: - raise ValueError("n_components=%r must be stricly less than " - "n_features=%r with svd_solver='%s'" - % (n_components, n_features, svd_solver)) + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), + svd_solver)) + elif svd_solver == 'arpack' and n_components == min(n_samples, + n_features): + raise ValueError("n_components=%r must be strictly less than " + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), + svd_solver)) random_state = check_random_state(self.random_state) @@ -500,6 +518,7 @@ def _fit_truncated(self, X, n_components, svd_solver): self.explained_variance_ratio_ = \ self.explained_variance_ / total_var.sum() self.singular_values_ = S.copy() # Store the singular values. + if self.n_components_ < min(n_features, n_samples): self.noise_variance_ = (total_var.sum() - self.explained_variance_.sum()) @@ -531,7 +550,6 @@ def score_samples(self, X): X = check_array(X) Xr = X - self.mean_ n_features = X.shape[1] - log_like = np.zeros(X.shape[0]) precision = self.get_precision() log_like = -.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1) log_like -= .5 * (n_features * log(2. * np.pi) - @@ -550,6 +568,8 @@ def score(self, X, y=None): X : array, shape(n_samples, n_features) The data. + y : Ignored + Returns ------- ll : float @@ -676,6 +696,8 @@ def fit(self, X, y=None): Training data, where n_samples in the number of samples and n_features is the number of features. + y : Ignored + Returns ------- self : object @@ -762,6 +784,8 @@ def fit_transform(self, X, y=None): New data, where n_samples in the number of samples and n_features is the number of features. + y : Ignored + Returns ------- X_new : array-like, shape (n_samples, n_components) diff --git a/sklearn/decomposition/sparse_pca.py b/sklearn/decomposition/sparse_pca.py index 47c03a80278b9..68db09b5d277c 100644 --- a/sklearn/decomposition/sparse_pca.py +++ b/sklearn/decomposition/sparse_pca.py @@ -107,6 +107,8 @@ def fit(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. 
+ y : Ignored + Returns ------- self : object @@ -275,6 +277,8 @@ def fit(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y : Ignored + Returns ------- self : object diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 6795013b0790a..aa67189407296 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -8,6 +8,7 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_no_warnings from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import ignore_warnings @@ -349,11 +350,58 @@ def test_pca_inverse(): def test_pca_validation(): - X = [[0, 1], [1, 0]] + # Ensures that solver-specific extreme inputs for the n_components + # parameter raise errors + X = np.array([[0, 1, 0], [1, 0, 0]]) + smallest_d = 2 # The smallest dimension + lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0} + for solver in solver_list: - for n_components in [-1, 3]: - assert_raises(ValueError, - PCA(n_components, svd_solver=solver).fit, X) + # We conduct the same test on X.T so that it is invariant to axis. + for data in [X, X.T]: + for n_components in [-1, 3]: + + if solver == 'auto': + solver_reported = 'full' + else: + solver_reported = solver + + assert_raises_regex(ValueError, + "n_components={}L? must be between " + "{}L? and min\(n_samples, n_features\)=" + "{}L? with svd_solver=\'{}\'" + .format(n_components, + lower_limit[solver], + smallest_d, + solver_reported), + PCA(n_components, + svd_solver=solver).fit, data) + if solver == 'arpack': + + n_components = smallest_d + + assert_raises_regex(ValueError, + "n_components={}L? must be " + "strictly less than " + "min\(n_samples, n_features\)={}L?" + " with svd_solver=\'arpack\'" + .format(n_components, smallest_d), + PCA(n_components, svd_solver=solver) + .fit, data) + + +def test_n_components_none(): + # Ensures that n_components == None is handled correctly + X = iris.data + # We conduct the same test on X.T so that it is invariant to axis. + for data in [X, X.T]: + for solver in solver_list: + pca = PCA(svd_solver=solver) + pca.fit(data) + if solver == 'arpack': + assert_equal(pca.n_components_, min(data.shape) - 1) + else: + assert_equal(pca.n_components_, min(data.shape)) def test_randomized_pca_check_projection(): diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index 87b8b45e1543a..028304672e4da 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -132,6 +132,8 @@ def fit(self, X, y=None): X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. + y : Ignored + Returns ------- self : object @@ -148,6 +150,8 @@ def fit_transform(self, X, y=None): X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. + y : Ignored + Returns ------- X_new : array, shape (n_samples, n_components) diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index a72f25a5f7b9b..854f728c5638a 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -153,7 +153,7 @@ class ZeroEstimator(object): """An estimator that simply predicts zero. 
""" def fit(self, X, y, sample_weight=None): - if np.issubdtype(y.dtype, int): + if np.issubdtype(y.dtype, np.signedinteger): # classification self.n_classes = np.unique(y).shape[0] if self.n_classes == 2: diff --git a/sklearn/ensemble/tests/test_voting_classifier.py b/sklearn/ensemble/tests/test_voting_classifier.py index 023be79912d12..22665384ed7ce 100644 --- a/sklearn/ensemble/tests/test_voting_classifier.py +++ b/sklearn/ensemble/tests/test_voting_classifier.py @@ -296,7 +296,14 @@ def test_set_params(): clf3 = GaussianNB() eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft', weights=[1, 2]) + assert_true('lr' in eclf1.named_estimators) + assert_true(eclf1.named_estimators.lr is eclf1.estimators[0][1]) + assert_true(eclf1.named_estimators.lr is eclf1.named_estimators['lr']) eclf1.fit(X, y) + assert_true('lr' in eclf1.named_estimators_) + assert_true(eclf1.named_estimators_.lr is eclf1.estimators_[0]) + assert_true(eclf1.named_estimators_.lr is eclf1.named_estimators_['lr']) + eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft', weights=[1, 2]) eclf2.set_params(nb=clf2).fit(X, y) diff --git a/sklearn/ensemble/voting_classifier.py b/sklearn/ensemble/voting_classifier.py index ad6c0125dd664..26bc8e66df01a 100644 --- a/sklearn/ensemble/voting_classifier.py +++ b/sklearn/ensemble/voting_classifier.py @@ -21,6 +21,7 @@ from ..externals.joblib import Parallel, delayed from ..utils.validation import has_fit_parameter, check_is_fitted from ..utils.metaestimators import _BaseComposition +from ..utils import Bunch def _parallel_fit_estimator(estimator, X, y, sample_weight=None): @@ -75,6 +76,11 @@ class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin): The collection of fitted sub-estimators as defined in ``estimators`` that are not `None`. + named_estimators_ : Bunch object, a dictionary with attribute access + Attribute to access any fitted sub-estimators by name. + + .. versionadded:: 0.20 + classes_ : array-like, shape = [n_predictions] The classes labels. @@ -94,6 +100,9 @@ class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin): >>> eclf1 = eclf1.fit(X, y) >>> print(eclf1.predict(X)) [1 1 1 2 2 2] + >>> np.array_equal(eclf1.named_estimators_.lr.predict(X), + ... eclf1.named_estimators_['lr'].predict(X)) + True >>> eclf2 = VotingClassifier(estimators=[ ... ('lr', clf1), ('rf', clf2), ('gnb', clf3)], ... voting='soft') @@ -122,7 +131,7 @@ def __init__(self, estimators, voting='hard', weights=None, n_jobs=1, @property def named_estimators(self): - return dict(self.estimators) + return Bunch(**dict(self.estimators)) def fit(self, X, y, sample_weight=None): """ Fit the estimators. 
@@ -188,6 +197,9 @@ def fit(self, X, y, sample_weight=None): sample_weight=sample_weight) for clf in clfs if clf is not None) + self.named_estimators_ = Bunch(**dict()) + for k, e in zip(self.estimators, self.estimators_): + self.named_estimators_[k[0]] = e return self @property diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index d258625897e27..6f0d6b0214953 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -112,23 +112,19 @@ def test_hasher_zeros(): @ignore_warnings(category=DeprecationWarning) def test_hasher_alternate_sign(): - # the last two tokens produce a hash collision that sums as 0 - X = [["foo", "bar", "baz", "investigation need", "records"]] + X = [list("Thequickbrownfoxjumped")] Xt = FeatureHasher(alternate_sign=True, non_negative=False, input_type='string').fit_transform(X) - assert_true(Xt.data.min() < 0 and Xt.data.max() > 0) - # check that we have a collision that produces a 0 count - assert_true(len(Xt.data) < len(X[0])) - assert_true((Xt.data == 0.).any()) + assert Xt.data.min() < 0 and Xt.data.max() > 0 Xt = FeatureHasher(alternate_sign=True, non_negative=True, input_type='string').fit_transform(X) - assert_true((Xt.data >= 0).all()) # all counts are positive - assert_true((Xt.data == 0.).any()) # we still have a collision + assert Xt.data.min() > 0 + Xt = FeatureHasher(alternate_sign=False, non_negative=True, input_type='string').fit_transform(X) - assert_true((Xt.data > 0).all()) # strictly positive counts + assert Xt.data.min() > 0 Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False, input_type='string').fit_transform(X) # With initially positive features, the non_negative option should @@ -136,6 +132,25 @@ def test_hasher_alternate_sign(): assert_array_equal(Xt.data, Xt_2.data) +@ignore_warnings(category=DeprecationWarning) +def test_hash_collisions(): + X = [list("Thequickbrownfoxjumped")] + + Xt = FeatureHasher(alternate_sign=True, non_negative=False, + n_features=1, input_type='string').fit_transform(X) + # check that some of the hashed tokens are added + # with an opposite sign and cancel out + assert abs(Xt.data[0]) < len(X[0]) + + Xt = FeatureHasher(alternate_sign=True, non_negative=True, + n_features=1, input_type='string').fit_transform(X) + assert abs(Xt.data[0]) < len(X[0]) + + Xt = FeatureHasher(alternate_sign=False, non_negative=True, + n_features=1, input_type='string').fit_transform(X) + assert Xt.data[0] == len(X[0]) + + @ignore_warnings(category=DeprecationWarning) def test_hasher_negative(): X = [{"foo": 2, "bar": -4, "baz": -1}.items()] diff --git a/sklearn/feature_extraction/tests/test_image.py b/sklearn/feature_extraction/tests/test_image.py index 276835c10caf1..5e1b53040f438 100644 --- a/sklearn/feature_extraction/tests/test_image.py +++ b/sklearn/feature_extraction/tests/test_image.py @@ -7,12 +7,10 @@ from scipy import ndimage from scipy.sparse.csgraph import connected_components -from numpy.testing import assert_raises - from sklearn.feature_extraction.image import ( img_to_graph, grid_to_graph, extract_patches_2d, reconstruct_from_patches_2d, PatchExtractor, extract_patches) -from sklearn.utils.testing import assert_equal, assert_true +from sklearn.utils.testing import assert_equal, assert_true, assert_raises def test_img_to_graph(): diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 
9e613b1bca8c1..ff13cd6e00179 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -23,13 +23,12 @@
 import numpy as np
 from numpy.testing import assert_array_almost_equal
 from numpy.testing import assert_array_equal
-from numpy.testing import assert_raises
 from sklearn.utils.testing import (assert_equal, assert_false, assert_true,
                                    assert_not_equal, assert_almost_equal,
                                    assert_in, assert_less, assert_greater,
                                    assert_warns_message, assert_raise_message,
                                    clean_warning_registry, ignore_warnings,
-                                   SkipTest)
+                                   SkipTest, assert_raises)
 
 from collections import defaultdict, Mapping
 from functools import partial
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index fa7306ab9def5..417aeef2f8bc2 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1086,7 +1086,7 @@ def transform(self, X, copy=True):
         -------
         vectors : sparse matrix, [n_samples, n_features]
         """
-        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
+        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.floating):
             # preserve float family dtype
             X = sp.csr_matrix(X, copy=copy)
         else:
diff --git a/sklearn/feature_selection/variance_threshold.py b/sklearn/feature_selection/variance_threshold.py
index c9e018d94a84e..13e1aa7078310 100644
--- a/sklearn/feature_selection/variance_threshold.py
+++ b/sklearn/feature_selection/variance_threshold.py
@@ -54,7 +54,7 @@ def fit(self, X, y=None):
             Sample vectors from which to compute variances.
 
         y : any
-            Ignored. This parameter exists only for compatibility with
+            Ignored. This parameter exists only for compatibility with
             sklearn.pipeline.Pipeline.
 
         Returns
diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py
index 53c519e5d5ac8..5bc89d28df6b6 100644
--- a/sklearn/gaussian_process/gaussian_process.py
+++ b/sklearn/gaussian_process/gaussian_process.py
@@ -444,11 +444,6 @@ def predict(self, X, eval_MSE=False, batch_size=None):
         # Normalize input
         X = (X - self.X_mean) / self.X_std
 
-        # Initialize output
-        y = np.zeros(n_eval)
-        if eval_MSE:
-            MSE = np.zeros(n_eval)
-
         # Get pairwise componentwise L1-distances to the input training set
         dx = manhattan_distances(X, Y=self.X, sum_over_features=False)
         # Get regression function and correlation
diff --git a/sklearn/gaussian_process/gpr.py b/sklearn/gaussian_process/gpr.py
index 4f9ff9cee7911..c92ca7f68f368 100644
--- a/sklearn/gaussian_process/gpr.py
+++ b/sklearn/gaussian_process/gpr.py
@@ -245,6 +245,8 @@ def obj_func(theta, eval_gradient=True):
         K[np.diag_indices_from(K)] += self.alpha
         try:
             self.L_ = cholesky(K, lower=True)  # Line 2
+            # self.L_ changed, self._K_inv needs to be recomputed
+            self._K_inv = None
         except np.linalg.LinAlgError as exc:
             exc.args = ("The kernel, %s, is not returning a "
                         "positive definite matrix.
Try gradually " @@ -320,13 +322,18 @@ def predict(self, X, return_std=False, return_cov=False): y_cov = self.kernel_(X) - K_trans.dot(v) # Line 6 return y_mean, y_cov elif return_std: - # compute inverse K_inv of K based on its Cholesky - # decomposition L and its inverse L_inv - L_inv = solve_triangular(self.L_.T, np.eye(self.L_.shape[0])) - K_inv = L_inv.dot(L_inv.T) + # cache result of K_inv computation + if self._K_inv is None: + # compute inverse K_inv of K based on its Cholesky + # decomposition L and its inverse L_inv + L_inv = solve_triangular(self.L_.T, + np.eye(self.L_.shape[0])) + self._K_inv = L_inv.dot(L_inv.T) + # Compute variance of predictive distribution y_var = self.kernel_.diag(X) - y_var -= np.einsum("ij,ij->i", np.dot(K_trans, K_inv), K_trans) + y_var -= np.einsum("ij,ij->i", + np.dot(K_trans, self._K_inv), K_trans) # Check if any of the variances is negative because of # numerical issues. If yes: set the variance to 0. diff --git a/sklearn/gaussian_process/tests/test_gaussian_process.py b/sklearn/gaussian_process/tests/test_gaussian_process.py index 860e3f290f3ea..37d872fc99fb5 100644 --- a/sklearn/gaussian_process/tests/test_gaussian_process.py +++ b/sklearn/gaussian_process/tests/test_gaussian_process.py @@ -11,7 +11,7 @@ from sklearn.gaussian_process import regression_models as regression from sklearn.gaussian_process import correlation_models as correlation from sklearn.datasets import make_regression -from sklearn.utils.testing import assert_greater, assert_true, raises +from sklearn.utils.testing import assert_greater, assert_true, assert_raises f = lambda x: x * np.sin(x) @@ -95,10 +95,9 @@ def test_2d_2d(regr=regression.constant, corr=correlation.squared_exponential, assert_true(np.allclose(y_pred, y) and np.allclose(MSE, 0.)) -@raises(ValueError) def test_wrong_number_of_outputs(): gp = GaussianProcess() - gp.fit([[1, 2, 3], [4, 5, 6]], [1, 2, 3]) + assert_raises(ValueError, gp.fit, [[1, 2, 3], [4, 5, 6]], [1, 2, 3]) def test_more_builtin_correlation_models(random_start=1): diff --git a/sklearn/gaussian_process/tests/test_gpr.py b/sklearn/gaussian_process/tests/test_gpr.py index b645a6be18e22..602b2b88ae9c9 100644 --- a/sklearn/gaussian_process/tests/test_gpr.py +++ b/sklearn/gaussian_process/tests/test_gpr.py @@ -15,11 +15,13 @@ from sklearn.utils.testing \ import (assert_true, assert_greater, assert_array_less, assert_almost_equal, assert_equal, assert_raise_message, - assert_array_almost_equal) + assert_array_almost_equal, assert_array_equal) def f(x): return x * np.sin(x) + + X = np.atleast_2d([1., 3., 5., 6., 7., 8.]).T X2 = np.atleast_2d([2., 4., 5.5, 6.5, 7.5]).T y = f(X).ravel() @@ -344,3 +346,21 @@ def test_no_fit_default_predict(): assert_array_almost_equal(y_std1, y_std2) assert_array_almost_equal(y_cov1, y_cov2) + + +def test_K_inv_reset(): + y2 = f(X2).ravel() + for kernel in kernels: + # Test that self._K_inv is reset after a new fit + gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y) + assert_true(hasattr(gpr, '_K_inv')) + assert_true(gpr._K_inv is None) + gpr.predict(X, return_std=True) + assert_true(gpr._K_inv is not None) + gpr.fit(X2, y2) + assert_true(gpr._K_inv is None) + gpr.predict(X2, return_std=True) + gpr2 = GaussianProcessRegressor(kernel=kernel).fit(X2, y2) + gpr2.predict(X2, return_std=True) + # the value of K_inv should be independent of the first fit + assert_array_equal(gpr._K_inv, gpr2._K_inv) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index cfe1aba4ea178..5571138d68d83 100644 --- 
a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -206,7 +206,7 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): n_ticks = train_sizes_abs.shape[0] n_min_required_samples = np.min(train_sizes_abs) n_max_required_samples = np.max(train_sizes_abs) - if np.issubdtype(train_sizes_abs.dtype, np.float): + if np.issubdtype(train_sizes_abs.dtype, np.floating): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: raise ValueError("train_sizes has been interpreted as fractions " "of the maximum number of training samples and " diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 17b988b08e6c7..bb7c12ab601a2 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -414,8 +414,6 @@ def lars_path(X, y, Xy=None, Gram=None, max_iter=500, alphas[-add_features:] = 0 coef = coefs[n_iter] prev_coef = coefs[n_iter - 1] - alpha = alphas[n_iter, np.newaxis] - prev_alpha = alphas[n_iter - 1, np.newaxis] else: # mimic the effect of incrementing n_iter on the array references prev_coef = coef diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 94eb3ea3d2dcb..ea4300df01100 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -17,7 +17,6 @@ from sklearn.utils.testing import assert_warns from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import raises from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model.logistic import ( @@ -249,13 +248,13 @@ def test_write_parameters(): assert_array_almost_equal(clf.decision_function(X), 0) -@raises(ValueError) def test_nan(): # Test proper NaN handling. # Regression test for Issue #252: fit used to go into an infinite loop. 
Xnan = np.array(X, dtype=np.float64) Xnan[0, 1] = np.nan - LogisticRegression(random_state=0).fit(Xnan, Y1) + logistic = LogisticRegression(random_state=0) + assert_raises(ValueError, logistic.fit, Xnan, Y1) def test_consistency_path(): diff --git a/sklearn/linear_model/tests/test_ransac.py b/sklearn/linear_model/tests/test_ransac.py index 7146ed1a129b2..6f8e716f9ad19 100644 --- a/sklearn/linear_model/tests/test_ransac.py +++ b/sklearn/linear_model/tests/test_ransac.py @@ -1,7 +1,7 @@ import numpy as np from scipy import sparse -from numpy.testing import assert_equal, assert_raises +from numpy.testing import assert_equal from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal @@ -10,6 +10,7 @@ from sklearn.utils.testing import assert_warns from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_raises_regexp +from sklearn.utils.testing import assert_raises from sklearn.linear_model import LinearRegression, RANSACRegressor, Lasso from sklearn.linear_model.ransac import _dynamic_max_trials diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index f033a4f6021b2..d4552a9934cf1 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -9,7 +9,6 @@ from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_less -from sklearn.utils.testing import raises from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false, assert_true from sklearn.utils.testing import assert_equal @@ -266,11 +265,11 @@ def test_late_onset_averaging_reached(self): decimal=16) assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) - @raises(ValueError) def test_sgd_bad_alpha_for_optimal_learning_rate(self): # Check whether expected ValueError on bad alpha, i.e. 
0 # since alpha is used to compute the optimal learning rate - self.factory(alpha=0, learning_rate="optimal") + assert_raises(ValueError, self.factory, + alpha=0, learning_rate="optimal") class DenseSGDClassifierTestCase(unittest.TestCase, CommonTest): @@ -287,63 +286,56 @@ def test_sgd(self): # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) assert_array_equal(clf.predict(T), true_result) - @raises(ValueError) def test_sgd_bad_l1_ratio(self): # Check whether expected ValueError on bad l1_ratio - self.factory(l1_ratio=1.1) + assert_raises(ValueError, self.factory, l1_ratio=1.1) - @raises(ValueError) def test_sgd_bad_learning_rate_schedule(self): # Check whether expected ValueError on bad learning_rate - self.factory(learning_rate="") + assert_raises(ValueError, self.factory, learning_rate="") - @raises(ValueError) def test_sgd_bad_eta0(self): # Check whether expected ValueError on bad eta0 - self.factory(eta0=0, learning_rate="constant") + assert_raises(ValueError, self.factory, eta0=0, + learning_rate="constant") - @raises(ValueError) def test_sgd_bad_alpha(self): # Check whether expected ValueError on bad alpha - self.factory(alpha=-.1) + assert_raises(ValueError, self.factory, alpha=-.1) - @raises(ValueError) def test_sgd_bad_penalty(self): # Check whether expected ValueError on bad penalty - self.factory(penalty='foobar', l1_ratio=0.85) + assert_raises(ValueError, self.factory, penalty='foobar', + l1_ratio=0.85) - @raises(ValueError) def test_sgd_bad_loss(self): # Check whether expected ValueError on bad loss - self.factory(loss="foobar") + assert_raises(ValueError, self.factory, loss="foobar") - @raises(ValueError) def test_sgd_max_iter_param(self): # Test parameter validity check - self.factory(max_iter=-10000) + assert_raises(ValueError, self.factory, max_iter=-10000) - @raises(ValueError) def test_sgd_shuffle_param(self): # Test parameter validity check - self.factory(shuffle="false") + assert_raises(ValueError, self.factory, shuffle="false") - @raises(TypeError) def test_argument_coef(self): # Checks coef_init not allowed as model argument (only fit) - # Provided coef_ does not match dataset. - self.factory(coef_init=np.zeros((3,))).fit(X, Y) + # Provided coef_ does not match dataset + assert_raises(TypeError, self.factory, coef_init=np.zeros((3,))) - @raises(ValueError) def test_provide_coef(self): # Checks coef_init shape for the warm starts # Provided coef_ does not match dataset. - self.factory().fit(X, Y, coef_init=np.zeros((3,))) + assert_raises(ValueError, self.factory().fit, + X, Y, coef_init=np.zeros((3,))) - @raises(ValueError) def test_set_intercept(self): # Checks intercept_ shape for the warm starts # Provided intercept_ does not match dataset. 
- self.factory().fit(X, Y, intercept_init=np.zeros((3,))) + assert_raises(ValueError, self.factory().fit, + X, Y, intercept_init=np.zeros((3,))) def test_set_intercept_binary(self): # Checks intercept_ shape for the warm starts in binary case @@ -386,10 +378,10 @@ def test_set_intercept_to_intercept(self): clf = self.factory().fit(X, Y) self.factory().fit(X, Y, intercept_init=clf.intercept_) - @raises(ValueError) def test_sgd_at_least_two_labels(self): # Target must have at least two labels - self.factory(alpha=0.01, max_iter=20).fit(X2, np.ones(9)) + clf = self.factory(alpha=0.01, max_iter=20) + assert_raises(ValueError, clf.fit, X2, np.ones(9)) def test_partial_fit_weight_class_balanced(self): # partial_fit with class_weight='balanced' not supported""" @@ -607,17 +599,15 @@ def test_equal_class_weight(self): # should be similar up to some epsilon due to learning rate schedule assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) - @raises(ValueError) def test_wrong_class_weight_label(self): # ValueError due to not existing class label. clf = self.factory(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) - clf.fit(X, Y) + assert_raises(ValueError, clf.fit, X, Y) - @raises(ValueError) def test_wrong_class_weight_format(self): # ValueError due to wrong class_weight argument type. clf = self.factory(alpha=0.1, max_iter=1000, class_weight=[0.5]) - clf.fit(X, Y) + assert_raises(ValueError, clf.fit, X, Y) def test_weights_multiplied(self): # Tests that class_weight and sample_weight are multiplicative @@ -700,18 +690,16 @@ def test_sample_weights(self): # the prediction on this point should shift assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) - @raises(ValueError) def test_wrong_sample_weights(self): # Test if ValueError is raised if sample_weight has wrong shape clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False) # provided sample_weight too long - clf.fit(X, Y, sample_weight=np.arange(7)) + assert_raises(ValueError, clf.fit, X, Y, sample_weight=np.arange(7)) - @raises(ValueError) def test_partial_fit_exception(self): clf = self.factory(alpha=0.01) # classes was not specified - clf.partial_fit(X3, Y3) + assert_raises(ValueError, clf.partial_fit, X3, Y3) def test_partial_fit_binary(self): third = X.shape[0] // 3 @@ -851,15 +839,14 @@ def test_sgd(self): clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) assert_equal(clf.coef_[0], clf.coef_[1]) - @raises(ValueError) def test_sgd_bad_penalty(self): # Check whether expected ValueError on bad penalty - self.factory(penalty='foobar', l1_ratio=0.85) + assert_raises(ValueError, self.factory, + penalty='foobar', l1_ratio=0.85) - @raises(ValueError) def test_sgd_bad_loss(self): # Check whether expected ValueError on bad loss - self.factory(loss="foobar") + assert_raises(ValueError, self.factory, loss="foobar") def test_sgd_averaged_computed_correctly(self): # Tests the average regressor matches the naive implementation diff --git a/sklearn/linear_model/tests/test_theil_sen.py b/sklearn/linear_model/tests/test_theil_sen.py index 279beb8014e95..3a2b1f9dc006f 100644 --- a/sklearn/linear_model/tests/test_theil_sen.py +++ b/sklearn/linear_model/tests/test_theil_sen.py @@ -20,7 +20,7 @@ from sklearn.linear_model.theil_sen import _spatial_median, _breakdown_point from sklearn.linear_model.theil_sen import _modified_weiszfeld_step from sklearn.utils.testing import ( - assert_almost_equal, assert_greater, assert_less, raises, + assert_almost_equal, assert_greater, assert_less, assert_raises, ) @@ -202,31 +202,31 @@ def 
test_calc_breakdown_point(): assert_less(np.abs(bp - 1 + 1 / (np.sqrt(2))), 1.e-6) -@raises(ValueError) def test_checksubparams_negative_subpopulation(): X, y, w, c = gen_toy_problem_1d() - TheilSenRegressor(max_subpopulation=-1, random_state=0).fit(X, y) + theil_sen = TheilSenRegressor(max_subpopulation=-1, random_state=0) + assert_raises(ValueError, theil_sen.fit, X, y) -@raises(ValueError) def test_checksubparams_too_few_subsamples(): X, y, w, c = gen_toy_problem_1d() - TheilSenRegressor(n_subsamples=1, random_state=0).fit(X, y) + theil_sen = TheilSenRegressor(n_subsamples=1, random_state=0) + assert_raises(ValueError, theil_sen.fit, X, y) -@raises(ValueError) def test_checksubparams_too_many_subsamples(): X, y, w, c = gen_toy_problem_1d() - TheilSenRegressor(n_subsamples=101, random_state=0).fit(X, y) + theil_sen = TheilSenRegressor(n_subsamples=101, random_state=0) + assert_raises(ValueError, theil_sen.fit, X, y) -@raises(ValueError) def test_checksubparams_n_subsamples_if_less_samples_than_features(): random_state = np.random.RandomState(0) n_samples, n_features = 10, 20 X = random_state.normal(size=(n_samples, n_features)) y = random_state.normal(size=n_samples) - TheilSenRegressor(n_subsamples=9, random_state=0).fit(X, y) + theil_sen = TheilSenRegressor(n_subsamples=9, random_state=0) + assert_raises(ValueError, theil_sen.fit, X, y) def test_subpopulation(): diff --git a/sklearn/manifold/isomap.py b/sklearn/manifold/isomap.py index 1f6d0ae0dc0b1..f649237448d32 100644 --- a/sklearn/manifold/isomap.py +++ b/sklearn/manifold/isomap.py @@ -157,6 +157,8 @@ def fit(self, X, y=None): numpy array, precomputed tree, or NearestNeighbors object. + y: Ignored + Returns ------- self : returns an instance of self. @@ -173,6 +175,8 @@ def fit_transform(self, X, y=None): Training vector, where n_samples in the number of samples and n_features is the number of features. + y: Ignored + Returns ------- X_new : array-like, shape (n_samples, n_components) diff --git a/sklearn/manifold/locally_linear.py b/sklearn/manifold/locally_linear.py index e8705cff359a6..8151658fe97cc 100644 --- a/sklearn/manifold/locally_linear.py +++ b/sklearn/manifold/locally_linear.py @@ -652,6 +652,8 @@ def fit(self, X, y=None): X : array-like of shape [n_samples, n_features] training set. + y: Ignored + Returns ------- self : returns an instance of self. @@ -667,6 +669,8 @@ def fit_transform(self, X, y=None): X : array-like of shape [n_samples, n_features] training set. + y: Ignored + Returns ------- X_new : array-like, shape (n_samples, n_components) diff --git a/sklearn/manifold/mds.py b/sklearn/manifold/mds.py index 5f7327ef4dc84..3890c4e40bffb 100644 --- a/sklearn/manifold/mds.py +++ b/sklearn/manifold/mds.py @@ -379,6 +379,8 @@ def fit(self, X, y=None, init=None): Input data. If ``dissimilarity=='precomputed'``, the input should be the dissimilarity matrix. + y: Ignored + init : ndarray, shape (n_samples,), optional, default: None Starting configuration of the embedding to initialize the SMACOF algorithm. By default, the algorithm is initialized with a randomly @@ -397,6 +399,8 @@ def fit_transform(self, X, y=None, init=None): Input data. If ``dissimilarity=='precomputed'``, the input should be the dissimilarity matrix. + y: Ignored + init : ndarray, shape (n_samples,), optional, default: None Starting configuration of the embedding to initialize the SMACOF algorithm. 
By default, the algorithm is initialized with a randomly
diff --git a/sklearn/manifold/spectral_embedding_.py b/sklearn/manifold/spectral_embedding_.py
index a330b7da7f856..4ae588d1ae6c0 100644
--- a/sklearn/manifold/spectral_embedding_.py
+++ b/sklearn/manifold/spectral_embedding_.py
@@ -428,6 +428,8 @@ def _get_affinity_matrix(self, X, Y=None):
             Interpret X as precomputed adjacency graph computed from
             samples.
 
+        Y : Ignored
+
         Returns
         -------
         affinity_matrix, shape (n_samples, n_samples)
@@ -474,6 +476,8 @@ def fit(self, X, y=None):
             Interpret X as precomputed adjacency graph computed from
             samples.
 
+        y : Ignored
+
         Returns
         -------
         self : object
@@ -514,6 +518,8 @@ def fit_transform(self, X, y=None):
             Interpret X as precomputed adjacency graph computed from
             samples.
 
+        y : Ignored
+
         Returns
         -------
         X_new : array-like, shape (n_samples, n_components)
diff --git a/sklearn/manifold/t_sne.py b/sklearn/manifold/t_sne.py
index 163e8340f7b29..f7dba6dbdd78f 100644
--- a/sklearn/manifold/t_sne.py
+++ b/sklearn/manifold/t_sne.py
@@ -851,6 +851,8 @@ def fit_transform(self, X, y=None):
             If the metric is 'precomputed' X must be a square distance
             matrix. Otherwise it contains a sample per row.
 
+        y : Ignored
+
         Returns
         -------
         X_new : array, shape (n_samples, n_components)
@@ -870,6 +872,8 @@ def fit(self, X, y=None):
             matrix. Otherwise it contains a sample per row. If the method
             is 'exact', X may be a sparse matrix of type 'csr', 'csc'
             or 'coo'.
+
+        y : Ignored
         """
         self.fit_transform(X)
         return self
diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py
index 2311b48ee2eae..907f476355069 100644
--- a/sklearn/manifold/tests/test_t_sne.py
+++ b/sklearn/manifold/tests/test_t_sne.py
@@ -244,7 +244,9 @@ def test_preserve_trustworthiness_approximately():
                         method=method)
             X_embedded = tsne.fit_transform(X)
             t = trustworthiness(X, X_embedded, n_neighbors=1)
-            assert_greater(t, 0.9)
+            assert_greater(t, 0.9, msg='Trustworthiness={:0.3f} < 0.9 '
+                                       'for method={} and '
+                                       'init={}'.format(t, method, init))
 
 
 def test_optimization_minimizes_kl_divergence():
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index 395725c00d7d9..3f169fe1b46de 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -167,7 +167,7 @@ def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
     2
 
     In the multilabel case with binary label indicators:
-    
+
     >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
     0.5
     """
@@ -528,9 +528,9 @@ def matthews_corrcoef(y_true, y_pred, sample_weight=None):
         y_pred = lb.transform(y_pred)
 
     C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)
-    t_sum = C.sum(axis=1)
-    p_sum = C.sum(axis=0)
-    n_correct = np.trace(C)
+    t_sum = C.sum(axis=1, dtype=np.float64)
+    p_sum = C.sum(axis=0, dtype=np.float64)
+    n_correct = np.trace(C, dtype=np.float64)
     n_samples = p_sum.sum()
     cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum)
     cov_ypyp = n_samples ** 2 - np.dot(p_sum, p_sum)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 4d6b87f701ea4..c259036807f7f 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -483,6 +483,41 @@ def test_matthews_corrcoef_multiclass():
     assert_almost_equal(mcc, 0.)
 
 
+def test_matthews_corrcoef_overflow():
+    # https://github.com/scikit-learn/scikit-learn/issues/9622
+    rng = np.random.RandomState(20170906)
+
+    def mcc_safe(y_true, y_pred):
+        conf_matrix = confusion_matrix(y_true, y_pred)
+        true_pos = conf_matrix[1, 1]
+        false_pos = conf_matrix[0, 1]
+        false_neg = conf_matrix[1, 0]
+        n_points = len(y_true)
+        pos_rate = (true_pos + false_neg) / n_points
+        activity = (true_pos + false_pos) / n_points
+        mcc_numerator = true_pos / n_points - pos_rate * activity
+        mcc_denominator = activity * pos_rate * (1 - activity) * (1 - pos_rate)
+        return mcc_numerator / np.sqrt(mcc_denominator)
+
+    def random_ys(n_points):  # binary
+        x_true = rng.random_sample(n_points)
+        x_pred = x_true + 0.2 * (rng.random_sample(n_points) - 0.5)
+        y_true = (x_true > 0.5)
+        y_pred = (x_pred > 0.5)
+        return y_true, y_pred
+
+    for n_points in [100, 10000, 1000000]:
+        arr = np.repeat([0., 1.], n_points)  # binary
+        assert_almost_equal(matthews_corrcoef(arr, arr), 1.0)
+        arr = np.repeat([0., 1., 2.], n_points)  # multiclass
+        assert_almost_equal(matthews_corrcoef(arr, arr), 1.0)
+
+        y_true, y_pred = random_ys(n_points)
+        assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0)
+        assert_almost_equal(matthews_corrcoef(y_true, y_pred),
+                            mcc_safe(y_true, y_pred))
+
+
 def test_precision_recall_f1_score_multiclass():
     # Test Precision Recall and F1 Score for multiclass classification task
     y_true, y_pred, _ = make_prediction(binary=False)
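The dtype=np.float64 sums in matthews_corrcoef above matter because cov_ytyp and cov_ypyp multiply quantities of order n_samples ** 2, which can wrap around silently in fixed-width integer arithmetic once counts reach the millions. A toy demonstration of the failure mode, independent of the sklearn code and using hypothetical class counts:

    import numpy as np

    counts = np.array([2 * 10**6, 10**6], dtype=np.int32)  # class counts
    # an integer dot product of order n**2 overflows int32 silently
    print(np.dot(counts, counts))             # wrapped, nonsense value
    print(np.dot(counts.astype(np.float64),
                 counts.astype(np.float64)))  # 5e12, as intended
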
diff --git a/sklearn/mixture/dpgmm.py b/sklearn/mixture/dpgmm.py
index c2fd42ab45842..ddc861b4c19f0 100644
--- a/sklearn/mixture/dpgmm.py
+++ b/sklearn/mixture/dpgmm.py
@@ -273,7 +273,6 @@ def score_samples(self, X):
         X = check_array(X)
         if X.ndim == 1:
             X = X[:, np.newaxis]
-        z = np.zeros((X.shape[0], self.n_components))
         sd = digamma(self.gamma_.T[1] + self.gamma_.T[2])
         dgamma1 = digamma(self.gamma_.T[1]) - sd
         dgamma2 = np.zeros(self.n_components)
@@ -844,7 +843,6 @@ def _bound_proportions(self, z):
         return logprior
 
     def _bound_concentration(self):
-        logprior = 0.
         logprior = gammaln(np.sum(self.gamma_)) - gammaln(self.n_components
                                                           * self.alpha_)
         logprior -= np.sum(gammaln(self.gamma_) - gammaln(self.alpha_))
diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py
index 2a2dce1fc18d1..137703adfcad4 100644
--- a/sklearn/mixture/tests/test_gmm.py
+++ b/sklearn/mixture/tests/test_gmm.py
@@ -9,14 +9,14 @@
 import sys
 
 import numpy as np
-from numpy.testing import (assert_array_equal, assert_array_almost_equal,
-                           assert_raises)
+from numpy.testing import assert_array_equal, assert_array_almost_equal
+
 from scipy import stats
 from sklearn import mixture
 from sklearn.datasets.samples_generator import make_spd_matrix
 from sklearn.utils.testing import (assert_true, assert_greater,
                                    assert_raise_message, assert_warns_message,
-                                   ignore_warnings)
+                                   ignore_warnings, assert_raises)
 from sklearn.metrics.cluster import adjusted_rand_score
 from sklearn.externals.six.moves import cStringIO as StringIO
diff --git a/sklearn/model_selection/_validation.py b/sklearn/model_selection/_validation.py
index 773f70fb7dba2..798f771534571 100644
--- a/sklearn/model_selection/_validation.py
+++ b/sklearn/model_selection/_validation.py
@@ -1000,6 +1000,7 @@ def learning_curve(estimator, X, y, groups=None,
         If None, the random number generator is the RandomState instance used
         by `np.random`. Used when ``shuffle`` == 'True'.
+ Returns ------- train_sizes_abs : array, shape = (n_unique_ticks,), dtype int Numbers of training examples that has been used to generate the @@ -1097,7 +1098,7 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): n_ticks = train_sizes_abs.shape[0] n_min_required_samples = np.min(train_sizes_abs) n_max_required_samples = np.max(train_sizes_abs) - if np.issubdtype(train_sizes_abs.dtype, np.float): + if np.issubdtype(train_sizes_abs.dtype, np.floating): if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: raise ValueError("train_sizes has been interpreted as fractions " "of the maximum number of training samples and " diff --git a/sklearn/neighbors/tests/test_approximate.py b/sklearn/neighbors/tests/test_approximate.py index f8b9b45640783..5863a0bd738db 100644 --- a/sklearn/neighbors/tests/test_approximate.py +++ b/sklearn/neighbors/tests/test_approximate.py @@ -46,7 +46,7 @@ def test_neighbors_accuracy_with_n_candidates(): for i, n_candidates in enumerate(n_candidates_values): lshf = ignore_warnings(LSHForest, category=DeprecationWarning)( - n_candidates=n_candidates) + n_candidates=n_candidates, random_state=0) ignore_warnings(lshf.fit)(X) for j in range(n_iter): query = X[rng.randint(0, n_samples)].reshape(1, -1) diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py index e50a2e6f07445..25fac197c3657 100644 --- a/sklearn/neighbors/tests/test_nearest_centroid.py +++ b/sklearn/neighbors/tests/test_nearest_centroid.py @@ -6,10 +6,10 @@ from scipy import sparse as sp from numpy.testing import assert_array_equal from numpy.testing import assert_equal -from numpy.testing import assert_raises from sklearn.neighbors import NearestCentroid from sklearn import datasets +from sklearn.utils.testing import assert_raises # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -57,9 +57,9 @@ def test_classification_toy(): def test_precomputed(): clf = NearestCentroid(metric='precomputed') - with assert_raises(ValueError) as context: + with assert_raises(ValueError): clf.fit(X, y) - assert_equal(ValueError, type(context.exception)) + def test_iris(): # Check consistency on dataset iris. 
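The np.float to np.floating swaps in this diff (here in _validation.py, and likewise in learning_curve.py and text.py above) are needed because np.float is just an alias for the Python builtin float, which under NumPy 1.14's stricter issubdtype semantics matches only float64, whereas np.floating is the abstract parent of every floating dtype. A quick check of the distinction:

    import numpy as np

    sizes = np.asarray([0.1, 0.5, 1.0], dtype=np.float32)
    assert np.issubdtype(sizes.dtype, np.floating)  # whole float family
    # the builtin float is treated as np.float64, so float32 fractions
    # would be missed by the old check:
    print(np.issubdtype(sizes.dtype, float))  # False on NumPy >= 1.14
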
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 66da9dffeb066..54d29651ac776 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -110,8 +110,7 @@ class Pipeline(_BaseComposition): # BaseEstimator interface def __init__(self, steps, memory=None): - # shallow copy of steps - self.steps = list(steps) + self.steps = steps self._validate_steps() self.memory = memory @@ -184,6 +183,8 @@ def _final_estimator(self): # Estimator interface def _fit(self, X, y=None, **fit_params): + # shallow copy of steps - this should really be steps_ + self.steps = list(self.steps) self._validate_steps() # Setup the memory memory = check_memory(self.memory) @@ -413,6 +414,7 @@ def transform(self): Xt : array-like, shape = [n_samples, n_transformed_features] """ # _final_estimator is None or has transform, otherwise attribute error + # XXX: Handling the None case means we can't use if_delegate_has_method if self._final_estimator is not None: self._final_estimator.transform return self._transform @@ -443,6 +445,7 @@ def inverse_transform(self): Xt : array-like, shape = [n_samples, n_features] """ # raise AttributeError if necessary for hasattr behaviour + # XXX: Handling the None case means we can't use if_delegate_has_method for name, transform in self.steps: if transform is not None: transform.inverse_transform @@ -613,7 +616,7 @@ class FeatureUnion(_BaseComposition, TransformerMixin): """ def __init__(self, transformer_list, n_jobs=1, transformer_weights=None): - self.transformer_list = list(transformer_list) + self.transformer_list = transformer_list self.n_jobs = n_jobs self.transformer_weights = transformer_weights self._validate_transformers() @@ -704,6 +707,7 @@ def fit(self, X, y=None): self : FeatureUnion This estimator """ + self.transformer_list = list(self.transformer_list) self._validate_transformers() transformers = Parallel(n_jobs=self.n_jobs)( delayed(_fit_one_transformer)(trans, X, y) diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 7c6642a504ad1..551451a47f5a6 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -88,10 +88,13 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin, random_state : int, RandomState instance or None, optional (default=None) The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + the data for the dual coordinate descent (if ``dual=True``). When + ``dual=False`` the underlying implementation of :class:`LinearSVC` + is not random and ``random_state`` has no effect on the results. If + int, random_state is the seed used by the random number generator; If + RandomState instance, random_state is the random number generator; If + None, the random number generator is the RandomState instance used by + `np.random`. max_iter : int, (default=1000) The maximum number of iterations to be run. @@ -509,11 +512,11 @@ class SVC(BaseSVC): Deprecated *decision_function_shape='ovo' and None*. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. 
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py
index 7c6642a504ad1..551451a47f5a6 100644
--- a/sklearn/svm/classes.py
+++ b/sklearn/svm/classes.py
@@ -88,10 +88,13 @@ class LinearSVC(BaseEstimator, LinearClassifierMixin,
 
     random_state : int, RandomState instance or None, optional (default=None)
         The seed of the pseudo random number generator to use when shuffling
-        the data. If int, random_state is the seed used by the random number
-        generator; If RandomState instance, random_state is the random number
-        generator; If None, the random number generator is the RandomState
-        instance used by `np.random`.
+        the data for the dual coordinate descent (if ``dual=True``). When
+        ``dual=False`` the underlying implementation of :class:`LinearSVC`
+        is not random and ``random_state`` has no effect on the results. If
+        int, random_state is the seed used by the random number generator; If
+        RandomState instance, random_state is the random number generator; If
+        None, the random number generator is the RandomState instance used by
+        `np.random`.
 
     max_iter : int, (default=1000)
         The maximum number of iterations to be run.
@@ -509,11 +512,11 @@ class SVC(BaseSVC):
         Deprecated *decision_function_shape='ovo' and None*.
 
     random_state : int, RandomState instance or None, optional (default=None)
-        The seed of the pseudo random number generator to use when shuffling
-        the data. If int, random_state is the seed used by the random number
-        generator; If RandomState instance, random_state is the random number
-        generator; If None, the random number generator is the RandomState
-        instance used by `np.random`.
+        The seed of the pseudo random number generator used when shuffling
+        the data for probability estimates. If int, random_state is the
+        seed used by the random number generator; If RandomState instance,
+        random_state is the random number generator; If None, the random
+        number generator is the RandomState instance used by `np.random`.
 
     Attributes
     ----------
@@ -665,11 +668,11 @@ class NuSVC(BaseSVC):
         Deprecated *decision_function_shape='ovo' and None*.
 
     random_state : int, RandomState instance or None, optional (default=None)
-        The seed of the pseudo random number generator to use when shuffling
-        the data. If int, random_state is the seed used by the random number
-        generator; If RandomState instance, random_state is the random number
-        generator; If None, the random number generator is the RandomState
-        instance used by `np.random`.
+        The seed of the pseudo random number generator used when shuffling
+        the data for probability estimates. If int, random_state is the seed
+        used by the random number generator; If RandomState instance,
+        random_state is the random number generator; If None, the random
+        number generator is the RandomState instance used by `np.random`.
 
     Attributes
     ----------
@@ -1019,11 +1022,11 @@ class OneClassSVM(BaseLibSVM):
         Hard limit on iterations within solver, or -1 for no limit.
 
     random_state : int, RandomState instance or None, optional (default=None)
-        The seed of the pseudo random number generator to use when shuffling
-        the data. If int, random_state is the seed used by the random number
-        generator; If RandomState instance, random_state is the random number
-        generator; If None, the random number generator is the RandomState
-        instance used by `np.random`.
+        Ignored.
+
+        .. deprecated:: 0.20
+           ``random_state`` has been deprecated in 0.20 and will be removed in
+           0.22.
 
     Attributes
     ----------
@@ -1080,6 +1083,11 @@ def fit(self, X, y=None, sample_weight=None, **params):
             If X is not a C-ordered contiguous array it is copied.
 
         """
+
+        if self.random_state is not None:
+            warnings.warn("The random_state parameter is deprecated and will"
+                          " be removed in version 0.22.", DeprecationWarning)
+
         super(OneClassSVM, self).fit(X, np.ones(_num_samples(X)),
                                      sample_weight=sample_weight, **params)
         return self
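The reworded docstrings pin down when the libsvm-based estimators actually consume `random_state`: only the internal shuffle behind `probability=True` (Platt scaling). A short illustration of the documented behavior:

```python
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)

# A plain SVC fit is deterministic; random_state is irrelevant here.
clf = SVC(kernel='linear').fit(X, y)

# probability=True triggers an internal shuffle for Platt scaling,
# which is the one place random_state is consumed.
clf_p = SVC(kernel='linear', probability=True, random_state=0).fit(X, y)
print(clf_p.predict_proba(X[:2]))
```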
""" + + if self.random_state is not None: + warnings.warn("The random_state parameter is deprecated and will" + " be removed in version 0.22.", DeprecationWarning) + super(OneClassSVM, self).fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight, **params) return self diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index 583c413bc5c11..e46dbb92df44a 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -5,7 +5,7 @@ from sklearn.svm import LinearSVC from sklearn.linear_model.logistic import LogisticRegression -from sklearn.utils.testing import assert_true, raises +from sklearn.utils.testing import assert_true, assert_raises from sklearn.utils.testing import assert_raise_message @@ -63,13 +63,11 @@ def check_l1_min_c(X, y, loss, fit_intercept=True, intercept_scaling=None): (np.asarray(clf.intercept_) != 0).any()) -@raises(ValueError) def test_ill_posed_min_c(): X = [[0, 0], [0, 0]] y = [0, 1] - l1_min_c(X, y) + assert_raises(ValueError, l1_min_c, X, y) -@raises(ValueError) def test_unsupported_loss(): - l1_min_c(dense_X, Y1, 'l1') + assert_raises(ValueError, l1_min_c, dense_X, Y1, 'l1') diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 948d5818b9b0e..7ad0f20382657 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -61,19 +61,6 @@ def __init__(self, a=np.array([0])): self.a = a.copy() -class DeprecatedAttributeEstimator(BaseEstimator): - def __init__(self, a=None, b=None): - self.a = a - if b is not None: - DeprecationWarning("b is deprecated and renamed 'a'") - self.a = b - - @property - @deprecated("Parameter 'b' is deprecated and renamed to 'a'") - def b(self): - return self._b - - class Buggy(BaseEstimator): " A buggy estimator that does not set its parameters right. 
" @@ -219,19 +206,6 @@ def test_get_params(): assert_raises(ValueError, test.set_params, a__a=2) -def test_get_params_deprecated(): - # deprecated attribute should not show up as params - est = DeprecatedAttributeEstimator(a=1) - - assert_true('a' in est.get_params()) - assert_true('a' in est.get_params(deep=True)) - assert_true('a' in est.get_params(deep=False)) - - assert_true('b' not in est.get_params()) - assert_true('b' not in est.get_params(deep=True)) - assert_true('b' not in est.get_params(deep=False)) - - def test_is_classifier(): svc = SVC() assert_true(is_classifier(svc)) diff --git a/sklearn/tests/test_isotonic.py b/sklearn/tests/test_isotonic.py index d5d0715a0fb7f..967acb2324f19 100644 --- a/sklearn/tests/test_isotonic.py +++ b/sklearn/tests/test_isotonic.py @@ -166,6 +166,30 @@ def test_isotonic_regression_ties_secondary_(): assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4) +def test_isotonic_regression_with_ties_in_differently_sized_groups(): + """ + Non-regression test to handle issue 9432: + https://github.com/scikit-learn/scikit-learn/issues/9432 + + Compare against output in R: + > library("isotone") + > x <- c(0, 1, 1, 2, 3, 4) + > y <- c(0, 0, 1, 0, 0, 1) + > res1 <- gpava(x, y, ties="secondary") + > res1$x + + `isotone` version: 1.1-0, 2015-07-24 + R version: R version 3.3.2 (2016-10-31) + """ + x = np.array([0, 1, 1, 2, 3, 4]) + y = np.array([0, 0, 1, 0, 0, 1]) + y_true = np.array([0., 0.25, 0.25, 0.25, 0.25, 1.]) + ir = IsotonicRegression() + ir.fit(x, y) + assert_array_almost_equal(ir.transform(x), y_true) + assert_array_almost_equal(ir.fit_transform(x, y), y_true) + + def test_isotonic_regression_reversed(): y = np.array([10, 9, 10, 7, 6, 6.1, 5]) y_ = IsotonicRegression(increasing=False).fit_transform( diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 1165370885d36..d1d62f80e51a5 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -19,6 +19,7 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_dict_equal +from sklearn.utils.testing import assert_no_warnings from sklearn.base import clone, BaseEstimator from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union @@ -187,7 +188,7 @@ def test_pipeline_init(): assert_raises(ValueError, pipe.set_params, anova__C=0.1) # Test clone - pipe2 = clone(pipe) + pipe2 = assert_no_warnings(clone, pipe) assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc']) # Check that apart from estimators, the parameters are the same @@ -421,6 +422,10 @@ def test_feature_union(): X_sp_transformed = fs.fit_transform(X_sp, y) assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) + # Test clone + fs2 = assert_no_warnings(clone, fs) + assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1]) + # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 97eee80ecff71..71ee8fa2bcb61 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -30,7 +30,6 @@ from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_warns from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import raises from sklearn.utils.testing import ignore_warnings from sklearn.utils.validation import check_random_state @@ 
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 1165370885d36..d1d62f80e51a5 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -19,6 +19,7 @@
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_dict_equal
+from sklearn.utils.testing import assert_no_warnings
 
 from sklearn.base import clone, BaseEstimator
 from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
@@ -187,7 +188,7 @@ def test_pipeline_init():
     assert_raises(ValueError, pipe.set_params, anova__C=0.1)
 
     # Test clone
-    pipe2 = clone(pipe)
+    pipe2 = assert_no_warnings(clone, pipe)
     assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])
 
     # Check that apart from estimators, the parameters are the same
@@ -421,6 +422,10 @@ def test_feature_union():
     X_sp_transformed = fs.fit_transform(X_sp, y)
     assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())
 
+    # Test clone
+    fs2 = assert_no_warnings(clone, fs)
+    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])
+
     # test setting parameters
     fs.set_params(select__k=2)
     assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py
index 97eee80ecff71..71ee8fa2bcb61 100644
--- a/sklearn/tree/tests/test_tree.py
+++ b/sklearn/tree/tests/test_tree.py
@@ -30,7 +30,6 @@
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import assert_warns_message
-from sklearn.utils.testing import raises
 from sklearn.utils.testing import ignore_warnings
 
 from sklearn.utils.validation import check_random_state
@@ -394,11 +393,10 @@ def test_importances():
                        clf2.feature_importances_)
 
 
-@raises(ValueError)
 def test_importances_raises():
     # Check if variable importance before fit raises ValueError.
     clf = DecisionTreeClassifier()
-    clf.feature_importances_
+    assert_raises(ValueError, getattr, clf, 'feature_importances_')
 
 
 def test_importances_gini_equal_mse():
diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py
index 4b2665cdd4f77..83e8a48a6625a 100644
--- a/sklearn/utils/__init__.py
+++ b/sklearn/utils/__init__.py
@@ -90,7 +90,7 @@ def safe_mask(X, mask):
         mask
     """
     mask = np.asarray(mask)
-    if np.issubdtype(mask.dtype, np.int):
+    if np.issubdtype(mask.dtype, np.signedinteger):
         return mask
 
     if hasattr(X, "toarray"):
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 81f0d88e3f02b..3e7cb198a9d12 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -76,6 +76,7 @@ def _yield_non_meta_checks(name, estimator):
     yield check_sample_weights_pandas_series
     yield check_sample_weights_list
     yield check_estimators_fit_returns_self
+    yield check_complex_data
 
     # Check that all estimator yield informative messages when
     # trained on empty datasets
@@ -458,6 +459,16 @@ def check_dtype_object(name, estimator_orig):
         assert_raises_regex(TypeError, msg, estimator.fit, X, y)
 
 
+def check_complex_data(name, estimator_orig):
+    # check that estimators raise an exception on providing complex data
+    X = np.random.sample(10) + 1j * np.random.sample(10)
+    X = X.reshape(-1, 1)
+    y = np.random.sample(10) + 1j * np.random.sample(10)
+    estimator = clone(estimator_orig)
+    assert_raises_regex(ValueError, "Complex data not supported",
+                        estimator.fit, X, y)
+
+
 @ignore_warnings
 def check_dict_unchanged(name, estimator_orig):
     # this estimator raises
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index 70619673bea3b..e95ceb57497ae 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -421,7 +421,6 @@ def weighted_mode(a, w, axis=0):
     else:
         a = np.asarray(a)
         w = np.asarray(w)
-        axis = axis
 
     if a.shape != w.shape:
         w = np.zeros(a.shape, dtype=w.dtype) + w
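`check_complex_data` leans on the `check_array` guard added at the end of this diff. A standalone sketch of what the common test asserts, using `LinearRegression` as an arbitrary estimator and assuming the patched validation:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.utils.testing import assert_raises_regex

rng = np.random.RandomState(0)
X = (rng.random_sample(10) + 1j * rng.random_sample(10)).reshape(-1, 1)
y = rng.random_sample(10) + 1j * rng.random_sample(10)

# Estimators routing input through check_array now fail loudly instead
# of silently discarding the imaginary part.
assert_raises_regex(ValueError, "Complex data not supported",
                    LinearRegression().fit, X, y)
```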
diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
index 9ff79c628a1b8..52c12ce5d5953 100644
--- a/sklearn/utils/sparsefuncs_fast.pyx
+++ b/sklearn/utils/sparsefuncs_fast.pyx
@@ -18,6 +18,9 @@
 from cython cimport floating
 
 np.import_array()
 
+ctypedef fused integral:
+    int
+    long long
 
 ctypedef np.float64_t DOUBLE
 
@@ -30,11 +33,11 @@ def csr_row_norms(X):
 
 
 def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data,
                    shape,
-                   np.ndarray[int, ndim=1, mode="c"] X_indices,
-                   np.ndarray[int, ndim=1, mode="c"] X_indptr):
+                   np.ndarray[integral, ndim=1, mode="c"] X_indices,
+                   np.ndarray[integral, ndim=1, mode="c"] X_indptr):
     cdef:
-        unsigned int n_samples = shape[0]
-        unsigned int n_features = shape[1]
+        unsigned long long n_samples = shape[0]
+        unsigned long long n_features = shape[1]
         np.ndarray[DOUBLE, ndim=1, mode="c"] norms
 
         np.npy_intp i, j
@@ -326,17 +329,16 @@ def inplace_csr_row_normalize_l1(X):
 
 
 def _inplace_csr_row_normalize_l1(np.ndarray[floating, ndim=1] X_data,
                                   shape,
-                                  np.ndarray[int, ndim=1] X_indices,
-                                  np.ndarray[int, ndim=1] X_indptr):
-    cdef unsigned int n_samples = shape[0]
-    cdef unsigned int n_features = shape[1]
+                                  np.ndarray[integral, ndim=1] X_indices,
+                                  np.ndarray[integral, ndim=1] X_indptr):
+    cdef unsigned long long n_samples = shape[0]
+    cdef unsigned long long n_features = shape[1]
 
     # the column indices for row i are stored in:
     #    indices[indptr[i]:indptr[i+1]]
     # and their corresponding values are stored in:
     #    data[indptr[i]:indptr[i+1]]
-    cdef unsigned int i
-    cdef unsigned int j
+    cdef np.npy_intp i, j
     cdef double sum_
 
     for i in xrange(n_samples):
@@ -361,13 +363,12 @@ def inplace_csr_row_normalize_l2(X):
 
 
 def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data,
                                   shape,
-                                  np.ndarray[int, ndim=1] X_indices,
-                                  np.ndarray[int, ndim=1] X_indptr):
-    cdef unsigned int n_samples = shape[0]
-    cdef unsigned int n_features = shape[1]
+                                  np.ndarray[integral, ndim=1] X_indices,
+                                  np.ndarray[integral, ndim=1] X_indptr):
+    cdef integral n_samples = shape[0]
+    cdef integral n_features = shape[1]
 
-    cdef unsigned int i
-    cdef unsigned int j
+    cdef np.npy_intp i, j
     cdef double sum_
 
     for i in xrange(n_samples):
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index 4e7f7ea3e98a3..c5b6209cc5728 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -45,9 +45,31 @@
 import sklearn
 from sklearn.base import BaseEstimator
 from sklearn.externals import joblib
+from sklearn.utils import deprecated
 
-from nose.tools import raises
-from nose import with_setup
+additional_names_in_all = []
+try:
+    from nose.tools import raises as _nose_raises
+    deprecation_message = (
+        'sklearn.utils.testing.raises has been deprecated in version 0.20 '
+        'and will be removed in 0.22. Please use '
+        'sklearn.utils.testing.assert_raises instead.')
+    raises = deprecated(deprecation_message)(_nose_raises)
+    additional_names_in_all.append('raises')
+except ImportError:
+    pass
+
+try:
+    from nose.tools import with_setup as _with_setup
+    deprecation_message = (
+        'sklearn.utils.testing.with_setup has been deprecated in version 0.20 '
+        'and will be removed in 0.22.'
+        'If your code relies on with_setup, please use'
+        ' nose.tools.with_setup instead.')
+    with_setup = deprecated(deprecation_message)(_with_setup)
+    additional_names_in_all.append('with_setup')
+except ImportError:
+    pass
 
 from numpy.testing import assert_almost_equal
 from numpy.testing import assert_array_equal
@@ -61,12 +83,13 @@
 from sklearn.utils._unittest_backport import TestCase
 
 __all__ = ["assert_equal", "assert_not_equal", "assert_raises",
-           "assert_raises_regexp", "raises", "with_setup", "assert_true",
+           "assert_raises_regexp", "assert_true",
            "assert_false", "assert_almost_equal", "assert_array_equal",
            "assert_array_almost_equal", "assert_array_less",
           "assert_less", "assert_less_equal",
           "assert_greater", "assert_greater_equal",
           "assert_approx_equal", "SkipTest"]
+__all__.extend(additional_names_in_all)
 
 _dummy = TestCase('__init__')
 assert_equal = _dummy.assertEqual
@@ -745,10 +768,6 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         _delete_folder(self.temp_folder)
 
 
-with_network = with_setup(check_skip_network)
-with_travis = with_setup(check_skip_travis)
-
-
 class _named_check(object):
     """Wraps a check to show a useful description
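The `integral` fused type lets these Cython kernels accept the int64 index arrays SciPy switches to once `nnz` exceeds the int32 range, instead of failing with a buffer dtype mismatch. A sketch of the situation the tests below simulate, assuming the patched helpers:

```python
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs_fast import inplace_csr_row_normalize_l2

X = sp.csr_matrix(np.random.RandomState(0).randn(10, 5))

# Force the int64 indices SciPy would use for very large matrices.
X.indices = X.indices.astype(np.int64)
X.indptr = X.indptr.astype(np.int64)

# Previously a buffer dtype mismatch; now every row is normalized to
# unit L2 norm in place.
inplace_csr_row_normalize_l2(X)
norms = np.sqrt(np.asarray(X.multiply(X).sum(axis=1)).ravel())
assert np.allclose(norms, 1.0)
```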
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index 86d604ef33f66..f53b814c70084 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ b/sklearn/utils/tests/test_extmath.py
@@ -206,10 +206,19 @@ def test_row_norms():
                                   precision)
         assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)
 
-        Xcsr = sparse.csr_matrix(X, dtype=dtype)
-        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
-                                  precision)
-        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision)
+        for csr_index_dtype in [np.int32, np.int64]:
+            Xcsr = sparse.csr_matrix(X, dtype=dtype)
+            # csr_matrix will use int32 indices by default,
+            # up-casting those to int64 when necessary
+            if csr_index_dtype is np.int64:
+                Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype)
+                Xcsr.indices = Xcsr.indices.astype(csr_index_dtype)
+            assert Xcsr.indices.dtype == csr_index_dtype
+            assert Xcsr.indptr.dtype == csr_index_dtype
+            assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
+                                      precision)
+            assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
+                                      precision)
 
 
 def test_randomized_svd_low_rank_with_noise():
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index fd09267ea7b0a..f2b35e7459833 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -478,8 +478,16 @@ def test_inplace_normalize():
         for dtype in (np.float64, np.float32):
             X = rs.randn(10, 5).astype(dtype)
             X_csr = sp.csr_matrix(X)
-            inplace_csr_row_normalize(X_csr)
-            assert_equal(X_csr.dtype, dtype)
-            if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
-                X_csr.data **= 2
-            assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
+            for index_dtype in [np.int32, np.int64]:
+                # csr_matrix will use int32 indices by default,
+                # up-casting those to int64 when necessary
+                if index_dtype is np.int64:
+                    X_csr.indptr = X_csr.indptr.astype(index_dtype)
+                    X_csr.indices = X_csr.indices.astype(index_dtype)
+                assert X_csr.indices.dtype == index_dtype
+                assert X_csr.indptr.dtype == index_dtype
+                inplace_csr_row_normalize(X_csr)
+                assert_equal(X_csr.dtype, dtype)
+                if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
+                    X_csr.data **= 2
+                assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
index 6bebad884d835..37a0eb859f565 100644
--- a/sklearn/utils/tests/test_validation.py
+++ b/sklearn/utils/tests/test_validation.py
@@ -437,6 +437,45 @@ def test_check_array_min_samples_and_features_messages():
     assert_array_equal(y, y_checked)
 
 
+def test_check_array_complex_data_error():
+    X = np.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]])
+    assert_raises_regex(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # list of lists
+    X = [[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]
+    assert_raises_regex(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # tuple of tuples
+    X = ((1 + 2j, 3 + 4j, 5 + 7j), (2 + 3j, 4 + 5j, 6 + 7j))
+    assert_raises_regex(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # list of np arrays
+    X = [np.array([1 + 2j, 3 + 4j, 5 + 7j]),
+         np.array([2 + 3j, 4 + 5j, 6 + 7j])]
+    assert_raises_regex(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # tuple of np arrays
+    X = (np.array([1 + 2j, 3 + 4j, 5 + 7j]),
+         np.array([2 + 3j, 4 + 5j, 6 + 7j]))
+    assert_raises_regex(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # dataframe
+    X = MockDataFrame(
+        np.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]))
+    assert_raises_regex(
+        ValueError, "Complex data not supported", check_array, X)
+
+    # sparse matrix
+    X = sp.coo_matrix([[0, 1 + 2j], [0, 0]])
+    assert_raises_regex(
+        ValueError, "Complex data not supported", check_array, X)
+
+
 def test_has_fit_parameter():
     assert_false(has_fit_parameter(KNeighborsClassifier, "sample_weight"))
     assert_true(has_fit_parameter(RandomForestRegressor, "sample_weight"))
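The `check_array` change that follows hinges on NumPy raising a `ComplexWarning` when complex values are cast to a real dtype; escalating that warning to an error is what turns silent truncation of the imaginary part into the `ValueError` exercised above. A standalone sketch of the mechanism:

```python
import warnings
import numpy as np
from numpy.core.numeric import ComplexWarning

data = [[1 + 2j, 3 + 4j]]

with warnings.catch_warnings():
    warnings.simplefilter('error', ComplexWarning)
    try:
        np.array(data, dtype=np.float64)  # would silently drop 2j and 4j
    except ComplexWarning:
        print("Complex data not supported")

# With dtype=None no cast happens, so no warning is emitted: the result
# is simply complex, which is why a dtype check is still needed.
assert np.array(data).dtype.kind == 'c'
```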
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index 5847b540d7b6c..080c30fcf9b2c 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -13,6 +13,7 @@
 
 import numpy as np
 import scipy.sparse as sp
+from numpy.core.numeric import ComplexWarning
 
 from ..externals import six
 from ..utils.fixes import signature
@@ -307,6 +308,13 @@ def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
     return spmatrix
 
 
+def _ensure_no_complex_data(array):
+    if hasattr(array, 'dtype') and array.dtype is not None \
+            and hasattr(array.dtype, 'kind') and array.dtype.kind == "c":
+        raise ValueError("Complex data not supported\n"
+                         "{}\n".format(array))
+
+
 def check_array(array, accept_sparse=False, dtype="numeric", order=None,
                 copy=False, force_all_finite=True, ensure_2d=True,
                 allow_nd=False, ensure_min_samples=1, ensure_min_features=1,
@@ -427,10 +435,28 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None,
     context = " by %s" % estimator_name if estimator is not None else ""
 
     if sp.issparse(array):
+        _ensure_no_complex_data(array)
         array = _ensure_sparse_format(array, accept_sparse, dtype, copy,
                                       force_all_finite)
     else:
-        array = np.array(array, dtype=dtype, order=order, copy=copy)
+        # If np.array(..) gives a ComplexWarning, we convert the warning
+        # to an error. This is needed because specifying a non-complex
+        # dtype converts complex values to a real dtype, which would
+        # slip past the check made after this warnings context manager.
+        with warnings.catch_warnings():
+            try:
+                warnings.simplefilter('error', ComplexWarning)
+                array = np.array(array, dtype=dtype, order=order, copy=copy)
+            except ComplexWarning:
+                raise ValueError("Complex data not supported\n"
+                                 "{}\n".format(array))
+
+        # It is possible that np.array(..) gave no warning. This happens
+        # when no dtype conversion happened, for example dtype = None. The
+        # result is that np.array(..) produces an array of complex dtype
+        # and we need to catch and raise an exception for such cases.
+        _ensure_no_complex_data(array)
 
     if ensure_2d:
         if array.ndim == 1: