Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated manual & improved plot function #105

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
32 changes: 23 additions & 9 deletions doc_src/manual.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ Manual

Quick Start
-----------
To run the examples, just download the `data <https://github.com/automl/fanova/blob/master/fanova/example/online_lda.tar.gz>`_ and start the python console.
To run the examples, download the data from the `github repository <https://github.com/automl/fanova/tree/master/examples/example_data/online_lda>`_ and start the python console.
We can then import fANOVA and start it by typing

>>> from fanova import fANOVA
>>> import csv
>>> import os
>>> import numpy as np
>>> path = os.path.dirname(os.path.realpath(__file__))
>>> X = np.loadtxt(path + '/example_data/online_lda/online_lda_features.csv', delimiter=",")
>>> Y = np.loadtxt(path + '/example_data/online_lda/online_lda_responses.csv', delimiter=",")
Expand All @@ -21,12 +23,18 @@ This creates a new fANOVA object and fits the Random Forest on the specified dat
To compute the marginal of the first parameter, type:

>>> f.quantify_importance((0, ))
0.075414122571199116
{(0,): {'individual importance': 0.07567390839783641,
'total importance': 0.07567390839783641,
'individual std': 0.020053764191788233,
'total std': 0.020053764191788233}}

fANOVA also allows you to specify parameters by their names.

>>> f.quantify_importance(("Col0", ))
0.075414122571199116
>>> f.quantify_importance(("x_000", ))
{('x_000',): {'individual importance': 0.07567390839783641,
'total importance': 0.07567390839783641,
'individual std': 0.020053764191788233,
'total std': 0.020053764191788233}}


Advanced
Expand All @@ -48,9 +56,9 @@ You can also specify the number of trees in the random forest as well as the min
More functions
--------------

* **f.get_most_important_pairwise_marginals(n)**
* **f.get_most_important_pairwise_marginals(n=N)**

Returns the **n** most important pairwise marginals
Returns the **N** most important pairwise marginals

* **f.get_most_important_pairwise_marginals(params)**

Expand Down Expand Up @@ -91,7 +99,7 @@ The same can be done for pairwise marginals

>>> vis.plot_pairwise_marginal([0,1])

.. image:: ../examples/example_data/online_lda/figure2.png
.. image:: ../examples/example_data/online_lda/pairwise.png


If you are just interested in the N most important pairwise marginals, you can plot them through:
Expand Down Expand Up @@ -120,7 +128,13 @@ You will also find an extra directory in your specified plot directory called 'i
How to load a CSV-file
--------------------------

import numpy as np
>>> import numpy as np
>>> X = np.loadtxt('your_file.csv', delimiter=",")

data = np.loadtxt('your_file.csv', delimiter=",")
Alternatively, pandas may be used:

>>> import pandas as pd
>>> df = pd.read_csv('your_file.csv')
>>> X = df[your_param_columns]
>>> Y = df[your_score_column]
>>> f = fANOVA(X, Y, config_space=cs)
68 changes: 46 additions & 22 deletions fanova/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def generate_marginal(self, p, resolution=100):
std = np.sqrt(v)
return mean, std

def plot_marginal(self, param, resolution=100, log_scale=None, show=True, incumbents=None):
def plot_marginal(self, param, resolution=100, log_scale=None, show=True, incumbents=None, ax=None):
"""
Creates a plot of marginal of a selected parameter

Expand All @@ -310,14 +310,31 @@ def plot_marginal(self, param, resolution=100, log_scale=None, show=True, incumb
resolution: int
Number of samples to generate from the parameter range as values to predict
log_scale: boolean
If log scale is required or not. If no value is given, it is deduced from the ConfigSpace provided
Whether to plot using log scale or not. If no value is given, it is deduced from the ConfigSpace provided and from values.
show: boolean
whether to call plt.show() to show plot directly as interactive matplotlib-plot
incumbents: List[Configuration]
list of ConfigSpace.Configurations that are marked as incumbents
ax: AxesSubplot, optional
A matplotlib AxesSubplot in which to place the plot or, if None, a new figure will be created.

Returns
-------
ax: AxesSubplot
A matplotlib AxesSubplot containing the plot. To save it to disk use `ax.get_figure().savefig('filename.png')`.
"""
param, param_name, param_idx = self._get_parameter(param)

# get figure AxesSubplot to plot on (or make a new one)
if (ax is None):
# create empty figure to work with
fig, ax = plt.subplots(1)
else:
fig = ax.get_figure()

# don't show the figure when user has provided their own figure AxesSubplot
show = False

# check if categorical
if isinstance(param, NumericalHyperparameter):
# PREPROCESS
Expand All @@ -328,33 +345,39 @@ def plot_marginal(self, param, resolution=100, log_scale=None, show=True, incumb
lower_curve = mean - std
upper_curve = mean + std


# auto-detect whether to do log-scale
if log_scale is None:
log_scale = param.log or (np.diff(grid).std() > 0.000001)
# take log value from ConfigSpace
log_scale = param.log

# auto-detect if log-scale might be better
if not log_scale and (np.diff(grid).std() > 0.000001):
self.logger.info("Plotting this parameter, %s, in log-scale because auto-detected that it might be better." % param_name)
log_scale = True

# PLOT
if log_scale:
if np.diff(grid).std() > 0.000001:
self.logger.info("It might be better to plot this parameter '%s' in log-scale.", param_name)
plt.semilogx(grid, mean, 'b', label='predicted %s' % self._y_label)
ax.semilogx(grid, mean, 'b', label='predicted %s' % self._y_label)
else:
plt.plot(grid, mean, 'b', label='predicted %s' % self._y_label)
plt.fill_between(grid, upper_curve, lower_curve, facecolor='red', alpha=0.6, label='std')
ax.plot(grid, mean, 'b', label='predicted %s' % self._y_label)
ax.fill_between(grid, upper_curve, lower_curve, facecolor='red', alpha=0.6, label='std')

if incumbents is not None:
if not isinstance(incumbents, list):
incumbents = [incumbents]
values = [inc[param_name] for inc in incumbents if param_name in inc and inc[param_name] is not None]
indices = [(np.abs(np.asarray(grid) - val)).argmin() for val in values]
if len(indices) > 0:
plt.scatter(list([grid[idx] for idx in indices]),
ax.scatter(list([grid[idx] for idx in indices]),
list([mean[idx] for idx in indices]),
label='incumbent', c='black', marker='.', zorder=999)

plt.xlabel(param_name)
plt.ylabel(self._y_label)
plt.grid(True)
plt.legend()
plt.tight_layout()
ax.set_xlabel(param_name)
ax.set_ylabel(self._y_label)
ax.grid(True)
ax.legend()
fig.tight_layout()

else:
# PREPROCESS
Expand All @@ -376,8 +399,8 @@ def plot_marginal(self, param, resolution=100, log_scale=None, show=True, incumb
max_y = mean[0]

# PLOT
b = plt.boxplot([[x] for x in mean])
plt.xticks(indices, labels)
b = ax.boxplot([[x] for x in mean])
ax.set_xticks(indices, labels)
# blow up boxes
for box, std_ in zip(b["boxes"], std):
y = box.get_ydata()
Expand All @@ -388,16 +411,17 @@ def plot_marginal(self, param, resolution=100, log_scale=None, show=True, incumb
min_y = min(min_y, y[0] - std_)
max_y = max(max_y, y[2] + std_)

plt.ylim([min_y, max_y])
ax.set_ylim([min_y, max_y])

plt.ylabel(self._y_label)
plt.xlabel(param_name)
plt.tight_layout()
ax.set_ylabel(self._y_label)
ax.set_xlabel(param_name)
fig.tight_layout()

if show:
plt.show()
else:
return plt

# Always return the matplotlib plot (to allow users to save it etc)
return ax

def create_most_important_pairwise_marginal_plots(self, params=None, n=20, three_d=True, resolution=20):
"""
Expand Down