Merge pull request #405 from corochann/add_cgcnn_megnet
Add cgcnn megnet
mottodora authored Nov 6, 2019
2 parents b904fa0 + fb22a79 commit 5a9319d
Showing 42 changed files with 1,755 additions and 134 deletions.
4 changes: 1 addition & 3 deletions .travis.yml
@@ -4,8 +4,6 @@ language: python
os: linux
dist: trusty
python:
- 2.7
- 3.5
- 3.6
env:
@@ -26,7 +24,7 @@ install:
- pip install codecov
- pip install mock
- conda install pip pytest pytest-cov
- conda install -c rdkit rdkit==2017.09.3.0
- conda install -c rdkit rdkit==2019.03.2.0
- if [ "${CHAINER_VERSION}" = "prerelease" ]; then
pip install --pre chainer;
else
12 changes: 9 additions & 3 deletions README.md
@@ -25,12 +25,13 @@ Chainer Chemistry depends on the following packages:
- [`pandas`](https://pandas.pydata.org)
- [`scikit-learn`](http://scikit-learn.org/stable/)
- [`tqdm`](https://pypi.python.org/pypi/tqdm)
- [`h5py`](https://pypi.python.org/pypi/h5py)

These are automatically added to the system when installing the library via the
`pip` command (see _Installation_). However, the following needs to be
installed manually:

- [`rdkit (release 2017.09.3.0)`](https://github.com/rdkit/rdkit)
- [`rdkit (release 2019.03.2.0)`](https://github.com/rdkit/rdkit)

Please refer to the RDKit [documentation](http://www.rdkit.org/docs/Install.html)
for more information regarding the installation steps.
@@ -44,7 +45,7 @@ currently supported:
| v0.4.0 | v3.0 ~ v4.0 *1 | 2017.09.3.0 | 2.7, 3.5, 3.6 |
| v0.5.0 | v3.0 ~ v5.0 *2 | 2017.09.3.0 | 2.7, 3.5, 3.6 |
| v0.6.0 | v6.0 ~ *3 | 2017.09.3.0 | 2.7, 3.5, 3.6 |
| master branch | v6.0 ~ *3 | 2017.09.3.0 | 2.7, 3.5, 3.6 |
| master branch | v6.0 ~ *3 | 2019.03.2.0 | 3.6, 3.7 |
| v0.7.0 release plan | v7.0 ~ | 2019.03.2.0 | 3.6, 3.7 *4 |


@@ -112,6 +113,8 @@ The following graph convolutional neural networks are currently supported:
- MPNN: Message Passing Neural Networks [3]
- Set2Set [19]
- GNN-FiLM: Graph Neural Networks with Feature-wise Linear Modulation [20]
- MEGNet: MatErials Graph Network [24]
- CGCNN: Crystal Graph Convolutional Neural Networks [25]

We also experimentally support attaching the Graph Warp Module (GWM) [18] to the following models:
- NFP ('nfp_gwm')
@@ -209,7 +212,6 @@ papers. Use the library at your own risk.
[18] K. Ishiguro, S. Maeda, and M. Koyama, ``Graph Warp Module: an Auxiliary Module for Boosting the Power of Graph Neural Networks'', arXiv:1902.01020 [cs.LG], 2019.

[19] Oriol Vinyals, Samy Bengio, Manjunath Kudlur. Order Matters: Sequence to sequence for sets. *arXiv preprint arXiv:1511.06391*, 2015.

[20] Marc Brockschmidt, ``GNN-FiLM: Graph Neural Networks with Feature-wise Linear Modulation'', arXiv:1906.12192 [cs.LG], 2019.

@@ -218,3 +220,7 @@ papers. Use the library at your own risk.
[22] C. Lee Giles and Kurt D. Bollacker and Steve Lawrence, CiteSeer: An Automatic Citation Indexing System. *Proceedings of the Third ACM Conference on Digital Libraries*, 1998.

[23] William L. Hamilton and Zhitao Ying and Jure Leskovec, Inductive Representation Learning on Large Graphs. *Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, 4-9 December 2017*

[24] Chi Chen, Weike Ye, Yunxing Zuo, Chen Zheng, and Shyue Ping Ong. Graph networks as a universal machine learning framework for molecules and crystals. *Chemistry of Materials*, 31(9):3564–3572, 2019.

[25] Tian Xie and Jeffrey C Grossman. Crystal graph convolutional neural networks for an accurate and interpretable prediction of material properties. *Physical review letters*, 120(14):145301, 2018.
22 changes: 22 additions & 0 deletions chainer_chemistry/dataset/converters/__init__.py
@@ -0,0 +1,22 @@
from chainer_chemistry.dataset.converters.concat_mols import concat_mols # NOQA
from chainer_chemistry.dataset.converters.megnet_converter import megnet_converter # NOQA
from chainer_chemistry.dataset.converters.cgcnn_converter import cgcnn_converter # NOQA

converter_method_dict = {
    'ecfp': concat_mols,
    'nfp': concat_mols,
    'nfp_gwm': concat_mols,
    'ggnn': concat_mols,
    'ggnn_gwm': concat_mols,
    'gin': concat_mols,
    'gin_gwm': concat_mols,
    'schnet': concat_mols,
    'weavenet': concat_mols,
    'relgcn': concat_mols,
    'rsgcn': concat_mols,
    'rsgcn_gwm': concat_mols,
    'relgat': concat_mols,
    'gnnfilm': concat_mols,
    'megnet': megnet_converter,
    'cgcnn': cgcnn_converter
}
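
For orientation, a minimal sketch of how this registry is typically consumed; the `method` value below is a hypothetical command-line choice, not something defined in this diff:

# Hypothetical lookup: pick the converter registered for a method name.
from chainer_chemistry.dataset.converters import converter_method_dict

method = 'megnet'  # e.g. parsed from a --method flag (assumed for illustration)
converter = converter_method_dict[method]  # -> megnet_converter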
36 changes: 36 additions & 0 deletions chainer_chemistry/dataset/converters/cgcnn_converter.py
@@ -0,0 +1,36 @@
import numpy

import chainer
from chainer import functions
from chainer.dataset.convert import to_device


@chainer.dataset.converter()
def cgcnn_converter(batch, device=None, padding=None):
"""CGCNN converter"""
if len(batch) == 0:
raise ValueError("batch is empty")

atom_feat, nbr_feat, nbr_idx = [], [], []
batch_atom_idx, target = [], []
current_idx = 0
xp = device.xp
for element in batch:
atom_feat.append(element[0])
nbr_feat.append(element[1])
nbr_idx.append(element[2] + current_idx)
target.append(element[3])
n_atom = element[0].shape[0]
atom_idx = numpy.arange(n_atom) + current_idx
batch_atom_idx.append(atom_idx)
current_idx += n_atom

atom_feat = to_device(device, functions.concat(atom_feat, axis=0).data)
nbr_feat = to_device(device, functions.concat(nbr_feat, axis=0).data)
# Always use numpy array for batch_atom_index
# this is list of variable length array
batch_atom_idx = numpy.array(batch_atom_idx)
nbr_idx = to_device(device, functions.concat(nbr_idx, axis=0).data)
target = to_device(device, xp.asarray(target))
result = (atom_feat, nbr_feat, batch_atom_idx, nbr_idx, target)
return result
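
A minimal CPU sketch of what this converter expects and returns; the feature dimensions and random inputs below are assumptions for illustration, not values taken from this pull request:

import numpy
from chainer_chemistry.dataset.converters import cgcnn_converter

def make_crystal(n_atom, n_feat=4, max_num_nbr=12, nbr_feat_dim=40):
    # one dataset example: (atom_feat, nbr_feat, nbr_idx, target)
    atom_feat = numpy.random.rand(n_atom, n_feat).astype(numpy.float32)
    nbr_feat = numpy.random.rand(
        n_atom, max_num_nbr, nbr_feat_dim).astype(numpy.float32)
    nbr_idx = numpy.random.randint(
        0, n_atom, (n_atom, max_num_nbr)).astype(numpy.int32)
    target = numpy.array([1.0], dtype=numpy.float32)
    return atom_feat, nbr_feat, nbr_idx, target

batch = [make_crystal(2), make_crystal(3)]
atom_feat, nbr_feat, batch_atom_idx, nbr_idx, target = cgcnn_converter(
    batch, device=-1)
# atom_feat has 2 + 3 = 5 rows; the neighbor indices of the second crystal
# are shifted by 2 so they still point at that crystal's own atoms.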
chainer_chemistry/dataset/converters/concat_mols.py
@@ -1,69 +1,69 @@
import chainer


@chainer.dataset.converter()
def concat_mols(batch, device=None, padding=0):
"""Concatenates a list of molecules into array(s).
This function converts an "array of tuples" into a "tuple of arrays".
Specifically, given a list of examples each of which consists of
a list of elements, this function first makes an array
by taking the element in the same position from each example
and concatenates them along the newly-inserted first axis
(called `batch dimension`) into one array.
It repeats this for all positions and returns the resulting arrays.
The output type depends on the type of examples in ``batch``.
For instance, consider each example consists of two arrays ``(x, y)``.
Then, this function concatenates ``x`` 's into one array, and ``y`` 's
into another array, and returns a tuple of these two arrays. Another
example: consider each example is a dictionary of two entries whose keys
are ``'x'`` and ``'y'``, respectively, and values are arrays. Then, this
function concatenates ``x`` 's into one array, and ``y`` 's into another
array, and returns a dictionary with two entries ``x`` and ``y`` whose
values are the concatenated arrays.
When the arrays to concatenate have different shapes, the behavior depends
on the ``padding`` value. If ``padding`` is ``None``, it raises an error.
Otherwise, it builds an array of the minimum shape that the
contents of all arrays can be substituted to. The padding value is then
used to the extra elements of the resulting arrays.
The current implementation is identical to
:func:`~chainer.dataset.concat_examples` of Chainer, except the default
value of the ``padding`` option is changed to ``0``.
.. admonition:: Example
>>> import numpy
>>> from chainer_chemistry.dataset.converters import concat_mols
>>> x0 = numpy.array([1, 2])
>>> x1 = numpy.array([4, 5, 6])
>>> dataset = [x0, x1]
>>> results = concat_mols(dataset)
>>> print(results)
[[1 2 0]
[4 5 6]]
.. seealso:: :func:`chainer.dataset.concat_examples`
Args:
batch (list):
A list of examples. This is typically given by a dataset
iterator.
device (int):
Device ID to which each array is sent. Negative value
indicates the host memory (CPU). If it is omitted, all arrays are
left in the original device.
padding:
Scalar value for extra elements. If this is None (default),
an error is raised on shape mismatch. Otherwise, an array of
minimum dimensionalities that can accommodate all arrays is
created, and elements outside of the examples are padded by this
value.
Returns:
Array, a tuple of arrays, or a dictionary of arrays:
The type depends on the type of each example in the batch.
"""
return chainer.dataset.concat_examples(batch, device, padding=padding)
41 changes: 41 additions & 0 deletions chainer_chemistry/dataset/converters/megnet_converter.py
@@ -0,0 +1,41 @@
import chainer
from chainer.dataset.convert import to_device


@chainer.dataset.converter()
def megnet_converter(batch, device=None, padding=0):
"""MEGNet converter"""
if len(batch) == 0:
raise ValueError("batch is empty")

atom_feat, pair_feat, global_feat, target = [], [], [], []
atom_idx, pair_idx, start_idx, end_idx = [], [], [], []
batch_size = len(batch)
current_atom_idx = 0
for i in range(batch_size):
element = batch[i]
n_atom = element[0].shape[0]
n_pair = element[1].shape[0]
atom_feat.extend(element[0])
pair_feat.extend(element[1])
global_feat.append(element[2])
atom_idx.extend([i]*n_atom)
pair_idx.extend([i]*n_pair)
start_idx.extend(element[3][0] + current_atom_idx)
end_idx.extend(element[3][1] + current_atom_idx)
target.append(element[4])
current_atom_idx += n_atom

xp = device.xp
atom_feat = to_device(device, xp.asarray(atom_feat))
pair_feat = to_device(device, xp.asarray(pair_feat))
global_feat = to_device(device, xp.asarray(global_feat))
atom_idx = to_device(device, xp.asarray(atom_idx))
pair_idx = to_device(device, xp.asarray(pair_idx))
start_idx = to_device(device, xp.asarray(start_idx))
end_idx = to_device(device, xp.asarray(end_idx))
target = to_device(device, xp.asarray(target))
result = (atom_feat, pair_feat, global_feat, atom_idx, pair_idx,
start_idx, end_idx, target)

return result
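
As above, a hedged usage sketch; the feature sizes and the (start, end) pair-index layout are assumptions inferred from the converter code, not values documented in this pull request:

import numpy
from chainer_chemistry.dataset.converters import megnet_converter

def make_graph(n_atom, n_pair, graph_label):
    # one dataset example: (atom_feat, pair_feat, global_feat, (start, end), target)
    atom_feat = numpy.random.rand(n_atom, 16).astype(numpy.float32)
    pair_feat = numpy.random.rand(n_pair, 8).astype(numpy.float32)
    global_feat = numpy.zeros(2, dtype=numpy.float32)
    start = numpy.random.randint(0, n_atom, n_pair).astype(numpy.int32)
    end = numpy.random.randint(0, n_atom, n_pair).astype(numpy.int32)
    target = numpy.array([graph_label], dtype=numpy.float32)
    return atom_feat, pair_feat, global_feat, numpy.stack([start, end]), target

batch = [make_graph(2, 4, 0.5), make_graph(3, 6, 1.5)]
(atom_feat, pair_feat, global_feat, atom_idx, pair_idx,
 start_idx, end_idx, target) = megnet_converter(batch, device=-1)
# atom_idx / pair_idx record which graph each row belongs to, while start_idx
# and end_idx are shifted by the number of atoms already placed in the batch.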
4 changes: 4 additions & 0 deletions chainer_chemistry/dataset/preprocessors/__init__.py
@@ -21,6 +21,8 @@
from chainer_chemistry.dataset.preprocessors.rsgcn_preprocessor import RSGCNPreprocessor # NOQA
from chainer_chemistry.dataset.preprocessors.schnet_preprocessor import SchNetPreprocessor # NOQA
from chainer_chemistry.dataset.preprocessors.weavenet_preprocessor import WeaveNetPreprocessor # NOQA
from chainer_chemistry.dataset.preprocessors.megnet_preprocessor import MEGNetPreprocessor # NOQA
from chainer_chemistry.dataset.preprocessors.cgcnn_preprocessor import CGCNNPreprocessor # NOQA

preprocess_method_dict = {
    'ecfp': ECFPPreprocessor,
@@ -39,4 +41,6 @@
    'relgcn_sparse': RelGCNSparsePreprocessor,
    'gin_sparse': GINSparsePreprocessor,
    'gnnfilm': GNNFiLMPreprocessor,
    'megnet': MEGNetPreprocessor,
    'cgcnn': CGCNNPreprocessor
}
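
A short, hypothetical sketch of how the preprocessor and converter registries are meant to be paired; the 'cgcnn' choice is only an example, and instantiating CGCNNPreprocessor downloads atom_init.json on first use:

from chainer_chemistry.dataset.converters import converter_method_dict
from chainer_chemistry.dataset.preprocessors import preprocess_method_dict

method = 'cgcnn'
preprocessor = preprocess_method_dict[method]()  # CGCNNPreprocessor(); fetches atom_init.json if needed
converter = converter_method_dict[method]        # cgcnn_converter, later passed to an iterator/trainer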
59 changes: 59 additions & 0 deletions chainer_chemistry/dataset/preprocessors/cgcnn_preprocessor.py
@@ -0,0 +1,59 @@
from logging import getLogger
import numpy
import os
import shutil

from chainer.dataset import download

from chainer_chemistry.dataset.utils import GaussianDistance
from chainer_chemistry.dataset.preprocessors.mol_preprocessor import MolPreprocessor # NOQA
from chainer_chemistry.utils import load_json

download_url = 'https://raw.githubusercontent.com/txie-93/cgcnn/master/data/sample-regression/atom_init.json' # NOQA
file_name_atom_init_json = 'atom_init.json'

_root = 'pfnet/chainer/cgcnn'


def get_atom_init_json_filepath(download_if_not_exist=True):
"""Construct a filepath which stores atom_init_json
This method check whether the file exist or not, and downloaded it if
necessary.
Args:
download_if_not_exist (bool): If `True` download dataset
if it is not downloaded yet.
Returns (str): file path for atom_init_json
"""
cache_root = download.get_dataset_directory(_root)
cache_path = os.path.join(cache_root, file_name_atom_init_json)
if not os.path.exists(cache_path) and download_if_not_exist:
logger = getLogger(__name__)
logger.info('Downloading atom_init.json...')
download_file_path = download.cached_download(download_url)
shutil.copy(download_file_path, cache_path)
return cache_path


class CGCNNPreprocessor(MolPreprocessor):
"""CGCNNPreprocessor
Args:
For Molecule: TODO
"""

def __init__(self, max_num_nbr=12, max_radius=8, expand_dim=40):
super(CGCNNPreprocessor, self).__init__()

self.max_num_nbr = max_num_nbr
self.max_radius = max_radius
self.gdf = GaussianDistance(centers=numpy.linspace(0, 8, expand_dim))
feat_dict = load_json(get_atom_init_json_filepath())
self.atom_features = {int(key): numpy.array(value,
dtype=numpy.float32)
for key, value in feat_dict.items()}

def get_input_features(self, mol):
raise NotImplementedError()
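
The remainder of this file is not shown in this view. As a hedged sketch of the helper that is shown, the snippet below fetches the cached atom_init.json and reads one entry; it assumes the upstream file keys elements by atomic number, e.g. '6' for carbon:

from chainer_chemistry.dataset.preprocessors.cgcnn_preprocessor import get_atom_init_json_filepath
from chainer_chemistry.utils import load_json

path = get_atom_init_json_filepath()  # downloads the file on the first call
feat_dict = load_json(path)
carbon_feat = feat_dict['6']          # JSON keys are atomic numbers as strings
print(len(carbon_feat))               # dimensionality of the fixed atom embedding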

