From 76d0c34613c815de4a72faf212bb347519ab2413 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Fri, 15 Dec 2017 23:21:18 +0800 Subject: [PATCH 01/18] fix version --- python-package/setup_pip.py | 2 +- python-package/xlearn/VERSION | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/setup_pip.py b/python-package/setup_pip.py index 12380724..9719771c 100644 --- a/python-package/setup_pip.py +++ b/python-package/setup_pip.py @@ -81,7 +81,7 @@ def run(self): if __name__ == "__main__": setup(name='xlearn', - version="0.20.a1", + version=open(os.path.join(CURRENT_DIR, 'xlearn/VERSION')).read().strip(), description="xLearn Python Package", maintainer='Chao Ma', maintainer_email='mctt90@gmail.com', diff --git a/python-package/xlearn/VERSION b/python-package/xlearn/VERSION index 6e8bf73a..55862aea 100644 --- a/python-package/xlearn/VERSION +++ b/python-package/xlearn/VERSION @@ -1 +1 @@ -0.1.0 +0.20a1 From 4800db3ae683b9f3e4cae7aa2a52e76a4825bfdb Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Tue, 19 Dec 2017 13:33:02 +0800 Subject: [PATCH 02/18] fix version --- python-package/xlearn/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xlearn/VERSION b/python-package/xlearn/VERSION index 6e8bf73a..55862aea 100644 --- a/python-package/xlearn/VERSION +++ b/python-package/xlearn/VERSION @@ -1 +1 @@ -0.1.0 +0.20a1 From 85ef0dfd82ceb58a035ee01eee4eefc99d5dece2 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Sat, 23 Dec 2017 18:13:09 +0800 Subject: [PATCH 03/18] add git ignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 7f325e52..d0c505db 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,7 @@ build/* .Rhistory *.tar.gz xlearn/* + +# Python-package +*.egg-info +*.pyc From 6235f70a64e48cd75dbed2e1ea6949f78a7e6cc8 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Sat, 23 Dec 2017 18:14:46 +0800 Subject: [PATCH 04/18] c_api wrapper --- 
src/c_api/c_api.cc | 47 ++++++++++++++++++++++++++++++++++++++++++--- src/c_api/c_api.h | 19 ++++++++++++++++++ src/reader/reader.h | 3 +++ 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index c27ac2ef..aeba41be 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -20,15 +20,17 @@ Author: Chao Ma (mctt90@gmail.com) This file is the implementation of C API for xLearn. */ -#include +#include #include +#include #include -#include "src/c_api/c_api.h" -#include "src/c_api/c_api_error.h" #include "src/base/format_print.h" #include "src/base/timer.h" +#include "src/c_api/c_api.h" +#include "src/c_api/c_api_error.h" +#include "src/data/data_structure.h" // Say hello to user XL_DLL int XLearnHello() { @@ -237,3 +239,42 @@ XL_DLL int XLearnSetBool(XL *out, const char *key, const bool value) { } API_END(); } + +XL_DLL int XLDMatrixCreateFromFile(const char *fname, + int silent, + XL* out) { + API_BEGIN(); + API_END(); +} + +XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, + const unsigned* indices, + const real_t* data, + size_t nindptr, + size_t nelem, + size_t num_col, + XL* out) { + API_BEGIN(); + xLearn::DMatrix* mat = new xLearn::DMatrix(); + mat->row_length = nindptr; + mat->row.reserve(nindptr - 1); + for (size_t i = 1; i < nindptr; ++ i) { + xLearn::SparseRow* row = new xLearn::SparseRow(); + row->reserve(indptr[i] - indptr[i - 1]); + for (size_t j = indptr[i - 1]; j < indptr[i]; ++ j) { + if (!std::isnan(data[j])) { + row->emplace_back(xLearn::Node(0, indices[j], data[j])); + } + } + mat->row.push_back(row); + } + *out = mat; + API_END(); +} + +XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, + const unsigned* indices, + const real_t* data, + size_t nindptr, + size_t nelem, + XL* out); \ No newline at end of file diff --git a/src/c_api/c_api.h b/src/c_api/c_api.h index a472c964..fc859b5c 100644 --- a/src/c_api/c_api.h +++ b/src/c_api/c_api.h @@ -111,4 +111,23 @@ class XLearn { 
DISALLOW_COPY_AND_ASSIGN(XLearn); }; +XL_DLL int XLDMatrixCreateFromFile(const char *fname, + int silent, + XL* out); + +XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, + const unsigned* indices, + const real_t* data, + size_t nindptr, + size_t nelem, + size_t num_col, + XL* out); + +XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, + const unsigned* indices, + const real_t* data, + size_t nindptr, + size_t nelem, + XL* out); + #endif // XLEARN_C_API_C_API_H_ \ No newline at end of file diff --git a/src/reader/reader.h b/src/reader/reader.h index a2a95be5..23856d1c 100644 --- a/src/reader/reader.h +++ b/src/reader/reader.h @@ -152,6 +152,9 @@ class InmemReader : public Reader { // Pre-load all the data into memory buffer. virtual void Initialize(const std::string& filename); + // Initialized from DMatrix + virtual void Initialize(const DMatrix* const matrix); + // Sample data from the memory buffer. virtual index_t Samples(DMatrix* &matrix); From efa31373424ed17afd834a57e6882f4ad27c437f Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Sun, 24 Dec 2017 16:42:33 +0800 Subject: [PATCH 05/18] fix c_api --- python-package/xlearn/compat.py | 14 +++++++++++++ python-package/xlearn/core.py | 37 +++++++++++++++++++++++++++++++++ src/c_api/c_api.cc | 11 +++++++++- src/c_api/c_api.h | 3 +++ src/data/data_structure.h | 15 +++++++++++++ 5 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 python-package/xlearn/compat.py create mode 100644 python-package/xlearn/core.py diff --git a/python-package/xlearn/compat.py b/python-package/xlearn/compat.py new file mode 100644 index 00000000..070d7372 --- /dev/null +++ b/python-package/xlearn/compat.py @@ -0,0 +1,14 @@ +# coding: utf-8 + +from __future__ import absolute_import + +import sys + +PY3 = (sys.version_info[0] == 3) + +if PY3: + STRING_TYPES = str + +else: + STRING_TYPES = basestring + diff --git a/python-package/xlearn/core.py b/python-package/xlearn/core.py new file mode 100644 index 
00000000..285f7fb9 --- /dev/null +++ b/python-package/xlearn/core.py @@ -0,0 +1,37 @@ +# coding: utf-8 + +from __future__ import absolute_import + +import ctypes +import scipy.sparse + +from .base import _LIB, _check_call, c_str +from .compat import STRING_TYPES + +class DMatrix(object): + """Data Matrix used in xlearn""" + + _feature_names = None + + def __init__(self, data, label=None, silent=None, + feature_names=None): + if data is None: + self.handle = None + return + + if isinstance(data, STRING_TYPES): + self.handle = ctypes.c_void_p() + _check_call(_LIB.XLDMatrixCreateFromFile(c_str(data), + ctypes.c_int(silent), + ctypes.byref(self.handle))) + elif isinstance(data, scipy.sparse.csr_matrix): + self._init_from_csr(data); + elif isinstance(data, scipy.sparse.csc_matrix): + self._init_from_csc(data); + else: + try: + csr = scipy.sparse.csr_matrix(data) + self._init_from_csr(csr) + except: + raise TypeError('can not initialize DMatrix from {}'.format(type(data).__name__)) + diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index aeba41be..a8e0249a 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -277,4 +277,13 @@ XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, const real_t* data, size_t nindptr, size_t nelem, - XL* out); \ No newline at end of file + XL* out); + +XL_DLL int XLDMatrixSetLabel(XL* out, + const real_t* label, + const size_t& len) { + API_BEGIN(); + auto p = reinterpret_cast(out); + p->SetLabel(label, len); + API_END(); +} diff --git a/src/c_api/c_api.h b/src/c_api/c_api.h index fc859b5c..6ab0bec0 100644 --- a/src/c_api/c_api.h +++ b/src/c_api/c_api.h @@ -130,4 +130,7 @@ XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, size_t nelem, XL* out); +XL_DLL int XLDMatrixSetLabel(XL* out, + const real_t* label); + #endif // XLEARN_C_API_C_API_H_ \ No newline at end of file diff --git a/src/data/data_structure.h b/src/data/data_structure.h index 75b3ee9f..75935e6f 100644 --- a/src/data/data_structure.h +++ 
b/src/data/data_structure.h @@ -207,6 +207,21 @@ struct DMatrix { hash_value_2 = hash_2; } + // set label of the matrix. + void SetLabel(const std::vector& label) { + has_label = true; + this->Y = label; + } + + void SetLabel(const real_t* label, + const size_t& len) { + has_label = true; + this->Y.reserve(len); + for (size_t i = 0; i < len; ++ i) { + this->Y.push_back(label[i]); + } + } + // Copy another data matrix to this matrix. // Note that here we do the deep copy and we will // allocate memory if current matrix is empty. From a0a03f8e0b488d74f8f24e8df316b51564f72425 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Tue, 26 Dec 2017 12:09:09 +0800 Subject: [PATCH 06/18] fix c_api --- python-package/xlearn/__init__.py | 1 + python-package/xlearn/core.py | 35 +++++++++++++++++++++++++++++-- src/c_api/c_api.cc | 28 +++++++++++++++++++++++++ src/c_api/c_api.h | 8 +++++++ 4 files changed, 70 insertions(+), 2 deletions(-) diff --git a/python-package/xlearn/__init__.py b/python-package/xlearn/__init__.py index 4867886d..ab04e194 100644 --- a/python-package/xlearn/__init__.py +++ b/python-package/xlearn/__init__.py @@ -3,6 +3,7 @@ from __future__ import absolute_import import os from .xlearn import * +from .core import DMatrix VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION') with open(VERSION_FILE) as f: diff --git a/python-package/xlearn/core.py b/python-package/xlearn/core.py index 285f7fb9..f731833b 100644 --- a/python-package/xlearn/core.py +++ b/python-package/xlearn/core.py @@ -3,17 +3,25 @@ from __future__ import absolute_import import ctypes +import numpy as np import scipy.sparse from .base import _LIB, _check_call, c_str from .compat import STRING_TYPES +def c_array(ctype, values): + """Convert a python string to c array.""" + if isinstance(values, np.ndarray) and values.dtype.itemsize == ctypes.sizeof(ctype): + return (ctype * len(values)).from_buffer_copy(values) + return (ctype * len(values))(*values) + class DMatrix(object): """Data 
Matrix used in xlearn""" _feature_names = None + _field_names = None - def __init__(self, data, label=None, silent=None, + def __init__(self, data, label=None, field=None, silent=None, feature_names=None): if data is None: self.handle = None @@ -34,4 +42,27 @@ def __init__(self, data, label=None, silent=None, self._init_from_csr(csr) except: raise TypeError('can not initialize DMatrix from {}'.format(type(data).__name__)) - + + def _init_from_csr(self, csr): + if len(csr.indices) != len(csr.data): + raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data))) + self.handle = ctypes.c_void_p() + _check_call(_LIB.XLDMatrixCreateFromCSREx(c_array(ctypes.c_size_t, csr.indptr), + c_array(ctypes.c_uint, csr.indices), + c_array(ctypes.c_float, csr.data), + ctypes.c_size_t(len(csr.indptr)), + ctypes.c_size_t(len(csr.data)), + ctypes.c_size_t(csr.shape[1]), + ctypes.byref(self.handle))) + + def _init_from_csc(self, csc): + if len(csc.indices) != len(csc.data): + raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data))) + self.handle = ctypes.c_void_p(); + _check_call(_LIB.XLDMatrixCreateFromCSCEx(c_array(ctypes.c_size_t, csc.indptr), + c_array(ctypes.c_uint, csc.indices), + c_array(ctypes.c_float, csc.data), + ctypes.c_size_t(len(csc.indptr)), + ctypes.c_size_t(len(csc.data)), + ctypes.c_size_t(csc.shape[0]), + ctypes.byref(self.handle))) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index a8e0249a..6b280878 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -279,6 +279,31 @@ XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, size_t nelem, XL* out); +XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, + const unsigned* indices, + const real_t* data, + size_t nindptr, + size_t nelem, + size_t num_row, + XL* out) { + API_BEGIN(); + xLearn::DMatrix* mat = new xLearn::DMatrix(); + mat->row_length = num_row; + mat->row.reserve(num_row); + for (size_t i = 0; i < num_row; ++ i) { + 
mat->row.emplace_back(new xLearn::SparseRow()); + } + for (size_t i = 1; i < nindptr; ++ i) { + for (size_t j = indptr[i - 1]; j < indptr[i]; ++ j) { + if (!std::isnan(data[j])) { + mat->row[indices[j]]->emplace_back(xLearn::Node(0, i - 1, data[j])); + } + } + } + *out = mat; + API_END(); +} + XL_DLL int XLDMatrixSetLabel(XL* out, const real_t* label, const size_t& len) { @@ -287,3 +312,6 @@ XL_DLL int XLDMatrixSetLabel(XL* out, p->SetLabel(label, len); API_END(); } + +// TODO: add the method of setting field +XL_DLL int XLDMatrixSetField(XL* out); diff --git a/src/c_api/c_api.h b/src/c_api/c_api.h index 6ab0bec0..c44fff29 100644 --- a/src/c_api/c_api.h +++ b/src/c_api/c_api.h @@ -130,6 +130,14 @@ XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, size_t nelem, XL* out); +XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, + const unsigned* indices, + const real_t* data, + size_t nindptr, + size_t nelem, + size_t num_row, + XL* out); + XL_DLL int XLDMatrixSetLabel(XL* out, const real_t* label); From dfb9ea455aaabcd0762ecb0942a0f398a708132f Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Tue, 26 Dec 2017 13:51:18 +0800 Subject: [PATCH 07/18] fix c_api --- src/c_api/c_api.cc | 4 +++- src/c_api/c_api.h | 3 ++- src/reader/reader.cc | 12 ++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 6b280878..be0bd5bd 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -272,12 +272,14 @@ XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, API_END(); } +/* XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, const unsigned* indices, const real_t* data, size_t nindptr, size_t nelem, XL* out); + */ XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, const unsigned* indices, @@ -314,4 +316,4 @@ XL_DLL int XLDMatrixSetLabel(XL* out, } // TODO: add the method of setting field -XL_DLL int XLDMatrixSetField(XL* out); +//XL_DLL int XLDMatrixSetField(XL* out); diff --git 
a/src/c_api/c_api.h b/src/c_api/c_api.h index c44fff29..3b867661 100644 --- a/src/c_api/c_api.h +++ b/src/c_api/c_api.h @@ -139,6 +139,7 @@ XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, XL* out); XL_DLL int XLDMatrixSetLabel(XL* out, - const real_t* label); + const real_t* label, + const size_t& len); #endif // XLEARN_C_API_C_API_H_ \ No newline at end of file diff --git a/src/reader/reader.cc b/src/reader/reader.cc index 66769604..8325541d 100644 --- a/src/reader/reader.cc +++ b/src/reader/reader.cc @@ -110,6 +110,18 @@ void InmemReader::Initialize(const std::string& filename) { } } +void InmemReader::Initialize(const DMatrix *const matrix) { + data_buf_ = *matrix; + data_buf_.has_label = matrix->has_label; + // Init data_samples_ + num_samples_ = data_buf_.row_length; + // for shuffle + order_.resize(num_samples_); + for (int i = 0; i < order_.size(); ++i) { + order_[i] = i; + } +} + // Check wheter current path has a binary file. // We use double check here, that is, we first check // the hash value of a small data block, then check the whole file. 
From 7b450ff28f38c908ddf73535a1c9e21d647fc026 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Tue, 26 Dec 2017 23:30:40 +0800 Subject: [PATCH 08/18] fix python core --- python-package/xlearn/core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python-package/xlearn/core.py b/python-package/xlearn/core.py index f731833b..05d864cd 100644 --- a/python-package/xlearn/core.py +++ b/python-package/xlearn/core.py @@ -66,3 +66,7 @@ def _init_from_csc(self, csc): ctypes.c_size_t(len(csc.data)), ctypes.c_size_t(csc.shape[0]), ctypes.byref(self.handle))) + +def train(dmatrix, params={}): + + pass \ No newline at end of file From a9e3805d5d5b82d4aeb5489bc5d220b0a6043c0b Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Tue, 26 Dec 2017 23:36:11 +0800 Subject: [PATCH 09/18] fix pip error --- python-package/setup_pip.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python-package/setup_pip.py b/python-package/setup_pip.py index 9719771c..af89fd86 100644 --- a/python-package/setup_pip.py +++ b/python-package/setup_pip.py @@ -21,6 +21,10 @@ def silent_call(cmd, raise_error=False, error_msg=''): with open(os.devnull, 'w') as shut_up: subprocess.check_output(cmd, stderr=shut_up) return 0 + except OSError: + if raise_error: + raise Exception("open devnull error") + return 1 except Exception: if raise_error: raise Exception(error_msg); From d85c9cb51e37f4b44cbf64e47c5188f1ca26a6a8 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Fri, 5 Jan 2018 22:27:56 +0800 Subject: [PATCH 10/18] fix c_api --- src/c_api/c_api.cc | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index be0bd5bd..82d16032 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -256,17 +256,13 @@ XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, XL* out) { API_BEGIN(); xLearn::DMatrix* mat = new xLearn::DMatrix(); - mat->row_length = nindptr; - mat->row.reserve(nindptr - 1); + mat->ResetMatrix(nindptr, false); for 
(size_t i = 1; i < nindptr; ++ i) { - xLearn::SparseRow* row = new xLearn::SparseRow(); - row->reserve(indptr[i] - indptr[i - 1]); for (size_t j = indptr[i - 1]; j < indptr[i]; ++ j) { if (!std::isnan(data[j])) { - row->emplace_back(xLearn::Node(0, indices[j], data[j])); + mat->AddNode(i, indices[j], data[j]); } } - mat->row.push_back(row); } *out = mat; API_END(); @@ -290,15 +286,11 @@ XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, XL* out) { API_BEGIN(); xLearn::DMatrix* mat = new xLearn::DMatrix(); - mat->row_length = num_row; - mat->row.reserve(num_row); - for (size_t i = 0; i < num_row; ++ i) { - mat->row.emplace_back(new xLearn::SparseRow()); - } + mat->ResetMatrix(num_row, false); for (size_t i = 1; i < nindptr; ++ i) { for (size_t j = indptr[i - 1]; j < indptr[i]; ++ j) { if (!std::isnan(data[j])) { - mat->row[indices[j]]->emplace_back(xLearn::Node(0, i - 1, data[j])); + mat->AddNode(indices[j], i - 1, data[j]); } } } From afe7a1f3b53a5316b7c5d0b4f8ed7d1fae536f58 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Sat, 6 Jan 2018 00:03:52 +0800 Subject: [PATCH 11/18] add features and fields --- python-package/CMakeLists.txt | 10 ++++++++-- python-package/test_python_module.py | 16 ++++++++++++++++ python-package/xlearn/core.py | 19 ++++++++++++------- src/c_api/c_api.cc | 26 ++++++++++++++++++-------- src/c_api/c_api.h | 6 ++++-- src/data/data_structure.h | 1 + 6 files changed, 59 insertions(+), 19 deletions(-) create mode 100755 python-package/test_python_module.py diff --git a/python-package/CMakeLists.txt b/python-package/CMakeLists.txt index 77840ac0..d2745d7a 100644 --- a/python-package/CMakeLists.txt +++ b/python-package/CMakeLists.txt @@ -1,7 +1,11 @@ # Copy Python file FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/xlearn/base.py" DESTINATION ${PROJECT_BINARY_DIR}/python-package/xlearn) -FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/xlearn/libpath.py" +FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/xlearn/core.py" +DESTINATION 
${PROJECT_BINARY_DIR}/python-package/xlearn) +FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/xlearn/compat.py" +DESTINATION ${PROJECT_BINARY_DIR}/python-package/xlearn) +FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/xlearn/libpath.py" DESTINATION ${PROJECT_BINARY_DIR}/python-package/xlearn) FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/xlearn/xlearn.py" DESTINATION ${PROJECT_BINARY_DIR}/python-package/xlearn) @@ -15,5 +19,7 @@ FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/MANIFEST.in" DESTINATION ${PROJECT_BINARY_DIR}/python-package) FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/test_python.py" DESTINATION ${PROJECT_BINARY_DIR}/python-package/test) -FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/install-python.sh" +FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/test_python_module.py" +DESTINATION ${PROJECT_BINARY_DIR}/python-package/) +FILE(COPY "${CMAKE_CURRENT_SOURCE_DIR}/install-python.sh" DESTINATION ${PROJECT_BINARY_DIR}/python-package) diff --git a/python-package/test_python_module.py b/python-package/test_python_module.py new file mode 100755 index 00000000..575328da --- /dev/null +++ b/python-package/test_python_module.py @@ -0,0 +1,16 @@ +#!/usr/bin/python +# coding: utf-8 + +import numpy as np +import xlearn as xl + +from scipy.sparse import csr_matrix + +row = np.array([0, 0, 1, 2, 2, 2]) +col = np.array([0, 2, 2, 0, 1, 2]) +data = np.array([1, 2, 3, 4, 5, 6]) +csr = csr_matrix((data, (row, col)), shape=(3, 3)) + +xl.DMatrix(csr, field=csr) + + diff --git a/python-package/xlearn/core.py b/python-package/xlearn/core.py index 05d864cd..2c8cb905 100644 --- a/python-package/xlearn/core.py +++ b/python-package/xlearn/core.py @@ -33,36 +33,41 @@ def __init__(self, data, label=None, field=None, silent=None, ctypes.c_int(silent), ctypes.byref(self.handle))) elif isinstance(data, scipy.sparse.csr_matrix): - self._init_from_csr(data); + self._init_from_csr(data, field); elif isinstance(data, scipy.sparse.csc_matrix): - self._init_from_csc(data); + self._init_from_csc(data, field); else: try: csr = 
scipy.sparse.csr_matrix(data) - self._init_from_csr(csr) + csr_field = scipy.sparse.csr_matrix(field); + self._init_from_csr(csr, csr_field) except: raise TypeError('can not initialize DMatrix from {}'.format(type(data).__name__)) - def _init_from_csr(self, csr): + def _init_from_csr(self, csr, field): if len(csr.indices) != len(csr.data): raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data))) self.handle = ctypes.c_void_p() _check_call(_LIB.XLDMatrixCreateFromCSREx(c_array(ctypes.c_size_t, csr.indptr), c_array(ctypes.c_uint, csr.indices), c_array(ctypes.c_float, csr.data), - ctypes.c_size_t(len(csr.indptr)), + c_array(ctypes.c_uint, + field.data if field is not None else []), + ctypes.c_size_t(len(csr.indptr) - 1), ctypes.c_size_t(len(csr.data)), ctypes.c_size_t(csr.shape[1]), ctypes.byref(self.handle))) - def _init_from_csc(self, csc): + def _init_from_csc(self, csc, field): if len(csc.indices) != len(csc.data): raise ValueError('length mismatch: {} vs {}'.format(len(csc.indices), len(csc.data))) self.handle = ctypes.c_void_p(); _check_call(_LIB.XLDMatrixCreateFromCSCEx(c_array(ctypes.c_size_t, csc.indptr), c_array(ctypes.c_uint, csc.indices), c_array(ctypes.c_float, csc.data), - ctypes.c_size_t(len(csc.indptr)), + c_array(ctypes.c_uint, + field.data if field is not None else []), + ctypes.c_size_t(len(csc.indptr) - 1), ctypes.c_size_t(len(csc.data)), ctypes.c_size_t(csc.shape[0]), ctypes.byref(self.handle))) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index fcbf5d01..dcfef40d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -253,7 +253,8 @@ XL_DLL int XLDMatrixCreateFromFile(const char *fname, XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, const unsigned* indices, - const real_t* data, + const real_t* features, + const real_t* fields, size_t nindptr, size_t nelem, size_t num_col, @@ -261,10 +262,14 @@ XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, API_BEGIN(); xLearn::DMatrix* mat = new 
xLearn::DMatrix(); mat->ResetMatrix(nindptr, false); - for (size_t i = 1; i < nindptr; ++ i) { + for (size_t i = 1; i <= nindptr; ++ i) { for (size_t j = indptr[i - 1]; j < indptr[i]; ++ j) { - if (!std::isnan(data[j])) { - mat->AddNode(i, indices[j], data[j]); + if (!std::isnan(features[j])) { + if (fields) { + mat->AddNode(i - 1, indices[j], features[j], fields[j]); + } else { + mat->AddNode(i - 1, indices[j], features[j]); + } } } } @@ -283,7 +288,8 @@ XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, const unsigned* indices, - const real_t* data, + const real_t* features, + const real_t* fields, size_t nindptr, size_t nelem, size_t num_row, @@ -291,10 +297,14 @@ XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, API_BEGIN(); xLearn::DMatrix* mat = new xLearn::DMatrix(); mat->ResetMatrix(num_row, false); - for (size_t i = 1; i < nindptr; ++ i) { + for (size_t i = 1; i <= nindptr; ++ i) { for (size_t j = indptr[i - 1]; j < indptr[i]; ++ j) { - if (!std::isnan(data[j])) { - mat->AddNode(indices[j], i - 1, data[j]); + if (!std::isnan(features[j])) { + if (fields) { + mat->AddNode(indices[j], i - 1, features[j], fields[j]); + } else { + mat->AddNode(indices[j], i - 1, features[j]); + } } } } diff --git a/src/c_api/c_api.h b/src/c_api/c_api.h index 3b867661..cc79f201 100644 --- a/src/c_api/c_api.h +++ b/src/c_api/c_api.h @@ -117,7 +117,8 @@ XL_DLL int XLDMatrixCreateFromFile(const char *fname, XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, const unsigned* indices, - const real_t* data, + const real_t* features, + const real_t* fields, size_t nindptr, size_t nelem, size_t num_col, @@ -132,7 +133,8 @@ XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, const unsigned* indices, - const real_t* data, + const real_t* features, + const real_t* fields, size_t nindptr, size_t nelem, size_t num_row, diff --git 
a/src/data/data_structure.h b/src/data/data_structure.h index c27db935..1c610722 100644 --- a/src/data/data_structure.h +++ b/src/data/data_structure.h @@ -23,6 +23,7 @@ This file defines the basic data structures used by xLearn. #ifndef XLEARN_DATA_DATA_STRUCTURE_H_ #define XLEARN_DATA_DATA_STRUCTURE_H_ +#include #include #include #include From d3e67a13e95d9bbeaa6f0ba862cc5680b0ff9067 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Sun, 7 Jan 2018 10:57:02 +0800 Subject: [PATCH 12/18] add PythonReader --- src/base/common.h | 2 +- src/reader/reader.cc | 44 +++++++++++++++++++++++++++++++++++++++-- src/reader/reader.h | 47 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 3 deletions(-) diff --git a/src/base/common.h b/src/base/common.h index 4c502f43..d73c846f 100644 --- a/src/base/common.h +++ b/src/base/common.h @@ -135,7 +135,7 @@ programming convenient. } while (0) //------------------------------------------------------------------------------ -// This marcro is used to disallow copy constructor and assign operator in +// This macro is used to disallow copy constructor and assign operator in // class definition. For more details, please refer to Google coding style // document // [http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml diff --git a/src/reader/reader.cc b/src/reader/reader.cc index 8325541d..0b263f16 100644 --- a/src/reader/reader.cc +++ b/src/reader/reader.cc @@ -110,9 +110,10 @@ void InmemReader::Initialize(const std::string& filename) { } } + void InmemReader::Initialize(const DMatrix *const matrix) { data_buf_ = *matrix; - data_buf_.has_label = matrix->has_label; + has_label_ = data_buf_.has_label; // Init data_samples_ num_samples_ = data_buf_.row_length; // for shuffle @@ -123,7 +124,7 @@ void InmemReader::Initialize(const DMatrix *const matrix) { } // Check wheter current path has a binary file. 
-// We use double check here, that is, we first check +// We use double check here, that is, we first check // the hash value of a small data block, then check the whole file. bool InmemReader::hash_binary(const std::string& filename) { std::string bin_file = filename + ".bin"; @@ -217,6 +218,45 @@ index_t InmemReader::Samples(DMatrix* &matrix) { // Return to the begining of the data buffer. void InmemReader::Reset() { pos_ = 0; } +void PythonReader::Initialize(const DMatrix *const matrix) { + data_buf_ = *matrix; + has_label_ = data_buf_.has_label; + // Init data_samples_ + num_samples_ = data_buf_.row_length; + // for shuffle + order_.resize(num_samples_); + for (int i = 0; i < order_.size(); ++i) { + order_[i] = i; + } +} + +// Smaple data from memory buffer. +index_t PythonReader::Samples(DMatrix* &matrix) { + for (int i = 0; i < num_samples_; ++i) { + if (pos_ >= data_buf_.row_length) { + // End of the data buffer + if (i == 0) { + if (shuffle_) { + random_shuffle(order_.begin(), order_.end()); + } + matrix = nullptr; + return 0; + } + break; + } + // Copy data between different DMatrix. + data_samples_.row[i] = data_buf_.row[order_[pos_]]; + data_samples_.Y[i] = data_buf_.Y[order_[pos_]]; + data_samples_.norm[i] = data_buf_.norm[order_[pos_]]; + pos_++; + } + matrix = &data_samples_; + return num_samples_; +} + +// Return to the begining of the data buffer. +void PythonReader::Reset() { pos_ = 0; } + //------------------------------------------------------------------------------ // Implementation of OndiskReader. 
//------------------------------------------------------------------------------ diff --git a/src/reader/reader.h b/src/reader/reader.h index 23856d1c..223f51d9 100644 --- a/src/reader/reader.h +++ b/src/reader/reader.h @@ -198,6 +198,53 @@ class InmemReader : public Reader { DISALLOW_COPY_AND_ASSIGN(InmemReader); }; +//------------------------------------------------------------------------------ +// PythonReader is used for Python +// Sampling data from memory buffer. +//------------------------------------------------------------------------------ +class PythonReader : public Reader { + public: + // Constructor and Destructor + PythonReader() : pos_(0) { } + ~PythonReader() { } + + // Initialized from DMatrix + virtual void Initialize(const DMatrix* const matrix); + + // Sample data from the memory buffer. + virtual index_t Samples(DMatrix* &matrix); + + // Return to the begining of the data. + virtual void Reset(); + + // Free the memory of data matrix. + virtual void Clear() { + data_buf_.Release(); + } + + // If shuffle data ? + virtual inline void SetShuffle(bool shuffle) { + this->shuffle_ = shuffle; + if (shuffle_ && !order_.empty()) { + random_shuffle(order_.begin(), order_.end()); + } + } + + protected: + /* Reader will load all the data + into this buffer */ + DMatrix data_buf_; + /* Number of record at each samplling */ + index_t num_samples_; + /* Position for samplling */ + index_t pos_; + /* For random shuffle */ + std::vector order_; + + private: + DISALLOW_COPY_AND_ASSIGN(PythonReader); +}; + //------------------------------------------------------------------------------ // Samplling data from disk file. 
// OndiskReader is used to train very big data, which cannot be From 96c06a3305ed56ea13a02182c1990b6109bd5e92 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Sun, 7 Jan 2018 17:58:56 +0800 Subject: [PATCH 13/18] add c_api interface --- python-package/xlearn/core.py | 3 -- python-package/xlearn/xlearn.py | 42 +++++++++++++++++------- src/c_api/c_api.cc | 22 ++++++++++++- src/c_api/c_api.h | 8 +++++ src/data/hyper_parameters.h | 9 ++++++ src/reader/reader.cc | 6 ++++ src/reader/reader.h | 9 ++++++ src/solver/solver.cc | 57 ++++++++++++++++++++++++++++++++- src/solver/solver.h | 2 ++ 9 files changed, 141 insertions(+), 17 deletions(-) diff --git a/python-package/xlearn/core.py b/python-package/xlearn/core.py index 2c8cb905..5cc303dc 100644 --- a/python-package/xlearn/core.py +++ b/python-package/xlearn/core.py @@ -72,6 +72,3 @@ def _init_from_csc(self, csc, field): ctypes.c_size_t(csc.shape[0]), ctypes.byref(self.handle))) -def train(dmatrix, params={}): - - pass \ No newline at end of file diff --git a/python-package/xlearn/xlearn.py b/python-package/xlearn/xlearn.py index 8d146840..29eac7ce 100644 --- a/python-package/xlearn/xlearn.py +++ b/python-package/xlearn/xlearn.py @@ -4,6 +4,8 @@ import ctypes from .base import _LIB, XLearnHandle from .base import _check_call, c_str +from .compat import STRING_TYPES +from .core import DMatrix class XLearn(object): """XLearn is the core interface used by python API.""" @@ -83,27 +85,37 @@ def show(self): """ _check_call(_LIB.XLearnShow(ctypes.byref(self.handle))) - def setTrain(self, train_path): - """Set file path of training data. 
+ def setTrain(self, train_data): + """Set file path of training data / DMatrix of training Parameters ---------- - train_path : str - the path of training data + train_path : str / DMatrix + the path of training data / DMatrix of training """ - _check_call(_LIB.XLearnSetTrain(ctypes.byref(self.handle), c_str(train_path))) + if isinstance(train_data, STRING_TYPES): + _check_call(_LIB.XLearnSetTrain(ctypes.byref(self.handle), c_str(train_data))) + elif isinstance(train_data, DMatrix): + _check_call(_LIB.XLearnSetTrainDMatrix(ctypes.byref(self.handle), ctypes.byref(train_data.handle))) + else: + raise Exception("Unkown Type") - def setTest(self, test_path): - """Set file path of test data. + def setTest(self, test_data): + """Set file path of test data / DMatrix Parameters ---------- test_path : str the path of test data. """ - _check_call(_LIB.XLearnSetTest(ctypes.byref(self.handle), c_str(test_path))) - - def setValidate(self, val_path): + if isinstance(test_data, STRING_TYPES): + _check_call(_LIB.XLearnSetTest(ctypes.byref(self.handle), c_str(test_data))) + elif isinstance(test_data, DMatrix): + _check_call(_LIB.XLearnSetTestDMatrix(ctypes.byref(self.handle), ctypes.byref(test_data.handle))) + else: + raise Exception("Unkown Type") + + def setValidate(self, val_data): """Set file path of validation data. Parameters @@ -111,7 +123,12 @@ def setValidate(self, val_path): val_path : str the path of validation data. 
""" - _check_call(_LIB.XLearnSetValidate(ctypes.byref(self.handle), c_str(val_path))) + if isinstance(val_data, STRING_TYPES): + _check_call(_LIB.XLearnSetValidate(ctypes.byref(self.handle), c_str(val_path))) + elif isinstance(val_data, DMatrix): + _check_call(_LIB.XLearnSetValidateDMatrix(ctypes.byref(self.handle), ctypes.byref(val_data.handle))) + else: + raise Exception("Unkown Type") def setQuiet(self): """Set xlearn to quiet model""" @@ -155,7 +172,7 @@ def setSigmoid(self): _check_call(_LIB.XLearnSetBool(ctypes.byref(self.handle), c_str(key), ctypes.c_bool(True))) - def fit(self, param, model_path): + def fit(self, dmatrix, param, model_path): """Check hyper-parameters, train model, and dump model. Parameters @@ -165,6 +182,7 @@ def fit(self, param, model_path): model_path : str path of model checkpoint. """ + self.setTrain(dmatrix) self._set_Param(param) _check_call(_LIB.XLearnFit(ctypes.byref(self.handle), c_str(model_path))) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index dcfef40d..8af42781 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -30,7 +30,6 @@ This file is the implementation of C API for xLearn. 
#include "src/base/timer.h" #include "src/c_api/c_api.h" #include "src/c_api/c_api_error.h" -#include "src/data/data_structure.h" // Say hello to user XL_DLL int XLearnHello() { @@ -88,6 +87,13 @@ XL_DLL int XLearnSetTrain(XL *out, const char *train_path) { API_END(); } +XL_DLL int XLearnSetTrainDMatrix(XL *out, xLearn::DMatrix* dmatrix) { + API_BEGIN(); + XLearn* xl = reinterpret_cast(*out); + xl->GetHyperParam().train_dmatrix = dmatrix; + API_END(); +} + // Set file path of the test data XL_DLL int XLearnSetTest(XL *out, const char *test_path) { API_BEGIN(); @@ -96,6 +102,13 @@ XL_DLL int XLearnSetTest(XL *out, const char *test_path) { API_END(); } +XL_DLL int XLearnSetTestDMatrix(XL *out, xLearn::DMatrix* dmatrix) { + API_BEGIN(); + XLearn* xl = reinterpret_cast(*out); + xl->GetHyperParam().test_dmatrix = dmatrix; + API_END(); +} + // Set file path of the validation data XL_DLL int XLearnSetValidate(XL *out, const char *val_path) { API_BEGIN(); @@ -104,6 +117,13 @@ XL_DLL int XLearnSetValidate(XL *out, const char *val_path) { API_END(); } +XL_DLL int XLearnSetValidateDMatrix(XL *out, xLearn::DMatrix* dmatrix) { + API_BEGIN(); + XLearn* xl = reinterpret_cast(*out); + xl->GetHyperParam().validate_dmatrix = dmatrix; + API_END(); +} + // Start to train XL_DLL int XLearnFit(XL *out, const char *model_path) { API_BEGIN(); diff --git a/src/c_api/c_api.h b/src/c_api/c_api.h index cc79f201..9ecf2c21 100644 --- a/src/c_api/c_api.h +++ b/src/c_api/c_api.h @@ -26,6 +26,7 @@ to other languages. 
#include "src/base/common.h" #include "src/data/hyper_parameters.h" +#include "src/data/data_structure.h" #include "src/solver/solver.h" #ifdef __cplusplus @@ -61,12 +62,19 @@ XL_DLL int XLearnShow(XL *out); // Set file path of the training data XL_DLL int XLearnSetTrain(XL *out, const char *train_path); +// Set DMatrix of training data +XL_DLL int XLearnSetTrainDMatrix(XL *out, xLearn::DMatrix *dmatrix); + // Set file path of the test data XL_DLL int XLearnSetTest(XL *out, const char *test_path); +XL_DLL int XLearnSetTestDMatrix(XL *out, xLearn::DMatrix *dmatrix); + // Set file path of the validation data XL_DLL int XLearnSetValidate(XL *out, const char *val_path); +XL_DLL int XLearnSetValidateDMatrix(XL *out, xLearn::DMatrix *dmatrix); + // Start to train XL_DLL int XLearnFit(XL *out, const char *model_path); diff --git a/src/data/hyper_parameters.h b/src/data/hyper_parameters.h index b6d7f42f..ba1527fa 100644 --- a/src/data/hyper_parameters.h +++ b/src/data/hyper_parameters.h @@ -57,6 +57,9 @@ struct HyperParam { For now, it can be 'acc', 'prec', 'recall', 'f1', 'mae', 'rmsd', 'mape', or 'none' */ std::string metric = "none"; + /* Reader Type. + For now, it can be 'memory', 'disk', 'python'*/ + std::string reader_type; /* Block size for on-disk training. On default this value will be set to 500 MB */ uint64 block_size = 500; @@ -108,6 +111,12 @@ struct HyperParam { /* Filename for validation set This value can be empty. 
*/ std::string validate_set_file; + /* DMatrix of training dataset from python*/ + DMatrix* train_dmatrix; + /* DMatrix of test dataset from python*/ + DMatrix* test_dmatrix; + /* DMatrix of validation set from python */ + DMatrix* validate_dmatrix; /* Filename of model checkpoint On default, model_file = train_set_file + ".model" */ std::string model_file; diff --git a/src/reader/reader.cc b/src/reader/reader.cc index 0b263f16..eb27cfb1 100644 --- a/src/reader/reader.cc +++ b/src/reader/reader.cc @@ -37,6 +37,7 @@ namespace xLearn { CLASS_REGISTER_IMPLEMENT_REGISTRY(xLearn_reader_registry, Reader); REGISTER_READER("memory", InmemReader); REGISTER_READER("disk", OndiskReader); +REGISTER_READER("python", PythonReader); // Check current file format and // return 'libsvm', 'libffm', or 'csv'. @@ -218,6 +219,11 @@ index_t InmemReader::Samples(DMatrix* &matrix) { // Return to the begining of the data buffer. void InmemReader::Reset() { pos_ = 0; } +void PythonReader::Initialize(const std::string &filename) { + // cause the Initialize is pure abstruct func + CHECK(false); +} + void PythonReader::Initialize(const DMatrix *const matrix) { data_buf_ = *matrix; has_label_ = data_buf_.has_label; diff --git a/src/reader/reader.h b/src/reader/reader.h index 223f51d9..cf06ba1e 100644 --- a/src/reader/reader.h +++ b/src/reader/reader.h @@ -89,6 +89,12 @@ class Reader { // training, and this is good for SGD. virtual void Initialize(const std::string& filename) = 0; + // initialize from DMatrix + // this method should be implemented by PythonReader + virtual void Initialize(const DMatrix* const dmatrix) { + CHECK(false); + } + // Sample data from disk or from memory buffer. // Return the number of record in each samplling. // Samples() will return 0 when reaching end of the data. @@ -208,6 +214,9 @@ class PythonReader : public Reader { PythonReader() : pos_(0) { } ~PythonReader() { } + // Pre-load all the data into memory buffer. 
+ virtual void Initialize(const std::string& filename); + // Initialized from DMatrix virtual void Initialize(const DMatrix* const matrix); diff --git a/src/solver/solver.cc b/src/solver/solver.cc index f3c97a57..8e7c5601 100644 --- a/src/solver/solver.cc +++ b/src/solver/solver.cc @@ -72,7 +72,11 @@ void Solver::print_logo() const { Reader* Solver::create_reader() { Reader* reader; std::string str = hyper_param_.on_disk ? "disk" : "memory"; - reader = CREATE_READER(str.c_str()); + if (!hyper_param_.reader_type.empty()) { + reader = CREATE_READER(hyper_param_.reader_type.c_str()); + } else { + reader = CREATE_READER(str.c_str()); + } if (reader == nullptr) { LOG(FATAL) << "Cannot create reader: " << str; } @@ -186,6 +190,57 @@ void Solver::init_log() { StringPrintf("%s.ERROR", prefix.c_str())); } +// Initialize reader +void Solver::init_reader_by_dmatrix() { + std::vector dmatrix_list; + if (hyper_param_.cross_validation) { + CHECK_GT(hyper_param_.num_folds, 0); + LOG(ERR) << "python interface not support cross validation"; + } + int num_reader{0}; + if (hyper_param_.cross_validation) { + num_reader += hyper_param_.num_folds; + int batch_size = std::ceil( + static_cast(hyper_param_.train_dmatrix->row_length) / hyper_param_.num_folds); + DMatrix batch; + batch.ResetMatrix(batch_size, hyper_param_.train_dmatrix->has_label); + for (int i = 0; i < hyper_param_.num_folds; ++ i) { + size_t real_size = hyper_param_.train_dmatrix->GetMiniBatch(batch_size, batch); + dmatrix_list.emplace_back(batch); + } + } else { + num_reader += 1; + CHECK(hyper_param_.train_dmatrix != nullptr); + dmatrix_list.push_back(*hyper_param_.train_dmatrix); + if (hyper_param_.validate_dmatrix != nullptr) { + num_reader += 1; + dmatrix_list.push_back(*hyper_param_.validate_dmatrix); + } + } + LOG(INFO) << "Number of Readers: " << num_reader; + reader_.resize(num_reader, nullptr); + // Create Reader + for (int i = 0; i < num_reader; ++i) { + reader_[i] = create_reader(); + 
reader_[i]->Initialize(&dmatrix_list[i]); + if (!hyper_param_.on_disk) { + reader_[i]->SetShuffle(true); + } + if (reader_[i] == nullptr) { + print_error( + StringPrintf("Cannot create reader from DMatrix %d", + i) + ); + exit(0); + } + LOG(INFO) << "Init Reader Number: " << i; + } +} + +// Initialize reader +void Solver::init_reader_by_file() { +} + // Initialize training task void Solver::init_train() { /********************************************************* diff --git a/src/solver/solver.h b/src/solver/solver.h index d08cd461..42b52f8c 100644 --- a/src/solver/solver.h +++ b/src/solver/solver.h @@ -116,6 +116,8 @@ class Solver { void init_train(); void init_predict(); void init_log(); + void init_reader_by_dmatrix(); + void init_reader_by_file(); void checker(int argc, char* argv[]); void checker(HyperParam& hyper_param); From d9c9f326b19ad4eff2098e2b6daf40a4f0647dde Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Mon, 8 Jan 2018 08:46:49 +0800 Subject: [PATCH 14/18] add python DMatrix interface with debug info --- python-package/test_python_module.py | 33 +++++++++-- python-package/xlearn/core.py | 2 +- python-package/xlearn/xlearn.py | 5 +- src/c_api/c_api.cc | 23 +++++--- src/c_api/c_api.h | 6 +- src/data/hyper_parameters.h | 6 +- src/reader/reader.cc | 4 ++ src/reader/reader.h | 4 ++ src/solver/checker.cc | 16 +++++- src/solver/inference.cc | 3 +- src/solver/inference.h | 2 +- src/solver/solver.cc | 83 ++++++++++++++++++---------- src/solver/solver.h | 8 +-- 13 files changed, 135 insertions(+), 60 deletions(-) diff --git a/python-package/test_python_module.py b/python-package/test_python_module.py index 575328da..ef15de71 100755 --- a/python-package/test_python_module.py +++ b/python-package/test_python_module.py @@ -1,16 +1,37 @@ #!/usr/bin/python # coding: utf-8 - +# This file test the xlearn python package. +# We create a ffm model for binary classification problem. +# The dataset comes from the criteo CTR. 
+from __future__ import absolute_import import numpy as np import xlearn as xl from scipy.sparse import csr_matrix -row = np.array([0, 0, 1, 2, 2, 2]) -col = np.array([0, 2, 2, 0, 1, 2]) -data = np.array([1, 2, 3, 4, 5, 6]) -csr = csr_matrix((data, (row, col)), shape=(3, 3)) +# Create factorazation machine +ffm_model = xl.create_ffm() + +# Set training data and validation data +dtrain = xl.DMatrix("./small_train.txt") +dtest = xl.DMatrix("./small_test.txt") +#ffm_model.setTrain("./small_train.txt") +ffm_model.setTrain(dtrain) +ffm_model.setValidate(dtest); +#ffm_model.setValidate("./small_test.txt") + +# Set hyper-parameters +param = { 'task':'binary', + 'lr' : 0.2, + 'lambda' : 0.002, + 'metric' : 'acc' } + +# Tarin model +ffm_model.fit(param, "model.out") -xl.DMatrix(csr, field=csr) +# Predict +#ffm_model.setTest("./small_test.txt") +ffm_model.setTest(dtest) +ffm_model.predict("model.out", "output") diff --git a/python-package/xlearn/core.py b/python-package/xlearn/core.py index 5cc303dc..cf202276 100644 --- a/python-package/xlearn/core.py +++ b/python-package/xlearn/core.py @@ -21,7 +21,7 @@ class DMatrix(object): _feature_names = None _field_names = None - def __init__(self, data, label=None, field=None, silent=None, + def __init__(self, data, label=None, field=None, silent=1, feature_names=None): if data is None: self.handle = None diff --git a/python-package/xlearn/xlearn.py b/python-package/xlearn/xlearn.py index 29eac7ce..ccdf59c3 100644 --- a/python-package/xlearn/xlearn.py +++ b/python-package/xlearn/xlearn.py @@ -124,7 +124,7 @@ def setValidate(self, val_data): the path of validation data. 
""" if isinstance(val_data, STRING_TYPES): - _check_call(_LIB.XLearnSetValidate(ctypes.byref(self.handle), c_str(val_path))) + _check_call(_LIB.XLearnSetValidate(ctypes.byref(self.handle), c_str(val_data))) elif isinstance(val_data, DMatrix): _check_call(_LIB.XLearnSetValidateDMatrix(ctypes.byref(self.handle), ctypes.byref(val_data.handle))) else: @@ -172,7 +172,7 @@ def setSigmoid(self): _check_call(_LIB.XLearnSetBool(ctypes.byref(self.handle), c_str(key), ctypes.c_bool(True))) - def fit(self, dmatrix, param, model_path): + def fit(self, param, model_path): """Check hyper-parameters, train model, and dump model. Parameters @@ -182,7 +182,6 @@ def fit(self, dmatrix, param, model_path): model_path : str path of model checkpoint. """ - self.setTrain(dmatrix) self._set_Param(param) _check_call(_LIB.XLearnFit(ctypes.byref(self.handle), c_str(model_path))) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 8af42781..b2c0862a 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -87,10 +87,11 @@ XL_DLL int XLearnSetTrain(XL *out, const char *train_path) { API_END(); } -XL_DLL int XLearnSetTrainDMatrix(XL *out, xLearn::DMatrix* dmatrix) { +XL_DLL int XLearnSetTrainDMatrix(XL *out, XL *dmatrix) { API_BEGIN(); XLearn* xl = reinterpret_cast(*out); - xl->GetHyperParam().train_dmatrix = dmatrix; + xl->GetHyperParam().train_dmatrix = reinterpret_cast(*dmatrix); + xl->GetHyperParam().reader_type = "python"; API_END(); } @@ -102,10 +103,10 @@ XL_DLL int XLearnSetTest(XL *out, const char *test_path) { API_END(); } -XL_DLL int XLearnSetTestDMatrix(XL *out, xLearn::DMatrix* dmatrix) { +XL_DLL int XLearnSetTestDMatrix(XL *out, XL* dmatrix) { API_BEGIN(); XLearn* xl = reinterpret_cast(*out); - xl->GetHyperParam().test_dmatrix = dmatrix; + xl->GetHyperParam().test_dmatrix = reinterpret_cast(*dmatrix); API_END(); } @@ -117,10 +118,10 @@ XL_DLL int XLearnSetValidate(XL *out, const char *val_path) { API_END(); } -XL_DLL int XLearnSetValidateDMatrix(XL *out, xLearn::DMatrix* 
dmatrix) { +XL_DLL int XLearnSetValidateDMatrix(XL *out, XL *dmatrix) { API_BEGIN(); XLearn* xl = reinterpret_cast(*out); - xl->GetHyperParam().validate_dmatrix = dmatrix; + xl->GetHyperParam().validate_dmatrix = reinterpret_cast(*dmatrix); API_END(); } @@ -170,7 +171,7 @@ XL_DLL int XLearnPredict(XL *out, const char *model_path, const char *out_path) xl->GetHyperParam().is_train = false; xl->GetSolver().Initialize(xl->GetHyperParam()); xl->GetSolver().SetPredict(); - xl->GetSolver().StartWork(); + std::vector out = xl->GetSolver().StartWork(); xl->GetSolver().Clear(); print_info( StringPrintf("Total time cost: %.2f (sec)", @@ -268,6 +269,14 @@ XL_DLL int XLDMatrixCreateFromFile(const char *fname, int silent, XL* out) { API_BEGIN(); + xLearn::InmemReader *reader = new xLearn::InmemReader(); + reader->Initialize(fname); + std::cout << reader->GetDataBuf().row_length << std::endl; + xLearn::DMatrix *mat = new xLearn::DMatrix(); + mat->CopyFrom(&reader->GetDataBuf()); + *out = mat; + xLearn::DMatrix *tmp = reinterpret_cast(*out); + std::cout << tmp->row_length << std::endl; API_END(); } diff --git a/src/c_api/c_api.h b/src/c_api/c_api.h index 9ecf2c21..7c4f2c3a 100644 --- a/src/c_api/c_api.h +++ b/src/c_api/c_api.h @@ -63,17 +63,17 @@ XL_DLL int XLearnShow(XL *out); XL_DLL int XLearnSetTrain(XL *out, const char *train_path); // Set DMatrix of training data -XL_DLL int XLearnSetTrainDMatrix(XL *out, xLearn::DMatrix *dmatrix); +XL_DLL int XLearnSetTrainDMatrix(XL *out, XL *dmatrix); // Set file path of the test data XL_DLL int XLearnSetTest(XL *out, const char *test_path); -XL_DLL int XLearnSetTestDMatrix(XL *out, xLearn::DMatrix *dmatrix); +XL_DLL int XLearnSetTestDMatrix(XL *out, XL *dmatrix); // Set file path of the validation data XL_DLL int XLearnSetValidate(XL *out, const char *val_path); -XL_DLL int XLearnSetValidateDMatrix(XL *out, xLearn::DMatrix *dmatrix); +XL_DLL int XLearnSetValidateDMatrix(XL *out, XL *dmatrix); // Start to train XL_DLL int XLearnFit(XL 
*out, const char *model_path); diff --git a/src/data/hyper_parameters.h b/src/data/hyper_parameters.h index ba1527fa..f6ea69ed 100644 --- a/src/data/hyper_parameters.h +++ b/src/data/hyper_parameters.h @@ -112,11 +112,11 @@ struct HyperParam { This value can be empty. */ std::string validate_set_file; /* DMatrix of training dataset from python*/ - DMatrix* train_dmatrix; + DMatrix* train_dmatrix = nullptr; /* DMatrix of test dataset from python*/ - DMatrix* test_dmatrix; + DMatrix* test_dmatrix = nullptr; /* DMatrix of validation set from python */ - DMatrix* validate_dmatrix; + DMatrix* validate_dmatrix = nullptr; /* Filename of model checkpoint On default, model_file = train_set_file + ".model" */ std::string model_file; diff --git a/src/reader/reader.cc b/src/reader/reader.cc index eb27cfb1..e37dec95 100644 --- a/src/reader/reader.cc +++ b/src/reader/reader.cc @@ -229,6 +229,7 @@ void PythonReader::Initialize(const DMatrix *const matrix) { has_label_ = data_buf_.has_label; // Init data_samples_ num_samples_ = data_buf_.row_length; + data_samples_.ResetMatrix(num_samples_, has_label_); // for shuffle order_.resize(num_samples_); for (int i = 0; i < order_.size(); ++i) { @@ -238,6 +239,8 @@ void PythonReader::Initialize(const DMatrix *const matrix) { // Smaple data from memory buffer. 
index_t PythonReader::Samples(DMatrix* &matrix) { + std::cout << "num samples: " << num_samples_ << std::endl; + std::cout << "row_length: " << data_buf_.row_length << " " << pos_ << std::endl; for (int i = 0; i < num_samples_; ++i) { if (pos_ >= data_buf_.row_length) { // End of the data buffer @@ -257,6 +260,7 @@ index_t PythonReader::Samples(DMatrix* &matrix) { pos_++; } matrix = &data_samples_; + std::cout << "finish samples" << std::endl; return num_samples_; } diff --git a/src/reader/reader.h b/src/reader/reader.h index cf06ba1e..607bb1e1 100644 --- a/src/reader/reader.h +++ b/src/reader/reader.h @@ -180,6 +180,10 @@ class InmemReader : public Reader { } } + DMatrix& GetDataBuf() { + return data_buf_; + } + protected: /* Reader will load all the data into this buffer */ diff --git a/src/solver/checker.cc b/src/solver/checker.cc index 2c938f75..7beb3e28 100644 --- a/src/solver/checker.cc +++ b/src/solver/checker.cc @@ -525,13 +525,19 @@ bool Checker::check_train_param(HyperParam& hyper_param) { /********************************************************* * Check file path * *********************************************************/ - if (!FileExist(hyper_param.train_set_file.c_str())) { + if (!FileExist(hyper_param.train_set_file.c_str()) && + hyper_param.reader_type != "python") { print_error( StringPrintf("Training data file: %s does not exist.", hyper_param.train_set_file.c_str()) ); bo = false; } + if (hyper_param.reader_type == "python" && + hyper_param.train_dmatrix == nullptr) { + print_error("Training DMatrix does not exist."); + bo = false; + } if (!hyper_param.validate_set_file.empty() && !FileExist(hyper_param.validate_set_file.c_str())) { print_error( @@ -794,13 +800,19 @@ bool Checker::check_prediction_param(HyperParam& hyper_param) { /********************************************************* * Check the path of test set file * *********************************************************/ - if (!FileExist(hyper_param.test_set_file.c_str())) { + if 
(!FileExist(hyper_param.test_set_file.c_str()) && + hyper_param.reader_type != "python") { print_error( StringPrintf("Test set file: %s does not exist.", hyper_param.test_set_file.c_str()) ); bo = false; } + if (hyper_param.reader_type == "python" && + hyper_param.test_dmatrix == nullptr) { + print_error("Test DMatrix is null"); + bo = false; + } /********************************************************* * Check the path of model file * *********************************************************/ diff --git a/src/solver/inference.cc b/src/solver/inference.cc index eabdce12..ca402bcb 100644 --- a/src/solver/inference.cc +++ b/src/solver/inference.cc @@ -30,7 +30,7 @@ namespace xLearn { // Given a pre-trained model and test data, the predictor // will return the prediction output -void Predictor::Predict() { +std::vector Predictor::Predict() { std::ofstream o_file(out_file_); static std::vector out; DMatrix* matrix = nullptr; @@ -59,6 +59,7 @@ void Predictor::Predict() { loss_->GetLoss()) ); } + return std::vector(out); } // Convert output by using the sigmoid function. 
diff --git a/src/solver/inference.h b/src/solver/inference.h index eb367362..cba76b4c 100644 --- a/src/solver/inference.h +++ b/src/solver/inference.h @@ -63,7 +63,7 @@ class Predictor { } // The core function - void Predict(); + std::vector Predict(); protected: Reader* reader_; diff --git a/src/solver/solver.cc b/src/solver/solver.cc index 8e7c5601..a0173ecf 100644 --- a/src/solver/solver.cc +++ b/src/solver/solver.cc @@ -191,13 +191,13 @@ void Solver::init_log() { } // Initialize reader -void Solver::init_reader_by_dmatrix() { +void Solver::init_reader_by_dmatrix(int &num_reader) { std::vector dmatrix_list; if (hyper_param_.cross_validation) { CHECK_GT(hyper_param_.num_folds, 0); LOG(ERR) << "python interface not support cross validation"; } - int num_reader{0}; + num_reader = 0; if (hyper_param_.cross_validation) { num_reader += hyper_param_.num_folds; int batch_size = std::ceil( @@ -211,8 +211,14 @@ void Solver::init_reader_by_dmatrix() { } else { num_reader += 1; CHECK(hyper_param_.train_dmatrix != nullptr); + std::cout << "check train dmatrix" << std::endl; + std::cout << hyper_param_.train_dmatrix->row_length << std::endl; + DMatrix tmp = *hyper_param_.train_dmatrix; + std::cout << "copy is ok" << std::endl; dmatrix_list.push_back(*hyper_param_.train_dmatrix); + std::cout << "push back ok" << std::endl; if (hyper_param_.validate_dmatrix != nullptr) { + std::cout << "in validate" << std::endl; num_reader += 1; dmatrix_list.push_back(*hyper_param_.validate_dmatrix); } @@ -238,26 +244,7 @@ void Solver::init_reader_by_dmatrix() { } // Initialize reader -void Solver::init_reader_by_file() { -} - -// Initialize training task -void Solver::init_train() { - /********************************************************* - * Initialize thread pool * - *********************************************************/ - size_t threadNumber = std::thread::hardware_concurrency();; - if (hyper_param_.thread_number != 0) { - threadNumber = hyper_param_.thread_number; - } - pool_ = 
new ThreadPool(threadNumber); - /********************************************************* - * Initialize Reader * - *********************************************************/ - Timer timer; - timer.tic(); - print_action("Read Problem ..."); - LOG(INFO) << "Start to init Reader"; +void Solver::init_reader_by_file(int &num_reader) { // Split file if (hyper_param_.cross_validation) { CHECK_GT(hyper_param_.num_folds, 0); @@ -268,7 +255,7 @@ void Solver::init_train() { << " parts."; } // Get the Reader list - int num_reader = 0; + num_reader = 0; std::vector file_list; if (hyper_param_.cross_validation) { num_reader += hyper_param_.num_folds; @@ -304,12 +291,42 @@ void Solver::init_train() { } LOG(INFO) << "Init Reader: " << file_list[i]; } +} + +// Initialize training task +void Solver::init_train() { + /********************************************************* + * Initialize thread pool * + *********************************************************/ + size_t threadNumber = std::thread::hardware_concurrency();; + if (hyper_param_.thread_number != 0) { + threadNumber = hyper_param_.thread_number; + } + pool_ = new ThreadPool(threadNumber); + /********************************************************* + * Initialize Reader * + *********************************************************/ + Timer timer; + timer.tic(); + print_action("Read Problem ..."); + LOG(INFO) << "Start to init Reader"; + int num_reader{0}; + if (hyper_param_.reader_type == "python" && + hyper_param_.train_set_file.empty()) { + std::cout << "using python" << std::endl; + init_reader_by_dmatrix(num_reader); + } else { + init_reader_by_file(num_reader); + } + std::cout << "finish create reader" << std::endl; + /********************************************************* * Read problem * *********************************************************/ DMatrix* matrix = nullptr; index_t max_feat = 0, max_field = 0; for (int i = 0; i < num_reader; ++i) { + std::cout << "start count" << std::endl; 
while(reader_[i]->Samples(matrix)) { int tmp = matrix->MaxFeat(); if (tmp > max_feat) { max_feat = tmp; } @@ -317,6 +334,7 @@ void Solver::init_train() { tmp = matrix->MaxField(); if (tmp > max_field) { max_field = tmp; } } + std::cout << "Counting" << std::endl; } // Return to the begining of target file. reader_[i]->Reset(); @@ -469,8 +487,14 @@ void Solver::init_predict() { timer.tic(); // Create Reader reader_.resize(1, create_reader()); - CHECK_NE(hyper_param_.test_set_file.empty(), true); - reader_[0]->Initialize(hyper_param_.test_set_file); + if (hyper_param_.reader_type == "python" && + hyper_param_.test_set_file.empty()) { + CHECK(hyper_param_.test_dmatrix != nullptr); + reader_[0]->Initialize(hyper_param_.test_dmatrix); + } else { + CHECK_NE(hyper_param_.test_set_file.empty(), true); + reader_[0]->Initialize(hyper_param_.test_set_file); + } reader_[0]->SetShuffle(false); if (reader_[0] == nullptr) { print_info( @@ -502,14 +526,15 @@ void Solver::init_predict() { ******************************************************************************/ // Start training or inference -void Solver::StartWork() { +std::vector Solver::StartWork() { if (hyper_param_.is_train) { LOG(INFO) << "Start training work."; start_train_work(); } else { LOG(INFO) << "Start inference work."; - start_prediction_work(); + return start_prediction_work(); } + return std::vector (); } // Train @@ -583,7 +608,7 @@ void Solver::start_train_work() { } // Inference -void Solver::start_prediction_work() { +std::vector Solver::start_prediction_work() { print_action("Start to predict ..."); Predictor pdc; pdc.Initialize(reader_[0], @@ -593,7 +618,7 @@ void Solver::start_prediction_work() { hyper_param_.sign, hyper_param_.sigmoid); // Predict and write output - pdc.Predict(); + return pdc.Predict(); } /****************************************************************************** diff --git a/src/solver/solver.h b/src/solver/solver.h index 42b52f8c..476155d2 100644 --- a/src/solver/solver.h +++ 
b/src/solver/solver.h @@ -78,7 +78,7 @@ class Solver { void Initialize(HyperParam& hyper_param); // Start a training task or start an inference task. - void StartWork(); + std::vector StartWork(); // Clear the xLearn environment. void Clear(); @@ -116,14 +116,14 @@ class Solver { void init_train(); void init_predict(); void init_log(); - void init_reader_by_dmatrix(); - void init_reader_by_file(); + void init_reader_by_dmatrix(int &num_reader); + void init_reader_by_file(int &num_reader); void checker(int argc, char* argv[]); void checker(HyperParam& hyper_param); // Start function void start_train_work(); - void start_prediction_work(); + std::vector start_prediction_work(); private: DISALLOW_COPY_AND_ASSIGN(Solver); From 4a2f9f78006af7d60d9ae75371c8f06907db6ded Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Mon, 8 Jan 2018 08:53:09 +0800 Subject: [PATCH 15/18] rm debug info --- src/c_api/c_api.cc | 3 --- src/reader/reader.cc | 3 --- src/solver/checker.cc | 5 ++++- src/solver/solver.cc | 9 --------- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index b2c0862a..8f744a45 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -271,12 +271,9 @@ XL_DLL int XLDMatrixCreateFromFile(const char *fname, API_BEGIN(); xLearn::InmemReader *reader = new xLearn::InmemReader(); reader->Initialize(fname); - std::cout << reader->GetDataBuf().row_length << std::endl; xLearn::DMatrix *mat = new xLearn::DMatrix(); mat->CopyFrom(&reader->GetDataBuf()); *out = mat; - xLearn::DMatrix *tmp = reinterpret_cast(*out); - std::cout << tmp->row_length << std::endl; API_END(); } diff --git a/src/reader/reader.cc b/src/reader/reader.cc index e37dec95..d4fc9f60 100644 --- a/src/reader/reader.cc +++ b/src/reader/reader.cc @@ -239,8 +239,6 @@ void PythonReader::Initialize(const DMatrix *const matrix) { // Smaple data from memory buffer. 
index_t PythonReader::Samples(DMatrix* &matrix) { - std::cout << "num samples: " << num_samples_ << std::endl; - std::cout << "row_length: " << data_buf_.row_length << " " << pos_ << std::endl; for (int i = 0; i < num_samples_; ++i) { if (pos_ >= data_buf_.row_length) { // End of the data buffer @@ -260,7 +258,6 @@ index_t PythonReader::Samples(DMatrix* &matrix) { pos_++; } matrix = &data_samples_; - std::cout << "finish samples" << std::endl; return num_samples_; } diff --git a/src/solver/checker.cc b/src/solver/checker.cc index 7beb3e28..70d882ac 100644 --- a/src/solver/checker.cc +++ b/src/solver/checker.cc @@ -659,13 +659,16 @@ void Checker::check_conflict_train(HyperParam& hyper_param) { "xLearn will not dump model checkpoint to disk."); hyper_param.model_file.clear(); } - if (hyper_param.validate_set_file.empty() && hyper_param.early_stop) { + if (hyper_param.validate_set_file.empty() && + hyper_param.validate_dmatrix == nullptr && + hyper_param.early_stop) { print_warning("Validation file not found, xLearn has already " "disable early-stopping."); hyper_param.early_stop = false; } if (hyper_param.metric.compare("none") != 0 && hyper_param.validate_set_file.empty() && + hyper_param.validate_dmatrix == nullptr && !hyper_param.cross_validation) { print_warning( StringPrintf("Validation file not found, xLearn has already " diff --git a/src/solver/solver.cc b/src/solver/solver.cc index a0173ecf..93642f7e 100644 --- a/src/solver/solver.cc +++ b/src/solver/solver.cc @@ -211,14 +211,9 @@ void Solver::init_reader_by_dmatrix(int &num_reader) { } else { num_reader += 1; CHECK(hyper_param_.train_dmatrix != nullptr); - std::cout << "check train dmatrix" << std::endl; - std::cout << hyper_param_.train_dmatrix->row_length << std::endl; DMatrix tmp = *hyper_param_.train_dmatrix; - std::cout << "copy is ok" << std::endl; dmatrix_list.push_back(*hyper_param_.train_dmatrix); - std::cout << "push back ok" << std::endl; if (hyper_param_.validate_dmatrix != nullptr) { - std::cout 
<< "in validate" << std::endl; num_reader += 1; dmatrix_list.push_back(*hyper_param_.validate_dmatrix); } @@ -313,12 +308,10 @@ void Solver::init_train() { int num_reader{0}; if (hyper_param_.reader_type == "python" && hyper_param_.train_set_file.empty()) { - std::cout << "using python" << std::endl; init_reader_by_dmatrix(num_reader); } else { init_reader_by_file(num_reader); } - std::cout << "finish create reader" << std::endl; /********************************************************* * Read problem * @@ -326,7 +319,6 @@ void Solver::init_train() { DMatrix* matrix = nullptr; index_t max_feat = 0, max_field = 0; for (int i = 0; i < num_reader; ++i) { - std::cout << "start count" << std::endl; while(reader_[i]->Samples(matrix)) { int tmp = matrix->MaxFeat(); if (tmp > max_feat) { max_feat = tmp; } @@ -334,7 +326,6 @@ void Solver::init_train() { tmp = matrix->MaxField(); if (tmp > max_field) { max_field = tmp; } } - std::cout << "Counting" << std::endl; } // Return to the begining of target file. 
reader_[i]->Reset(); From e35df4429fda37b17f6129d4905a09237c369e96 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Mon, 8 Jan 2018 09:57:25 +0800 Subject: [PATCH 16/18] test python sparse interface --- python-package/test_python_module.py | 23 +++++++++++++++++------ python-package/xlearn/core.py | 6 ++++++ src/c_api/c_api.cc | 16 +++++++++++----- src/c_api/c_api.h | 4 +++- src/data/data_structure.h | 9 --------- 5 files changed, 37 insertions(+), 21 deletions(-) diff --git a/python-package/test_python_module.py b/python-package/test_python_module.py index ef15de71..43a851d4 100755 --- a/python-package/test_python_module.py +++ b/python-package/test_python_module.py @@ -8,6 +8,23 @@ import xlearn as xl from scipy.sparse import csr_matrix +from sklearn.datasets import load_svmlight_file + +# Set hyper-parameters +param = { 'task':'binary', + 'lr' : 0.2, + 'lambda' : 0.002, + 'metric' : 'acc' } + +X, Y = load_svmlight_file("./test_dmatrix.txt") +print(type(X), type(Y)) +print(Y.dtype) +tmp_dmatrix = xl.DMatrix(X, Y) +fm_model = xl.create_fm() +fm_model.setTrain(tmp_dmatrix) +fm_model.setValidate(tmp_dmatrix) +fm_model.fit(param, "fm_model.out") + # Create factorazation machine ffm_model = xl.create_ffm() @@ -20,12 +37,6 @@ ffm_model.setValidate(dtest); #ffm_model.setValidate("./small_test.txt") -# Set hyper-parameters -param = { 'task':'binary', - 'lr' : 0.2, - 'lambda' : 0.002, - 'metric' : 'acc' } - # Tarin model ffm_model.fit(param, "model.out") diff --git a/python-package/xlearn/core.py b/python-package/xlearn/core.py index cf202276..6235f51c 100644 --- a/python-package/xlearn/core.py +++ b/python-package/xlearn/core.py @@ -43,6 +43,10 @@ def __init__(self, data, label=None, field=None, silent=1, self._init_from_csr(csr, csr_field) except: raise TypeError('can not initialize DMatrix from {}'.format(type(data).__name__)) + if label is not None: + _check_call(_LIB.XLDMatrixSetLabel(ctypes.byref(self.handle), + c_array(ctypes.c_float, label), + 
ctypes.c_size_t(len(label)))) def _init_from_csr(self, csr, field): if len(csr.indices) != len(csr.data): @@ -56,6 +60,7 @@ def _init_from_csr(self, csr, field): ctypes.c_size_t(len(csr.indptr) - 1), ctypes.c_size_t(len(csr.data)), ctypes.c_size_t(csr.shape[1]), + ctypes.c_bool(field is not None), ctypes.byref(self.handle))) def _init_from_csc(self, csc, field): @@ -70,5 +75,6 @@ def _init_from_csc(self, csc, field): ctypes.c_size_t(len(csc.indptr) - 1), ctypes.c_size_t(len(csc.data)), ctypes.c_size_t(csc.shape[0]), + ctypes.c_bool(field is not None), ctypes.byref(self.handle))) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 8f744a45..eaeb080d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -284,6 +284,7 @@ XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, size_t nindptr, size_t nelem, size_t num_col, + bool have_field, XL* out) { API_BEGIN(); xLearn::DMatrix* mat = new xLearn::DMatrix(); @@ -291,7 +292,7 @@ XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, for (size_t i = 1; i <= nindptr; ++ i) { for (size_t j = indptr[i - 1]; j < indptr[i]; ++ j) { if (!std::isnan(features[j])) { - if (fields) { + if (have_field) { mat->AddNode(i - 1, indices[j], features[j], fields[j]); } else { mat->AddNode(i - 1, indices[j], features[j]); @@ -319,6 +320,7 @@ XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, size_t nindptr, size_t nelem, size_t num_row, + bool have_field, XL* out) { API_BEGIN(); xLearn::DMatrix* mat = new xLearn::DMatrix(); @@ -326,7 +328,7 @@ XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, for (size_t i = 1; i <= nindptr; ++ i) { for (size_t j = indptr[i - 1]; j < indptr[i]; ++ j) { if (!std::isnan(features[j])) { - if (fields) { + if (have_field) { mat->AddNode(indices[j], i - 1, features[j], fields[j]); } else { mat->AddNode(indices[j], i - 1, features[j]); @@ -340,10 +342,14 @@ XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, XL_DLL int XLDMatrixSetLabel(XL* out, const real_t* label, - const 
size_t& len) { + const size_t len) { API_BEGIN(); - auto p = reinterpret_cast<xLearn::DMatrix*>(out); - p->SetLabel(label, len); + xLearn::DMatrix *p = reinterpret_cast<xLearn::DMatrix*>(*out); + std::vector<real_t> Y(len); + for (size_t i = 0; i < len; ++ i) { + Y[i] = label[i]; + } + p->SetLabel(Y); API_END(); } diff --git a/src/c_api/c_api.h b/src/c_api/c_api.h index 7c4f2c3a..87ca4345 100644 --- a/src/c_api/c_api.h +++ b/src/c_api/c_api.h @@ -130,6 +130,7 @@ XL_DLL int XLDMatrixCreateFromCSREx(const size_t* indptr, size_t nindptr, size_t nelem, size_t num_col, + bool have_field, XL* out); XL_DLL int XLDMatrixCreateFromCSR(const size_t* indptr, @@ -146,10 +147,11 @@ XL_DLL int XLDMatrixCreateFromCSCEx(const size_t* indptr, size_t nindptr, size_t nelem, size_t num_row, + bool have_field, XL* out); XL_DLL int XLDMatrixSetLabel(XL* out, const real_t* label, - const size_t& len); + const size_t len); #endif // XLEARN_C_API_C_API_H_ \ No newline at end of file diff --git a/src/data/data_structure.h b/src/data/data_structure.h index 1c610722..61b81c45 100644 --- a/src/data/data_structure.h +++ b/src/data/data_structure.h @@ -224,15 +224,6 @@ struct DMatrix { this->Y = label; } - void SetLabel(const real_t* label, - const size_t& len) { - has_label = true; - this->Y.reserve(len); - for (size_t i = 0; i < len; ++ i) { - this->Y.push_back(label[i]); - } - } - // Copy another data matrix to this matrix. // Note that here we do the deep copy and we will // allocate memory if current matrix is empty. 
From 35ceeb7bb86bd8991ddd30779fc92781ecee4f67 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Wed, 24 Jan 2018 14:44:17 +0800 Subject: [PATCH 17/18] fix field None --- python-package/xlearn/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xlearn/core.py b/python-package/xlearn/core.py index 6235f51c..7f1b73ba 100644 --- a/python-package/xlearn/core.py +++ b/python-package/xlearn/core.py @@ -39,7 +39,7 @@ def __init__(self, data, label=None, field=None, silent=1, else: try: csr = scipy.sparse.csr_matrix(data) - csr_field = scipy.sparse.csr_matrix(field); + csr_field = scipy.sparse.csr_matrix(field) if field is not None else None; self._init_from_csr(csr, csr_field) except: raise TypeError('can not initialize DMatrix from {}'.format(type(data).__name__)) From 971bf9f74b74ae74a8a1dcdc9c3ae7aba4f67640 Mon Sep 17 00:00:00 2001 From: peikai zheng Date: Thu, 25 Jan 2018 22:15:49 +0800 Subject: [PATCH 18/18] add pandas test --- python-package/test_python_module.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python-package/test_python_module.py b/python-package/test_python_module.py index 43a851d4..4f30b6e7 100755 --- a/python-package/test_python_module.py +++ b/python-package/test_python_module.py @@ -5,6 +5,7 @@ # The dataset comes from the criteo CTR. from __future__ import absolute_import import numpy as np +import pandas as pd import xlearn as xl from scipy.sparse import csr_matrix @@ -25,6 +26,14 @@ fm_model.setValidate(tmp_dmatrix) fm_model.fit(param, "fm_model.out") +# Test Pandas +df_x = pd.DataFrame(X.todense()) +tmp_dmatrix = xl.DMatrix(df_x, Y) +fm_model = xl.create_fm() +fm_model.setTrain(tmp_dmatrix) +fm_model.setValidate(tmp_dmatrix) +fm_model.fit(param, "fm_model.out") + # Create factorazation machine ffm_model = xl.create_ffm()