diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 26ae7568..6e3ad036 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.0.4 +current_version = 3.0.0rc1 commit = True tag = False diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2fa045a5..ca005f9e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,8 +1,74 @@ .. _changelog: + Changelog + ========= +3.0.0rc1 (2025-06-02) +--------------------- + +This is the pre-release of a major release introducing array ingestion and some +minor breaking changes. + +Features +~~~~~~~~ +* Array Data Type Support. Adds native support for NumPy arrays + (currently only for ``np.float64`` element type and up to 32 dimensions). + +.. code-block:: python + + import numpy as np + + # Create 2D numpy array + array_2d = np.array([ + [1.1, 2.2, 3.3], + [4.4, 5.5, 6.6]], dtype=np.float64) + + sender.row( + 'table', + columns={'array_2d': array_2d}, + at=timestamp) + +* Implements binary protocol for columns of ``float`` (double-precision) and + ``numpy.ndarray[np.float64]``, with performance improvements for these + two datatypes. + +Breaking Changes +~~~~~~~~~~~~~~~~ +* Buffer Constructor Changes. The ``Buffer`` constructor now requires the ``protocol_version`` parameter. + You can create buffer through the sender for automatic ``protocol_version`` management: + +.. code-block:: python + + buf = sender.new_buffer() # protocol_version determined automatically + buf.row( + 'table', + columns={'arr': np.array([1.5, 3.0], dtype=np.float64)}, + at=timestamp) + +* To access the raw payload, call ``bytes(sender)`` or ``bytes(buffer)`` ( + rather than calling the ``str`` function on the same objects as in version + 2.x.x of the questdb library) method. + +* **NumPy Dependency** + + Array functionality mandates NumPy installation. + +* **Sender/Buffer String Conversion Removal** + + The legacy string conversion via `str(sender)` is removed. 
+ Access raw binary payloads through the `bytes(sender)` method: + + .. code-block:: python + + # for debugging + payload = bytes(sender) + +* Python 3.8 support is dropped. + + The minimum supported Python version is now 3.9. + 2.0.4 (2025-04-02) ------------------ diff --git a/README.rst b/README.rst index 336d8a1b..04923f6f 100644 --- a/README.rst +++ b/README.rst @@ -5,7 +5,7 @@ QuestDB Client Library for Python This is the official Python client library for `QuestDB `_. This client library implements QuestDB's variant of the -`InfluxDB Line Protocol `_ +`Ingestion Line Protocol `_ (ILP) over HTTP and TCP. ILP provides the fastest way to insert data into QuestDB. @@ -15,16 +15,29 @@ This implementation supports `authentication and full-connection encryption with `TLS `_. -Quickstart -========== +Install +======= -The latest version of the library is 2.0.3 (`changelog `_). +The latest *stable* version of the library is **2.0.4** (`changelog `_). :: python3 -m pip install -U questdb[dataframe] -Please start by `setting up QuestDB `_ . Once set up, you can use this library to insert data. + +The latest *pre-release* version of the library is **3.0.0rc1** (`changelog `_). +This release supports NumPy float64 arrays which are transmitted over a new +protocol version supported by QuestDB 8.4.0 or later. + +:: + + python3 -m pip install --pre -U questdb[dataframe] + +Quickstart +========== + +Start by `setting up QuestDB `_ . +Once set up, you can use this library to insert data. The most common way to insert data is from a Pandas dataframe. @@ -38,6 +51,13 @@ The most common way to insert data is from a Pandas dataframe. 'side': pd.Categorical(['sell', 'sell']), 'price': [2615.54, 39269.98], 'amount': [0.00044, 0.001], + + # NumPy float64 arrays are supported from v3.0.0rc1 onwards. 
+ 'ord_book_bids': [ + np.array([2615.54, 2618.63]), + np.array([39269.98, 39270.00]) + ], + 'timestamp': pd.to_datetime(['2021-01-01', '2021-01-02'])}) conf = f'http::addr=localhost:9000;' @@ -57,7 +77,13 @@ You can also send individual rows. This only requires a more minimal installatio sender.row( 'trades', symbols={'symbol': 'ETH-USD', 'side': 'sell'}, - columns={'price': 2615.54, 'amount': 0.00044}, + columns={ + 'price': 2615.54, + 'amount': 0.00044, + + # NumPy float64 arrays are supported from v3.0.0rc1 onwards. + 'ord_book_bids': np.array([2615.54, 2618.63]), + }, at=TimestampNanos.now()) sender.flush() diff --git a/RELEASING.rst b/RELEASING.rst index d479afed..68c77e69 100644 --- a/RELEASING.rst +++ b/RELEASING.rst @@ -93,10 +93,9 @@ Inside the VM, open a terminal (or use the terminal Window in VSCode) and run th rustup update stable - /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 \ - -m pip install -U pip - /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 \ - -m pip install -U setuptools wheel twine Cython cibuildwheel pandas numpy pyarrow + python3 -m pip install -U pip + python3 -m pip install -U \ + setuptools wheel twine Cython cibuildwheel pandas numpy pyarrow Smoke-testing the build ----------------------- diff --git a/c-questdb-client b/c-questdb-client index 242c1f3c..fd24e025 160000 --- a/c-questdb-client +++ b/c-questdb-client @@ -1 +1 @@ -Subproject commit 242c1f3c6a830ce28ca515168bc90306c9c96ab4 +Subproject commit fd24e0258f6b86a457037013cc42459e5bb9475b diff --git a/ci/cibuildwheel.yaml b/ci/cibuildwheel.yaml index 6e5b156c..fdbb5850 100644 --- a/ci/cibuildwheel.yaml +++ b/ci/cibuildwheel.yaml @@ -52,7 +52,6 @@ stages: - job: linux_arm64 pool: name: "arm64-clients" - vmImage: demands: - Agent.Name -equals arm64-clients-$(Build.BuildId) dependsOn: @@ -92,23 +91,6 @@ stages: - task: PublishBuildArtifacts@1 inputs: {pathtoPublish: 'wheelhouse'} - - job: linux_x64_cpython_manylinux_i686 - pool: {vmImage: 
'ubuntu-latest'} - timeoutInMinutes: 90 - steps: - - task: UsePythonVersion@0 - - bash: | - set -o errexit - python3 -m pip install --upgrade pip - python3 -m pip install cibuildwheel - displayName: Install dependencies - - bash: cibuildwheel --output-dir wheelhouse . - displayName: Build wheels - env: - CIBW_BUILD: cp*-manylinux_i686 - - task: PublishBuildArtifacts@1 - inputs: {pathtoPublish: 'wheelhouse'} - - job: linux_x64_cpython_musllinux pool: {vmImage: 'ubuntu-latest'} timeoutInMinutes: 90 @@ -168,9 +150,22 @@ stages: python3 -m pip install --upgrade pip python3 -m pip install cibuildwheel displayName: Install dependencies - - bash: cibuildwheel --output-dir wheelhouse . + - powershell: | + $vsPath = Resolve-Path "C:\Program Files (x86)\Microsoft Visual Studio\2019\*\VC\Auxiliary\Build\vcvars32.bat" + cmd /c "call `"$vsPath`" && set > env_vars.txt" + + Get-Content env_vars.txt | ForEach-Object { + if ($_ -match "^([^=]+?)=(.*)$" -and $matches[1] -notmatch '^(SYSTEM|AGENT|BUILD|RELEASE|VSTS|TASK|USE_|FAIL_|MSDEPLOY|AZP_75787|AZP_AGENT|AZP_ENABLE|AZURE_HTTP|COPYFILESOVERSSHV0|ENABLE_ISSUE_SOURCE_VALIDATION|MODIFY_NUMBER_OF_RETRIES_IN_ROBOCOPY|MSBUILDHELPERS_ENABLE_TELEMETRY|RETIRE_AZURERM_POWERSHELL_MODULE|ROSETTA2_WARNING|AZP_PS_ENABLE)') { + [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], "Process") + Write-Host "##vso[task.setvariable variable=$($matches[1])]$($matches[2])" + } + } + + where.exe cl.exe + cibuildwheel --output-dir wheelhouse . displayName: Build wheels env: + CIBW_ENVIRONMENT: "MSSdk=1 DISTUTILS_USE_SDK=1 SETUP_DO_GIT_SUBMODULE_INIT=1" CIBW_BUILD: "*win32*" - task: PublishBuildArtifacts@1 inputs: {pathtoPublish: 'wheelhouse'} @@ -185,9 +180,24 @@ stages: python3 -m pip install --upgrade pip python3 -m pip install cibuildwheel displayName: Install dependencies - - bash: cibuildwheel --output-dir wheelhouse . 
+ - powershell: | + $vsPath = Resolve-Path "C:\Program Files (x86)\Microsoft Visual Studio\2019\*\VC\Auxiliary\Build\vcvars64.bat" + cmd /c "call `"$vsPath`" && set > env_vars.txt" + + Get-Content env_vars.txt | ForEach-Object { + if ($_ -match "^([^=]+?)=(.*)$" -and $matches[1] -notmatch '^(SYSTEM|AGENT|BUILD|RELEASE|VSTS|TASK|USE_|FAIL_|MSDEPLOY|AZP_75787|AZP_AGENT|AZP_ENABLE|AZURE_HTTP|COPYFILESOVERSSHV0|ENABLE_ISSUE_SOURCE_VALIDATION|MODIFY_NUMBER_OF_RETRIES_IN_ROBOCOPY|MSBUILDHELPERS_ENABLE_TELEMETRY|RETIRE_AZURERM_POWERSHELL_MODULE|ROSETTA2_WARNING|AZP_PS_ENABLE)') { + [System.Environment]::SetEnvironmentVariable($matches[1], $matches[2], "Process") + Write-Host "##vso[task.setvariable variable=$($matches[1])]$($matches[2])" + } + } + + where.exe cl.exe + cibuildwheel --output-dir wheelhouse . displayName: Build wheels env: CIBW_BUILD: "*win_amd64*" + CIBW_ENVIRONMENT: "MSSdk=1 DISTUTILS_USE_SDK=1 SETUP_DO_GIT_SUBMODULE_INIT=1" + CIBW_BUILD_VERBOSITY: "3" + DISTUTILS_DEBUG: "1" - task: PublishBuildArtifacts@1 inputs: {pathtoPublish: 'wheelhouse'} diff --git a/ci/pip_install_deps.py b/ci/pip_install_deps.py index 53e2391d..91862c7e 100644 --- a/ci/pip_install_deps.py +++ b/ci/pip_install_deps.py @@ -66,20 +66,19 @@ def install_old_pandas_and_numpy(args): try_pip_install('numpy<2') def install_new_pandas_and_numpy(): - try_pip_install('pandas') - try_pip_install('numpy') + try_pip_install('pandas>2') + try_pip_install('numpy>2') def main(args): ensure_timezone() pip_install('pip') pip_install('setuptools') - try_pip_install('fastparquet>=2023.10.1') - if args.pandas_version is not None and args.pandas_version != '': install_old_pandas_and_numpy(args) else: install_new_pandas_and_numpy() + try_pip_install('fastparquet>=2023.10.1') try_pip_install('pyarrow') on_linux_is_glibc = ( diff --git a/docs/api.rst b/docs/api.rst index 305dad6f..b3e1f11e 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -9,7 +9,61 @@ questdb.ingress from questdb.ingress import * -.. 
automodule:: questdb.ingress +.. autoclass:: questdb.ingress.Sender + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.Buffer + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.SenderTransaction + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.IngressError + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.IngressErrorCode + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.Protocol + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.TimestampMicros + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.TimestampNanos + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.TlsCa + :members: + :undoc-members: + :show-inheritance: + +.. autoclass:: questdb.ingress.ServerTimestampType + :members: + :undoc-members: + :show-inheritance: + +.. autodata:: questdb.ingress.ServerTimestamp + :annotation: + :no-value: + +.. 
autoclass:: questdb.ingress.TaggedEnum :members: :undoc-members: :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index edcf176d..1c68b50e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -3,6 +3,8 @@ import os +from questdb.ingress import * + extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', @@ -12,7 +14,8 @@ 'sphinx.ext.ifconfig', 'sphinx.ext.napoleon', 'sphinx.ext.todo', - 'sphinx.ext.viewcode' + 'sphinx.ext.viewcode', + "sphinx.ext.intersphinx", ] source_suffix = '.rst' master_doc = 'index' @@ -20,7 +23,7 @@ year = '2024' author = 'QuestDB' copyright = '{0}, {1}'.format(year, author) -version = release = '2.0.4' +version = release = '3.0.0rc1' github_repo_url = 'https://github.com/questdb/py-questdb-client' @@ -67,6 +70,11 @@ 'undoc-members': True } +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), +} # def do_not_skip_dunder_members(_app, _what, name, _obj, would_skip, _options): # if name in ('__init__', '__call__', '__str__', '__enter__', '__exit__'): diff --git a/docs/conf.rst b/docs/conf.rst index f4e802d7..6b0853b2 100644 --- a/docs/conf.rst +++ b/docs/conf.rst @@ -223,11 +223,30 @@ If you need consistent flushing at specific intervals, you should set ``auto_flush_interval=off`` and implement your own timer-based logic. The :ref:`sender_advanced` documentation should help you. +.. _sender_conf_protocol_version: + +Protocol Version +================ + +Specifies the version of InfluxDB Line Protocol to use. Valid options are: + +* ``1`` - Text-based format compatible with InfluxDB database when used over HTTP. + +* ``2`` - Array support and binary format serialization for 64-bit floats (version specific to QuestDB). + +* ``auto`` (default) - Automatic version selection based on protocol type. 
+ + HTTP/HTTPS: Auto-detects server capability during handshake (supports version negotiation) + + TCP/TCPS: Defaults to version 1 for compatibility + .. _sender_conf_buffer: Buffer ====== +* ``protocol_version`` - ``int (1, 2)``: Buffer protocol version. + * ``init_buf_size`` - ``int > 0``: Initial buffer capacity. Default: 65536 (64KiB). diff --git a/docs/sender.rst b/docs/sender.rst index 3ef45a28..602f1268 100644 --- a/docs/sender.rst +++ b/docs/sender.rst @@ -264,6 +264,44 @@ completely disabled: See the :ref:`sender_conf_auto_flush` section for more details. and note that ``auto_flush_interval`` :ref:`does NOT start a timer `. +.. _sender_protocol_version: + +Protocol Version +================ + +Specifies the version of InfluxDB Line Protocol to use for sender. + +Valid options are: + +* ``1`` - Text-based format compatible with InfluxDB database when used over HTTP. +* ``2`` - Array support and binary format serialization for 64-bit floats (version specific to QuestDB). +* ``auto`` (default) - Automatic version selection based on connection type. 
+ +Behavior details: + +^^^^^^^^^^^^^^^^^ + ++----------------+--------------------------------------------------------------+ +| Value | Behavior | ++================+==============================================================+ +| | - Plain text serialization | +| ``1`` | - Compatible with InfluxDB servers | +| | - No array type support | ++----------------+--------------------------------------------------------------+ +| ``2`` | - Binary encoding for f64 | +| | - Full support for array | ++----------------+--------------------------------------------------------------+ +| | - **HTTP/HTTPS**: Auto-detects server capability during | +| ``auto`` | handshake (supports version negotiation) | +| | - **TCP/TCPS**: Defaults to version 1 for compatibility | ++----------------+--------------------------------------------------------------+ + +Here is a configuration string with ``protocol_version=2`` for ``TCP``: + +``tcp::addr=localhost:9000;protocol_version=2;`` + +See the :ref:`sender_conf_protocol_version` section for more details. + Error Reporting =============== diff --git a/proj.py b/proj.py index c0898b77..becb26a6 100755 --- a/proj.py +++ b/proj.py @@ -14,6 +14,19 @@ PROJ_ROOT = pathlib.Path(__file__).parent +def patch_mac_archflags_env(): + system = platform.system() + machine = platform.machine() + + if system == "Darwin": + if machine == "arm64": + os.environ["ARCHFLAGS"] = "-arch arm64" + elif machine == "x86_64": + os.environ["ARCHFLAGS"] = "-arch x86_64" + else: + raise RuntimeError(f"Unknown macOS architecture: {machine}") + + def _run(*args, env=None, cwd=None): """ Log and run a command within the build dir. 
@@ -173,6 +186,7 @@ def open_browser(port): def doc(http_serve=False, port=None): _run('python3', '-m', 'sphinx.cmd.build', '-b', 'html', 'docs', 'build/docs', + '-nW', '--keep-going', '-v', env={'PYTHONPATH': str(PROJ_ROOT / 'src')}) if _arg2bool(http_serve): serve(port) @@ -194,14 +208,14 @@ def cibuildwheel(*args): 'darwin': 'macos', 'linux': 'linux'}[sys.platform] python = 'python3' - if sys.platform == 'darwin': - # Launching with version other than 3.8 will - # fail saying the 3.8 wheel is unsupported. - # This is because the 3.8 wheel ends up getting loaded with another - # Python version. - # - # NB: Make sure to update `cibuildwheel` on py3.8 too before running! - python = '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3' + # if sys.platform == 'darwin': + # # Launching with version other than 3.8 will + # # fail saying the 3.8 wheel is unsupported. + # # This is because the 3.8 wheel ends up getting loaded with another + # # Python version. + # # + # # NB: Make sure to update `cibuildwheel` on py3.8 too before running! 
+ # python = '/Library/Frameworks/Python.framework/Versions/3.8/bin/python3' _run(python, '-m', 'cibuildwheel', '--platform', plat, @@ -269,6 +283,7 @@ def main(): sys.stderr.write(f' {command}\n') sys.stderr.write('\n') sys.exit(0) + patch_mac_archflags_env() fn = sys.argv[1] args = list(sys.argv)[2:] globals()[fn](*args) diff --git a/pyproject.toml b/pyproject.toml index 56cf6f86..b7b94fe3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,8 @@ [project] # See: https://packaging.python.org/en/latest/specifications/declaring-project-metadata/ name = "questdb" -requires-python = ">=3.8" -version = "2.0.4" +requires-python = ">=3.9" +version = "3.0.0rc1" description = "QuestDB client library for Python" readme = "README.rst" classifiers = [ @@ -17,9 +17,9 @@ classifiers = [ "Topic :: Software Development :: Libraries", "Topic :: System :: Networking", "Topic :: Database :: Front-Ends", - "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering" ] - +dependencies = ["numpy>=1.26.0"] # Keep in sync with build-system.requires [project.license] text = "Apache License 2.0" @@ -40,13 +40,13 @@ Source = "https://github.com/questdb/py-questdb-client/" Tracker = "https://github.com/questdb/py-questdb-client/issues" Community = "http://community.questdb.io" - [build-system] requires = [ # Setuptools 18.0 and above properly handles Cython extensions. "setuptools>=45.2.0", "wheel>=0.34.2", "cython>=0.29.24", + "numpy>=1.26.0", # keep in sync with project.dependencies ] @@ -57,9 +57,9 @@ before-build = "python {project}/install_rust.py" before-test = "python {project}/ci/pip_install_deps.py" test-command = "python {project}/test/test.py -v" skip = [ - # No 32-bit musl C native tool chain for Rust. - # There's a tier 2 target for it, but it would need cross-compiling. - "*-musllinux_i686", + # Skip all 32-bit builds, except for Windows. + # Those builds are named `*win32*` in cibuildwheel. 
+ "*i686*", ] # [tool.cibuildwheel.windows] diff --git a/setup.py b/setup.py index 2d0ec2b8..0f1e6efe 100755 --- a/setup.py +++ b/setup.py @@ -5,6 +5,7 @@ import os import shutil import platform +import numpy as np from setuptools import setup, find_packages from setuptools.extension import Extension @@ -83,12 +84,17 @@ def ingress_extension(): ["src/questdb/ingress.pyx"], include_dirs=[ "c-questdb-client/include", - "pystr-to-utf8/include"], + "pystr-to-utf8/include", + np.get_include()], library_dirs=lib_paths, libraries=libraries, extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, - extra_objects=extra_objects) + extra_objects=extra_objects, + define_macros = [ + ('NPY_NO_DEPRECATED_API', 'NPY_1_7_API_VERSION') + ] + ) def cargo_build(): @@ -165,7 +171,7 @@ def readme(): setup( name='questdb', - version='2.0.4', + version='3.0.0rc1', platforms=['any'], python_requires='>=3.8', install_requires=[], diff --git a/src/questdb/__init__.py b/src/questdb/__init__.py index f593cd5b..3885d125 100644 --- a/src/questdb/__init__.py +++ b/src/questdb/__init__.py @@ -1 +1 @@ -__version__ = '2.0.4' +__version__ = '3.0.0rc1' diff --git a/src/questdb/dataframe.pxi b/src/questdb/dataframe.pxi index 7601587b..2693250d 100644 --- a/src/questdb/dataframe.pxi +++ b/src/questdb/dataframe.pxi @@ -27,7 +27,6 @@ cdef auto_flush_t auto_flush_blank() noexcept nogil: af.last_flush_ms = NULL return af - cdef bint should_auto_flush( const auto_flush_mode_t* af_mode, line_sender_buffer* ls_buf, @@ -73,7 +72,8 @@ cdef enum col_target_t: col_target_column_f64 = 5 col_target_column_str = 6 col_target_column_ts = 7 - col_target_at = 8 + col_target_column_arr_f64 = 8 + col_target_at = 9 cdef dict _TARGET_NAMES = { @@ -85,6 +85,7 @@ cdef dict _TARGET_NAMES = { col_target_t.col_target_column_f64: "float", col_target_t.col_target_column_str: "string", col_target_t.col_target_column_ts: "timestamp", + col_target_t.col_target_column_arr_f64: "array", col_target_t.col_target_at: 
"designated timestamp", } @@ -125,6 +126,7 @@ cdef enum col_source_t: col_source_str_lrg_utf8_arrow = 406000 col_source_dt64ns_numpy = 501000 col_source_dt64ns_tz_arrow = 502000 + col_source_arr_f64_numpyobj = 601100 cdef bint col_source_needs_gil(col_source_t source) noexcept nogil: @@ -213,6 +215,9 @@ cdef dict _TARGET_TO_SOURCES = { col_source_t.col_source_dt64ns_numpy, col_source_t.col_source_dt64ns_tz_arrow, }, + col_target_t.col_target_column_arr_f64: { + col_source_t.col_source_arr_f64_numpyobj, + }, col_target_t.col_target_at: { col_source_t.col_source_dt64ns_numpy, col_source_t.col_source_dt64ns_tz_arrow, @@ -227,7 +232,8 @@ cdef tuple _FIELD_TARGETS = ( col_target_t.col_target_column_i64, col_target_t.col_target_column_f64, col_target_t.col_target_column_str, - col_target_t.col_target_column_ts) + col_target_t.col_target_column_ts, + col_target_t.col_target_column_arr_f64) # Targets that map directly from a meta target. @@ -349,6 +355,9 @@ cdef enum col_dispatch_code_t: col_dispatch_code_at__dt64ns_tz_arrow = \ col_target_t.col_target_at + col_source_t.col_source_dt64ns_tz_arrow + col_dispatch_code_column_arr_f64__arr_f64_numpyobj = \ + col_target_t.col_target_column_arr_f64 + col_source_t.col_source_arr_f64_numpyobj + # Int values in order for sorting (as needed for API's sequential coupling). cdef enum meta_target_t: @@ -915,10 +924,18 @@ cdef void_int _dataframe_series_sniff_pyobj( Object columns can contain pretty much anything, but they usually don't. We make an educated guess by finding the first non-null value in the column. """ + # To access elements. cdef size_t el_index cdef size_t n_elements = len(pandas_col.series) cdef PyObject** obj_arr cdef PyObject* obj + + # To access elements which are themselves arrays. 
+ cdef PyArrayObject* arr + cdef npy_int arr_type + cdef cnp.dtype arr_descr # A cython defn for `PyArray_Descr*` + cdef str arr_type_name + _dataframe_series_as_pybuf(pandas_col, col) obj_arr = (col.setup.pybuf.buf) for el_index in range(n_elements): @@ -932,6 +949,21 @@ cdef void_int _dataframe_series_sniff_pyobj( col.setup.source = col_source_t.col_source_float_pyobj elif PyUnicode_CheckExact(obj): col.setup.source = col_source_t.col_source_str_pyobj + elif PyArray_CheckExact(obj): + arr = obj + arr_type = PyArray_TYPE(arr) + if arr_type == NPY_DOUBLE: + col.setup.source = col_source_t.col_source_arr_f64_numpyobj + else: + arr_type_name = '??unknown??' + arr_descr = cnp.PyArray_DescrFromType(arr_type) + if arr_descr is not None: + arr_type_name = arr_descr.name.decode('ascii') + raise IngressError( + IngressErrorCode.BadDataFrame, + f'Bad column {pandas_col.name!r}: ' + + 'Unsupported object column containing a numpy array ' + + f'of an unsupported element type {arr_type_name}.') elif PyBytes_CheckExact(obj): raise IngressError( IngressErrorCode.BadDataFrame, @@ -2016,6 +2048,34 @@ cdef void_int _dataframe_serialize_cell_column_ts__dt64ns_numpy( _ensure_has_gil(gs) raise c_err_to_py(err) +cdef void_int _dataframe_serialize_cell_column_arr_f64__arr_f64_numpyobj( + line_sender_buffer* ls_buf, + qdb_pystr_buf* b, + col_t* col) except -1: + cdef PyObject** access = col.cursor.chunk.buffers[1] + cdef PyObject* cell = access[col.cursor.offset] + cdef PyArrayObject* arr = cell + cdef npy_int arr_type = PyArray_TYPE(arr) + cdef cnp.dtype arr_descr + if arr_type != NPY_DOUBLE: + arr_descr = cnp.PyArray_DescrFromType(arr_type) + raise IngressError( + IngressErrorCode.ArrayWriteToBufferError, + f'Only float64 numpy arrays are supported, got dtype: {arr_descr}') + cdef: + size_t rank = PyArray_NDIM(arr) + const uint8_t* data_ptr = PyArray_DATA(arr) + line_sender_error * err = NULL + if not line_sender_buffer_column_f64_arr_byte_strides( + ls_buf, + col.name, + rank, + 
PyArray_DIMS(arr), + PyArray_STRIDES(arr), # N.B.: Strides expressed as byte jumps + data_ptr, + PyArray_NBYTES(arr), + &err): + raise c_err_to_py(err) cdef void_int _dataframe_serialize_cell_column_ts__dt64ns_tz_arrow( line_sender_buffer* ls_buf, @@ -2173,6 +2233,8 @@ cdef void_int _dataframe_serialize_cell( _dataframe_serialize_cell_column_str__str_i32_cat(ls_buf, b, col, gs) elif dc == col_dispatch_code_t.col_dispatch_code_column_ts__dt64ns_numpy: _dataframe_serialize_cell_column_ts__dt64ns_numpy(ls_buf, b, col, gs) + elif dc == col_dispatch_code_t.col_dispatch_code_column_arr_f64__arr_f64_numpyobj: + _dataframe_serialize_cell_column_arr_f64__arr_f64_numpyobj(ls_buf, b, col) elif dc == col_dispatch_code_t.col_dispatch_code_column_ts__dt64ns_tz_arrow: _dataframe_serialize_cell_column_ts__dt64ns_tz_arrow(ls_buf, b, col, gs) elif dc == col_dispatch_code_t.col_dispatch_code_at__dt64ns_numpy: @@ -2298,7 +2360,7 @@ cdef void_int _dataframe( table_name, table_name_col, symbols, - at if not isinstance(at, _ServerTimestamp) else None, + at if not isinstance(at, ServerTimestampType) else None, b, col_count, &c_table_name, diff --git a/src/questdb/extra_numpy.pxd b/src/questdb/extra_numpy.pxd new file mode 100644 index 00000000..3aa5f71f --- /dev/null +++ b/src/questdb/extra_numpy.pxd @@ -0,0 +1,33 @@ +# We'd _love_ to use the defns from +# https://github.com/numpy/numpy/blob/main/numpy/__init__.pxd +# unfortunately these usually take `object` instead of `PyObject*`. +# Annoyingly, this means that they can incur extra incref/decref +# operations that we most certainly want to avoid for perf reasons. 
+ +from cpython.object cimport PyObject +from numpy cimport ( + # Constants + NPY_DOUBLE, # N.B.: From `#include `: `#define NPY_FLOAT64 NPY_DOUBLE` + + # Types + PyArrayObject, + PyArray_Descr, + npy_intp, + npy_int +) + +cdef extern from "numpy/arrayobject.h": + bint PyArray_CheckExact(PyObject * o) + + # PyArrayObject + npy_intp PyArray_NBYTES(PyArrayObject*) nogil + npy_intp* PyArray_STRIDES(PyArrayObject*) nogil + npy_intp* PyArray_DIMS(PyArrayObject*) nogil + npy_int PyArray_TYPE(PyArrayObject* arr) nogil + void* PyArray_DATA(PyArrayObject*) nogil + char* PyArray_BYTES(PyArrayObject*) nogil + npy_intp* PyArray_DIMS(PyArrayObject*) nogil + npy_intp* PyArray_STRIDES(PyArrayObject*) nogil + npy_intp PyArray_DIM(PyArrayObject*, size_t) nogil + npy_intp PyArray_STRIDE(PyArrayObject*, size_t) nogil + int PyArray_NDIM(PyArrayObject*) nogil diff --git a/src/questdb/ingress.pyi b/src/questdb/ingress.pyi index cc642c06..cd6f2085 100644 --- a/src/questdb/ingress.pyi +++ b/src/questdb/ingress.pyi @@ -28,7 +28,7 @@ __all__ = [ "IngressErrorCode", "Protocol", "Sender", - "ServerTimestamp", + "ServerTimestampType", "TimestampMicros", "TimestampNanos", "TlsCa", @@ -38,6 +38,7 @@ from datetime import datetime, timedelta from enum import Enum from typing import Any, Dict, List, Optional, Union +import numpy as np import pandas as pd class IngressErrorCode(Enum): @@ -54,8 +55,13 @@ class IngressErrorCode(Enum): HttpNotSupported = ... ServerFlushError = ... ConfigError = ... + ArrayLargeDimError = ... + ArrayInternalError = ... + ArrayWriteToBufferError = ... + ProtocolVersionError = ... BadDataFrame = ... + class IngressError(Exception): """An error whilst using the ``Sender`` or constructing its ``Buffer``.""" @@ -63,11 +69,13 @@ class IngressError(Exception): def code(self) -> IngressErrorCode: """Return the error code.""" -class ServerTimestamp: + +class ServerTimestampType: """ A placeholder value to indicate using a server-generated-timestamp. 
""" + class TimestampMicros: """ A timestamp in microseconds since the UNIX epoch (UTC). @@ -119,6 +127,7 @@ class TimestampMicros: def value(self) -> int: """Number of microseconds (Unix epoch timestamp, UTC).""" + class TimestampNanos: """ A timestamp in nanoseconds since the UNIX epoch (UTC). @@ -169,6 +178,7 @@ class TimestampNanos: def value(self) -> int: """Number of nanoseconds (Unix epoch timestamp, UTC).""" + class SenderTransaction: """ A transaction for a specific table. @@ -194,9 +204,9 @@ class SenderTransaction: *, symbols: Optional[Dict[str, Optional[str]]] = None, columns: Optional[ - Dict[str, Union[None, bool, int, float, str, TimestampMicros, datetime]] + Dict[str, Union[None, bool, int, float, str, TimestampMicros, datetime, np.ndarray]] ] = None, - at: Union[ServerTimestamp, TimestampNanos, datetime], + at: Union[ServerTimestampType, TimestampNanos, datetime], ) -> SenderTransaction: """ Write a row for the table in the transaction. @@ -209,7 +219,7 @@ class SenderTransaction: df: pd.DataFrame, *, symbols: Union[str, bool, List[int], List[str]] = "auto", - at: Union[ServerTimestamp, int, str, TimestampNanos, datetime], + at: Union[ServerTimestampType, int, str, TimestampNanos, datetime], ) -> SenderTransaction: """ Write a dataframe for the table in the transaction. @@ -235,9 +245,10 @@ class SenderTransaction: This will clear the buffer. """ + class Buffer: """ - Construct QuestDB-flavored InfluxDB Line Protocol (ILP) messages. + Construct QuestDB InfluxDB Line Protocol (ILP) messages. The :func:`Buffer.row` method is used to add a row to the buffer. @@ -247,7 +258,7 @@ class Buffer: from questdb.ingress import Buffer - buf = Buffer() + buf = Buffer(protocol_version=2) # or better yet, `sender.new_buffer()` buf.row( 'table_name1', symbols={'s1', 'v1', 's2', 'v2'}, @@ -270,8 +281,13 @@ class Buffer: # etc. 
+ In general, it's best to create a new buffer from a sender instance, + via the :func:`Sender.new_buffer` method, as this will ensure the buffer + is configured with the same protocol version and maximum name length + as the sender. Buffer Constructor Arguments: + * protocol_version (``int``): The protocol version to use. * ``init_buf_size`` (``int``): Initial capacity of the buffer in bytes. Defaults to ``65536`` (64KiB). * ``max_name_len`` (``int``): Maximum length of a column name. @@ -282,8 +298,8 @@ class Buffer: .. code-block:: python # These two buffer constructions are equivalent. - buf1 = Buffer() - buf2 = Buffer(init_buf_size=65536, max_name_len=127) + buf1 = Buffer(protocol_version=2) + buf2 = Buffer(protocol_version=2, init_buf_size=65536, max_name_len=127) To avoid having to manually set these arguments every time, you can call the sender's ``new_buffer()`` method instead. @@ -293,16 +309,22 @@ class Buffer: from questdb.ingress import Sender, Buffer sender = Sender('http', 'localhost', 9009, - init_buf_size=16384, max_name_len=64) + init_buf_size=16384) buf = sender.new_buffer() assert buf.init_buf_size == 16384 - assert buf.max_name_len == 64 + assert buf.max_name_len == 127 """ - def __init__(self, init_buf_size: int = 65536, max_name_len: int = 127): + def __init__( + self, + *, + protocol_version: int, + init_buf_size: int = 65536, + max_name_len: int = 127): """ Create a new buffer with the an initial capacity and max name length. + :param int protocol_version: The protocol version to use. :param int init_buf_size: Initial capacity of the buffer in bytes. :param int max_name_len: Maximum length of a table or column name. """ @@ -345,11 +367,11 @@ class Buffer: """ The current number of bytes currently in the buffer. - Equivalent (but cheaper) to ``len(str(sender))``. + Equivalent (but cheaper) to ``len(bytes(buffer))``. """ - def __str__(self) -> str: - """Return the constructed buffer as a string. 
Use for debugging.""" + def __bytes__(self) -> bytes: + """Return the constructed buffer as bytes. Use for debugging.""" def row( self, @@ -357,9 +379,9 @@ class Buffer: *, symbols: Optional[Dict[str, Optional[str]]] = None, columns: Optional[ - Dict[str, Union[None, bool, int, float, str, TimestampMicros, datetime]] + Dict[str, Union[None, bool, int, float, str, TimestampMicros, datetime, np.ndarray]] ] = None, - at: Union[ServerTimestamp, TimestampNanos, datetime], + at: Union[ServerTimestampType, TimestampNanos, datetime], ) -> Buffer: """ Add a single row (line) to the buffer. @@ -377,7 +399,8 @@ class Buffer: 'col4': 'xyz', 'col5': TimestampMicros(123456789), 'col6': datetime(2019, 1, 1, 12, 0, 0), - 'col7': None}, + 'col7': np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), + 'col8': None}, at=TimestampNanos(123456789)) # Only symbols specified. Designated timestamp assigned by the db. @@ -420,6 +443,8 @@ class Buffer: - `FLOAT `_ * - ``str`` - `STRING `_ + * - ``np.ndarray`` + - `ARRAY `_ * - ``datetime.datetime`` and ``TimestampMicros`` - `TIMESTAMP `_ * - ``None`` @@ -457,7 +482,7 @@ class Buffer: table_name: Optional[str] = None, table_name_col: Union[None, int, str] = None, symbols: Union[str, bool, List[int], List[str]] = "auto", - at: Union[ServerTimestamp, int, str, TimestampNanos, datetime], + at: Union[ServerTimestampType, int, str, TimestampNanos, datetime], ) -> Buffer: """ Add a pandas DataFrame to the buffer. @@ -558,7 +583,7 @@ class Buffer: import pandas as pd import questdb.ingress as qi - buf = qi.Buffer() + buf = qi.Buffer(protocol_version=2) # ... df = pd.DataFrame({ @@ -663,6 +688,9 @@ class Buffer: * - ``'object'`` (``str`` objects) - Y - ``STRING`` (default), ``SYMBOL`` via ``symbols`` arg. **δ** + * - ``'object' (``numpy.ndarray[numpy.float64]``)`` + - Y + - ``ARRAY[DOUBLE]`` * - ``'datetime64[ns]'`` - Y - ``TIMESTAMP`` **ζ** @@ -925,6 +953,19 @@ class Sender: Time interval threshold for the auto-flush logic, or None if disabled. 
""" + @property + def protocol_version(self) -> int: + """ + Returns the QuestDB server's recommended default line protocol version. + """ + + @property + def max_name_len(self): + """ + Returns the sender's maximum-configured maximum name length for table + names and column names. + """ + def establish(self): """ Prepare the sender for use. @@ -941,20 +982,20 @@ class Sender: def __enter__(self) -> Sender: """Call :func:`Sender.establish` at the start of a ``with`` block.""" - def __str__(self) -> str: + def __len__(self) -> int: """ - Inspect the contents of the internal buffer. - - The ``str`` value returned represents the unsent data. + Number of bytes of unsent data in the internal buffer. - Also see :func:`Sender.__len__`. + Equivalent (but cheaper) to ``len(bytes(sender))``. """ - def __len__(self) -> int: + def __bytes__(self) -> bytes: """ - Number of bytes of unsent data in the internal buffer. + Inspect the contents of the internal buffer. + + The ``bytes`` value returned represents the unsent data. - Equivalent (but cheaper) to ``len(str(sender))``. + Also see :func:`Sender.__len__`. """ def transaction(self, table_name: str) -> SenderTransaction: @@ -968,9 +1009,9 @@ class Sender: *, symbols: Optional[Dict[str, str]] = None, columns: Optional[ - Dict[str, Union[bool, int, float, str, TimestampMicros, datetime]] + Dict[str, Union[bool, int, float, str, TimestampMicros, datetime, np.ndarray]] ] = None, - at: Union[TimestampNanos, datetime, ServerTimestamp], + at: Union[TimestampNanos, datetime, ServerTimestampType], ) -> Sender: """ Write a row to the internal buffer. @@ -988,7 +1029,7 @@ class Sender: table_name: Optional[str] = None, table_name_col: Union[None, int, str] = None, symbols: Union[str, bool, List[int], List[str]] = "auto", - at: Union[ServerTimestamp, int, str, TimestampNanos, datetime], + at: Union[ServerTimestampType, int, str, TimestampNanos, datetime], ) -> Sender: """ Write a Pandas DataFrame to the internal buffer. 
diff --git a/src/questdb/ingress.pyx b/src/questdb/ingress.pyx index 9af2238c..524ae2e3 100644 --- a/src/questdb/ingress.pyx +++ b/src/questdb/ingress.pyx @@ -37,6 +37,7 @@ __all__ = [ 'Protocol', 'Sender', 'ServerTimestamp', + 'ServerTimestampType', 'TimestampMicros', 'TimestampNanos', 'TlsCa', @@ -78,15 +79,25 @@ from enum import Enum from typing import List, Tuple, Dict, Union, Any, Optional, Callable, \ Iterable import pathlib +from cpython.bytes cimport PyBytes_FromStringAndSize import sys import os +import numpy as np +cimport numpy as cnp +from numpy cimport NPY_DOUBLE, PyArrayObject + +# Functions we need to import as `PyObject` to avoid Cython's `object` type +from .extra_numpy cimport * + +cnp.import_array() + # This value is automatically updated by the `bump2version` tool. # If you need to update it, also update the search definition in # .bumpversion.cfg. -VERSION = '2.0.4' +VERSION = '3.0.0rc1' cdef bint _has_gil(PyThreadState** gs): @@ -120,7 +131,11 @@ class IngressErrorCode(Enum): HttpNotSupported = line_sender_error_http_not_supported ServerFlushError = line_sender_error_server_flush_error ConfigError = line_sender_error_config_error - BadDataFrame = line_sender_error_server_flush_error + 1 + ArrayLargeDimError = line_sender_error_array_large_dim + ArrayInternalError = line_sender_error_array_view_internal_error + ArrayWriteToBufferError = line_sender_error_array_view_write_to_buffer_error + ProtocolVersionError = line_sender_error_protocol_version_error + BadDataFrame = line_sender_error_protocol_version_error + 1 def __str__(self) -> str: """Return the name of the enum.""" @@ -162,6 +177,14 @@ cdef inline object c_err_code_to_py(line_sender_error_code code): return IngressErrorCode.ServerFlushError elif code == line_sender_error_config_error: return IngressErrorCode.ConfigError + elif code == line_sender_error_array_large_dim: + return IngressErrorCode.ArrayLargeDimError + elif code == line_sender_error_array_view_internal_error: + return 
IngressErrorCode.ArrayInternalError + elif code == line_sender_error_array_view_write_to_buffer_error: + return IngressErrorCode.ArrayWriteToBufferError + elif code == line_sender_error_protocol_version_error: + return IngressErrorCode.ProtocolVersionError else: raise ValueError('Internal error converting error code.') @@ -365,7 +388,7 @@ cdef void_int str_to_column_name_copy( cdef int64_t datetime_to_micros(datetime dt): """ - Convert a `datetime.datetime` to microseconds since the epoch. + Convert a :class:`datetime.datetime` to microseconds since the epoch. """ return ( (dt.timestamp()) * @@ -382,13 +405,28 @@ cdef int64_t datetime_to_nanos(datetime dt): (1000000000) + (dt.microsecond * 1000)) -cdef class _ServerTimestamp: + +class ServerTimestampType: """ - A placeholder value to indicate using a server-generated-timestamp. + A placeholder value to indicate that the data should be inserted + using a server-generated-timestamp. + + Don't instantiate this class directly, use the singleton + :data:`ServerTimestamp` instead. + + This feature is mostly provided for legacy compatibility. + We recommend always specifying an explicit timestamp. + + Using ``ServerTimestamp`` will prevent QuestDB's deduplication + feature from working as it would generate unique rows on resubmission. """ pass -ServerTimestamp = _ServerTimestamp() + +#: Singleton instance used to request server-side timestamping. +#: See :class:`ServerTimestampType` for more details. +ServerTimestamp = ServerTimestampType() + cdef class TimestampMicros: """ @@ -433,7 +471,7 @@ cdef class TimestampMicros: @classmethod def from_datetime(cls, dt: datetime): """ - Construct a ``TimestampMicros`` from a ``datetime.datetime`` object. + Construct a ``TimestampMicros`` from a :class:`datetime.datetime` object. 
""" if not isinstance(dt, datetime): raise TypeError('dt must be a datetime object.') @@ -580,7 +618,7 @@ cdef class SenderTransaction: raise IngressError( IngressErrorCode.InvalidApiCall, 'Already inside a transaction, can\'t start another.') - if len(self._sender._buffer): + if self._sender._buffer is not None and len(self._sender._buffer): if self._sender._auto_flush_mode.enabled: self._sender.flush() else: @@ -607,9 +645,9 @@ cdef class SenderTransaction: symbols: Optional[Dict[str, Optional[str]]]=None, columns: Optional[Dict[ str, - Union[None, bool, int, float, str, TimestampMicros, datetime]] + Union[None, bool, int, float, str, TimestampMicros, datetime, np.ndarray]] ]=None, - at: Union[ServerTimestamp, TimestampNanos, datetime]): + at: Union[ServerTimestampType, TimestampNanos, datetime]): """ Write a row for the table in the transaction. @@ -620,6 +658,13 @@ cdef class SenderTransaction: IngressErrorCode.InvalidTimestamp, "`at` must be of type TimestampNanos, datetime, or ServerTimestamp" ) + + if self._sender._buffer is None: + raise IngressError( + IngressErrorCode.InvalidApiCall, + "row() can\'t be called: Sender is closed." + ) + self._sender._buffer._row( False, # allow_auto_flush self._table_name, @@ -633,7 +678,7 @@ cdef class SenderTransaction: df, # : pd.DataFrame *, symbols: Union[str, bool, List[int], List[str]] = 'auto', - at: Union[ServerTimestamp, int, str, TimestampNanos, datetime]): + at: Union[ServerTimestampType, int, str, TimestampNanos, datetime]): """ Write a dataframe for the table in the transaction. @@ -644,6 +689,11 @@ cdef class SenderTransaction: IngressErrorCode.InvalidTimestamp, "`at` must be of type TimestampNanos, datetime, or ServerTimestamp" ) + if self._sender._buffer is None: + raise IngressError( + IngressErrorCode.InvalidApiCall, + "dataframe() can\'t be called: Sender is closed." 
+ ) _dataframe( auto_flush_blank(), self._sender._buffer._impl, @@ -684,14 +734,15 @@ cdef class SenderTransaction: raise IngressError( IngressErrorCode.InvalidApiCall, 'Transaction already completed, can\'t rollback.') - self._sender._buffer.clear() + if self._sender._buffer is not None: + self._sender._buffer.clear() self._sender._in_txn = False self._complete = True - cdef class Buffer: """ - Construct QuestDB-flavored InfluxDB Line Protocol (ILP) messages. + Construct QuestDB InfluxDB Line Protocol (ILP) messages. + Version 1 is compatible with the InfluxDB Line Protocol. The :func:`Buffer.row` method is used to add a row to the buffer. @@ -759,22 +810,27 @@ cdef class Buffer: cdef size_t _max_name_len cdef object _row_complete_sender - def __cinit__(self, init_buf_size: int=65536, max_name_len: int=127): + def __cinit__(self, protocol_version: int, init_buf_size: int=65536, max_name_len: int=127): """ Create a new buffer with the an initial capacity and max name length. :param int init_buf_size: Initial capacity of the buffer in bytes. :param int max_name_len: Maximum length of a table or column name. """ - self._cinit_impl(init_buf_size, max_name_len) + if protocol_version not in (1, 2): + raise IngressError( + IngressErrorCode.ProtocolVersionError, + 'Invalid protocol version. 
Supported versions are 1 and 2.') + self._cinit_impl(protocol_version, init_buf_size, max_name_len) - cdef inline _cinit_impl(self, size_t init_buf_size, size_t max_name_len): - self._impl = line_sender_buffer_with_max_name_len(max_name_len) + cdef inline _cinit_impl(self, line_sender_protocol_version version, size_t init_buf_size, size_t max_name_len): + self._impl = line_sender_buffer_with_max_name_len(version, max_name_len) self._b = qdb_pystr_buf_new() line_sender_buffer_reserve(self._impl, init_buf_size) self._init_buf_size = init_buf_size self._max_name_len = max_name_len self._row_complete_sender = None + def __dealloc__(self): self._row_complete_sender = None qdb_pystr_buf_free(self._b) @@ -825,18 +881,17 @@ cdef class Buffer: """ The current number of bytes currently in the buffer. - Equivalent (but cheaper) to ``len(str(sender))``. + Equivalent (but cheaper) to ``len(bytes(buffer))``. """ return line_sender_buffer_size(self._impl) - def __str__(self) -> str: - """Return the constructed buffer as a string. Use for debugging.""" - return self._to_str() + def __bytes__(self) -> bytes: + """Return the constructed buffer as bytes. 
Use for debugging.""" + return self._to_bytes() - cdef inline object _to_str(self): - cdef size_t size = 0 - cdef const char* utf8 = line_sender_buffer_peek(self._impl, &size) - return PyUnicode_FromStringAndSize(utf8, size) + cdef inline object _to_bytes(self): + cdef line_sender_buffer_view view = line_sender_buffer_peek(self._impl) + return PyBytes_FromStringAndSize( view.buf, view.len) cdef inline void_int _set_marker(self) except -1: cdef line_sender_error* err = NULL @@ -905,6 +960,28 @@ cdef class Buffer: if not line_sender_buffer_column_ts_micros(self._impl, c_name, ts._value, &err): raise c_err_to_py(err) + cdef inline void_int _column_numpy( + self, line_sender_column_name c_name, cnp.ndarray arr) except -1: + if cnp.PyArray_TYPE(arr) != cnp.NPY_FLOAT64: + raise IngressError( + IngressErrorCode.ArrayWriteToBufferError, + f'Only float64 numpy arrays are supported, got dtype: {arr.dtype}') + cdef: + size_t rank = cnp.PyArray_NDIM(arr) + const uint8_t * data_ptr = cnp.PyArray_DATA(arr) + line_sender_error * err = NULL + + if not line_sender_buffer_column_f64_arr_byte_strides( + self._impl, + c_name, + rank, + cnp.PyArray_DIMS(arr), + cnp.PyArray_STRIDES(arr), # N.B.: Strides expressed as byte jumps + data_ptr, + cnp.PyArray_NBYTES(arr), + &err): + raise c_err_to_py(err) + cdef inline void_int _column_dt( self, line_sender_column_name c_name, datetime dt) except -1: cdef line_sender_error* err = NULL @@ -925,6 +1002,8 @@ cdef class Buffer: self._column_str(c_name, value) elif isinstance(value, TimestampMicros): self._column_ts(c_name, value) + elif PyArray_CheckExact( value): + self._column_numpy(c_name, value) elif isinstance(value, datetime): self._column_dt(c_name, value) else: @@ -934,7 +1013,8 @@ 'float', 'str', 'TimestampMicros', - 'datetime.datetime')) + 'datetime.datetime', + 'np.ndarray')) raise TypeError( f'Unsupported type: {_fqn(type(value))}. 
Must be one of: {valid}') @@ -999,7 +1079,7 @@ cdef class Buffer: self._column(name, value) wrote_fields = True if wrote_fields: - self._at(at if not isinstance(at, _ServerTimestamp) else None) + self._at(at if not isinstance(at, ServerTimestampType) else None) self._clear_marker() else: self._rewind_to_marker() @@ -1016,9 +1096,9 @@ cdef class Buffer: symbols: Optional[Dict[str, Optional[str]]]=None, columns: Optional[Dict[ str, - Union[None, bool, int, float, str, TimestampMicros, datetime]] + Union[None, bool, int, float, str, TimestampMicros, datetime, np.ndarray]] ]=None, - at: Union[ServerTimestamp, TimestampNanos, datetime]): + at: Union[ServerTimestampType, TimestampNanos, datetime]): """ Add a single row (line) to the buffer. @@ -1035,7 +1115,8 @@ cdef class Buffer: 'col4': 'xyz', 'col5': TimestampMicros(123456789), 'col6': datetime(2019, 1, 1, 12, 0, 0), - 'col7': None}, + 'col7': np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), + 'col8': None}, at=TimestampNanos(123456789)) # Only symbols specified. Designated timestamp assigned by the db. @@ -1078,6 +1159,8 @@ cdef class Buffer: - `FLOAT `_ * - ``str`` - `STRING `_ + * - ``np.ndarray`` + - `ARRAY `_ * - ``datetime.datetime`` and ``TimestampMicros`` - `TIMESTAMP `_ * - ``None`` @@ -1127,7 +1210,7 @@ cdef class Buffer: table_name: Optional[str] = None, table_name_col: Union[None, int, str] = None, symbols: Union[str, bool, List[int], List[str]] = 'auto', - at: Union[ServerTimestamp, int, str, TimestampNanos, datetime]): + at: Union[ServerTimestampType, int, str, TimestampNanos, datetime]): """ Add a pandas DataFrame to the buffer. @@ -1227,7 +1310,7 @@ cdef class Buffer: import pandas as pd import questdb.ingress as qi - buf = qi.Buffer() + buf = qi.Buffer(protocol_version=2) # ... 
df = pd.DataFrame({ @@ -1734,7 +1817,6 @@ cdef class Sender: cdef auto_flush_mode_t _auto_flush_mode cdef int64_t* _last_flush_ms cdef size_t _init_buf_size - cdef size_t _max_name_len cdef bint _in_txn cdef void_int _set_sender_fields( @@ -1759,6 +1841,7 @@ cdef class Sender: object auto_flush_rows, object auto_flush_bytes, object auto_flush_interval, + object protocol_version, object init_buf_size, object max_name_len) except -1: """ @@ -1791,7 +1874,8 @@ cdef class Sender: if bind_interface is not None: str_to_utf8(b, bind_interface, &c_bind_interface) - if not line_sender_opts_bind_interface(self._opts, c_bind_interface, &err): + if not line_sender_opts_bind_interface( + self._opts, c_bind_interface, &err): raise c_err_to_py(err) if username is not None: @@ -1819,6 +1903,23 @@ cdef class Sender: if not line_sender_opts_token_y(self._opts, c_token_y, &err): raise c_err_to_py(err) + if protocol_version is not None: + if protocol_version == 'auto': + pass + elif (protocol_version == 1) or (protocol_version == '1'): + if not line_sender_opts_protocol_version( + self._opts, line_sender_protocol_version_1, &err): + raise c_err_to_py(err) + elif (protocol_version == 2) or (protocol_version == '2'): + if not line_sender_opts_protocol_version( + self._opts, line_sender_protocol_version_2, &err): + raise c_err_to_py(err) + else: + raise IngressError( + IngressErrorCode.ConfigError, + '"protocol_version" must be None, "auto", 1 or 2' + + f' not {protocol_version!r}') + if auth_timeout is not None: if isinstance(auth_timeout, int): c_auth_timeout = auth_timeout @@ -1884,6 +1985,11 @@ cdef class Sender: if not line_sender_opts_request_min_throughput(self._opts, c_request_min_throughput, &err): raise c_err_to_py(err) + if max_name_len is not None: + c_max_name_len = max_name_len + if not line_sender_opts_max_name_len(self._opts, c_max_name_len, &err): + raise c_err_to_py(err) + if request_timeout is not None: if isinstance(request_timeout, int): c_request_timeout = 
request_timeout @@ -1907,10 +2013,6 @@ cdef class Sender: &self._auto_flush_mode) self._init_buf_size = init_buf_size or 65536 - self._max_name_len = max_name_len or 127 - self._buffer = Buffer( - init_buf_size=self._init_buf_size, - max_name_len=self._max_name_len) self._last_flush_ms = calloc(1, sizeof(int64_t)) def __cinit__(self): @@ -1921,7 +2023,6 @@ cdef class Sender: self._auto_flush_mode.enabled = False self._last_flush_ms = NULL self._init_buf_size = 0 - self._max_name_len = 0 self._in_txn = False def __init__( @@ -1948,6 +2049,7 @@ cdef class Sender: object auto_flush_rows=None, # Default 75000 (HTTP) or 600 (TCP) object auto_flush_bytes=None, # Default off object auto_flush_interval=None, # Default 1000 milliseconds + object protocol_version=None, # Default auto object init_buf_size=None, # 64KiB object max_name_len=None): # 127 @@ -1991,6 +2093,7 @@ cdef class Sender: auto_flush_rows, auto_flush_bytes, auto_flush_interval, + protocol_version, init_buf_size, max_name_len) finally: @@ -2018,6 +2121,7 @@ cdef class Sender: object auto_flush_rows=None, # Default 75000 (HTTP) or 600 (TCP) object auto_flush_bytes=None, # Default off object auto_flush_interval=None, # Default 1000 milliseconds + object protocol_version=None, # Default auto object init_buf_size=None, # 64KiB object max_name_len=None): # 127 """ @@ -2072,6 +2176,7 @@ cdef class Sender: 'auto_flush_rows': auto_flush_rows, 'auto_flush_bytes': auto_flush_bytes, 'auto_flush_interval': auto_flush_interval, + 'protocol_version': protocol_version, 'init_buf_size': init_buf_size, 'max_name_len': max_name_len, }.items(): @@ -2112,6 +2217,7 @@ cdef class Sender: params.get('auto_flush_rows'), params.get('auto_flush_bytes'), params.get('auto_flush_interval'), + params.get('protocol_version'), params.get('init_buf_size'), params.get('max_name_len')) @@ -2140,6 +2246,7 @@ cdef class Sender: object auto_flush_rows=None, # Default 75000 (HTTP) or 600 (TCP) object auto_flush_bytes=None, # Default off object 
auto_flush_interval=None, # Default 1000 milliseconds + object protocol_version=None, # Default auto object init_buf_size=None, # 64KiB object max_name_len=None): # 127 """ @@ -2179,6 +2286,7 @@ cdef class Sender: auto_flush_rows=auto_flush_rows, auto_flush_bytes=auto_flush_bytes, auto_flush_interval=auto_flush_interval, + protocol_version=protocol_version, init_buf_size=init_buf_size, max_name_len=max_name_len) @@ -2191,8 +2299,9 @@ cdef class Sender: `max_name_len`. """ return Buffer( + protocol_version=self.protocol_version, init_buf_size=self._init_buf_size, - max_name_len=self._max_name_len) + max_name_len=self.max_name_len) @property def init_buf_size(self) -> int: @@ -2202,7 +2311,11 @@ cdef class Sender: @property def max_name_len(self) -> int: """Maximum length of a table or column name.""" - return self._max_name_len + if self._impl == NULL: + raise IngressError( + IngressErrorCode.InvalidApiCall, + 'max_name_len() can\'t be called: Sender is closed.') + return line_sender_get_max_name_len(self._impl) @property def auto_flush(self) -> bint: @@ -2247,6 +2360,23 @@ cdef class Sender: return None return timedelta(milliseconds=self._auto_flush_mode.interval) + @property + def protocol_version(self) -> int: + """ + The protocol version used by the sender. + + Protocol version 1 is retained for backwards compatibility with + older QuestDB versions. + + Protocol version 2 introduces binary floating point support and + the array datatype. + """ + if self._impl == NULL: + raise IngressError( + IngressErrorCode.InvalidApiCall, + 'protocol_version() can\'t be called: Sender is closed.') + return line_sender_get_protocol_version(self._impl) + def establish(self): """ Prepare the sender for use. @@ -2260,20 +2390,32 @@ cdef class Sender: method will return only *after* the handshake(s) is/are complete. 
""" cdef line_sender_error* err = NULL + cdef PyThreadState * gs = NULL if self._opts == NULL: raise IngressError( IngressErrorCode.InvalidApiCall, 'establish() can\'t be called after close().') + + # We disable the GIL when calling `line_sender_build` since for HTTP + # it can make HTTP requests to auto-detect the protocol version. + _ensure_doesnt_have_gil(&gs) self._impl = line_sender_build(self._opts, &err) + _ensure_has_gil(&gs) + if self._impl == NULL: raise c_err_to_py(err) + + if self._buffer is None: + self._buffer = Buffer( + protocol_version=self.protocol_version, + init_buf_size=self._init_buf_size, + max_name_len=self.max_name_len) + line_sender_opts_free(self._opts) self._opts = NULL # Request callbacks when rows are complete. - if self._buffer is not None: - self._buffer._row_complete_sender = PyWeakref_NewRef(self, None) - + self._buffer._row_complete_sender = PyWeakref_NewRef(self, None) self._last_flush_ms[0] = line_sender_now_micros() // 1000 def __enter__(self) -> Sender: @@ -2281,23 +2423,29 @@ cdef class Sender: self.establish() return self - def __str__(self) -> str: + def __bytes__(self) -> bytes: """ Inspect the contents of the internal buffer. - The ``str`` value returned represents the unsent data. + The ``bytes`` value returned represents the unsent data. Also see :func:`Sender.__len__`. """ - return str(self._buffer) + if self._buffer is None: + return b'' + else: + return bytes(self._buffer) def __len__(self) -> int: """ Number of bytes of unsent data in the internal buffer. - Equivalent (but cheaper) to ``len(str(sender))``. + Equivalent (but cheaper) to ``len(bytes(sender))``. 
""" - return len(self._buffer) + if self._buffer is None: + return 0 + else: + return len(self._buffer) def transaction(self, table_name: str): """ @@ -2311,8 +2459,8 @@ cdef class Sender: symbols: Optional[Dict[str, str]]=None, columns: Optional[Dict[ str, - Union[bool, int, float, str, TimestampMicros, datetime]]]=None, - at: Union[TimestampNanos, datetime, ServerTimestamp]): + Union[bool, int, float, str, TimestampMicros, datetime, np.ndarray]]]=None, + at: Union[TimestampNanos, datetime, ServerTimestampType]): """ Write a row to the internal buffer. @@ -2330,6 +2478,12 @@ cdef class Sender: IngressErrorCode.InvalidTimestamp, "`at` must be of type TimestampNanos, datetime, or ServerTimestamp" ) + if self._buffer is None: + raise IngressError( + IngressErrorCode.InvalidApiCall, + "row() can\'t be called: Sender is closed." + ) + self._buffer.row(table_name, symbols=symbols, columns=columns, at=at) return self @@ -2340,7 +2494,7 @@ cdef class Sender: table_name: Optional[str] = None, table_name_col: Union[None, int, str] = None, symbols: Union[str, bool, List[int], List[str]] = 'auto', - at: Union[ServerTimestamp, int, str, TimestampNanos, datetime]): + at: Union[ServerTimestampType, int, str, TimestampNanos, datetime]): """ Write a Pandas DataFrame to the internal buffer. @@ -2395,6 +2549,12 @@ cdef class Sender: af.sender = self._impl af.mode = self._auto_flush_mode af.last_flush_ms = self._last_flush_ms + + if self._buffer is None: + raise IngressError( + IngressErrorCode.InvalidApiCall, + "dataframe() can\'t be called: Sender is closed." 
+ ) _dataframe( af, self._buffer._impl, @@ -2454,7 +2614,7 @@ cdef class Sender: if sender == NULL: raise IngressError( IngressErrorCode.InvalidApiCall, - 'flush() can\'t be called: Not connected.') + 'flush() can\'t be called: Sender is closed.') if buffer is not None: c_buf = buffer._impl else: diff --git a/src/questdb/line_sender.pxd b/src/questdb/line_sender.pxd index 50490ab9..8a28c0d3 100644 --- a/src/questdb/line_sender.pxd +++ b/src/questdb/line_sender.pxd @@ -22,7 +22,7 @@ ## ################################################################################ -from libc.stdint cimport int64_t, uint16_t, uint64_t +from libc.stdint cimport int64_t, uint16_t, uint64_t, uint8_t, uint32_t, int32_t cdef extern from "questdb/ingress/line_sender.h": cdef struct line_sender_error: @@ -40,6 +40,10 @@ cdef extern from "questdb/ingress/line_sender.h": line_sender_error_http_not_supported, line_sender_error_server_flush_error, line_sender_error_config_error, + line_sender_error_array_large_dim + line_sender_error_array_view_internal_error + line_sender_error_array_view_write_to_buffer_error + line_sender_error_protocol_version_error cdef enum line_sender_protocol: line_sender_protocol_tcp, @@ -47,6 +51,10 @@ cdef extern from "questdb/ingress/line_sender.h": line_sender_protocol_http, line_sender_protocol_https, + cdef enum line_sender_protocol_version: + line_sender_protocol_version_1 = 1, + line_sender_protocol_version_2 = 2, + cdef enum line_sender_ca: line_sender_ca_webpki_roots, line_sender_ca_os_roots, @@ -102,6 +110,10 @@ cdef extern from "questdb/ingress/line_sender.h": size_t len const char* buf + cdef struct line_sender_buffer_view: + size_t len + const uint8_t* buf + bint line_sender_column_name_init( line_sender_column_name* name, size_t len, @@ -118,9 +130,11 @@ cdef extern from "questdb/ingress/line_sender.h": pass line_sender_buffer* line_sender_buffer_new( + line_sender_protocol_version version, ) noexcept nogil line_sender_buffer* 
line_sender_buffer_with_max_name_len( + line_sender_protocol_version version, size_t max_name_len ) noexcept nogil @@ -171,9 +185,8 @@ cdef extern from "questdb/ingress/line_sender.h": const line_sender_buffer* buffer ) noexcept nogil - const char* line_sender_buffer_peek( - const line_sender_buffer* buffer, - size_t* len_out + line_sender_buffer_view line_sender_buffer_peek( + const line_sender_buffer* buffer ) noexcept nogil bint line_sender_buffer_table( @@ -217,6 +230,17 @@ cdef extern from "questdb/ingress/line_sender.h": line_sender_error** err_out ) noexcept nogil + bint line_sender_buffer_column_f64_arr_byte_strides( + line_sender_buffer* buffer, + line_sender_column_name name, + size_t rank, + const size_t* shapes, + const ssize_t* strides, + const uint8_t* data_buffer, + size_t data_buffer_len, + line_sender_error** err_out + ) noexcept nogil + bint line_sender_buffer_column_ts_nanos( line_sender_buffer* buffer, line_sender_column_name name, @@ -248,6 +272,11 @@ cdef extern from "questdb/ingress/line_sender.h": line_sender_error** err_out ) noexcept nogil + bint line_sender_buffer_check_can_flush( + const line_sender_buffer* buffer, + line_sender_error** err_out + ) noexcept nogil + cdef struct line_sender: pass @@ -311,6 +340,12 @@ cdef extern from "questdb/ingress/line_sender.h": line_sender_error** err_out ) noexcept nogil + bint line_sender_opts_protocol_version( + line_sender_opts* opts, + line_sender_protocol_version version, + line_sender_error** err_out + ) noexcept nogil + bint line_sender_opts_auth_timeout( line_sender_opts* opts, uint64_t millis, @@ -341,6 +376,12 @@ cdef extern from "questdb/ingress/line_sender.h": line_sender_error** err_out ) noexcept nogil + bint line_sender_opts_max_name_len( + line_sender_opts* opts, + size_t max_name_len, + line_sender_error** err_out + ) noexcept nogil + bint line_sender_opts_retry_timeout( line_sender_opts* opts, uint64_t millis, @@ -381,6 +422,18 @@ cdef extern from "questdb/ingress/line_sender.h": 
line_sender_error** err_out ) noexcept nogil + line_sender_protocol_version line_sender_get_protocol_version( + const line_sender * sender + ) noexcept nogil + + size_t line_sender_get_max_name_len( + const line_sender * sender + ) noexcept nogil + + line_sender_buffer* line_sender_buffer_new_for_sender( + const line_sender * sender + ) noexcept nogil + bint line_sender_must_close( const line_sender* sender ) noexcept nogil diff --git a/test/benchmark.py b/test/benchmark.py index 245cbb00..5bf295f2 100644 --- a/test/benchmark.py +++ b/test/benchmark.py @@ -27,7 +27,7 @@ def test_pystr_i64_10m(self): 'a': slist, 'b': list(range(len(slist)))}) - buf = qi.Buffer() + buf = qi.Buffer(protocol_version=1) # Warm up and pre-size buffer buf.dataframe(df, table_name='tbl1', symbols=True, at=qi.ServerTimestamp) @@ -53,7 +53,7 @@ def test_mixed_10m(self): ['a', 'b', 'c', 'a', None, 'c', 'a', float('nan')] * (count // 8))}) - buf = qi.Buffer() + buf = qi.Buffer(protocol_version=2) # Warm up and pre-size buffer buf.dataframe(df, table_name='tbl1', symbols=True, at=qi.ServerTimestamp) @@ -77,7 +77,7 @@ def test_string_escaping_10m(self): 'col5': series, 'col6': series}) - buf = qi.Buffer() + buf = qi.Buffer(protocol_version=2) # Warm up and pre-size buffer buf.dataframe(df, table_name='tbl1', symbols=True, at=qi.ServerTimestamp) @@ -109,7 +109,7 @@ def test_string_encoding_10m(self): 'col4': slist, 'col5': slist}) - buf = qi.Buffer() + buf = qi.Buffer(protocol_version=2) # Warm up and pre-size buffer buf.dataframe(df, table_name='tbl1', symbols=False, at=qi.ServerTimestamp) @@ -133,7 +133,7 @@ def _test_gil_release_10m(self, threads): 'col6': series}) tpe = ThreadPoolExecutor(max_workers=threads) - bufs = [qi.Buffer() for _ in range(threads)] + bufs = [qi.Buffer(protocol_version=2) for _ in range(threads)] def benchmark_run(buf): t0 = time.monotonic() diff --git a/test/mock_server.py b/test/mock_server.py index 281b1742..bcac584d 100644 --- a/test/mock_server.py +++ 
b/test/mock_server.py @@ -1,9 +1,11 @@ +import json import socket import select import re import http.server as hs import threading import time +import struct NON_ESCAPED_NEW_LINE_RE = re.compile(rb'(? len(buf): + break + index = new_index + continue + + if index > 0 and buf[index] == ord('\n') and buf[index - 1] != ord('\\'): + new_msgs.append(buf[head:index]) + head = index + 1 + + index += 1 + self.msgs.extend(new_msgs) return new_msgs + def _parse_binary_data(self, buf, index): + if buf[index] != ord('=') or index + 1 >= len(buf) or buf[index + 1] != ord('='): + return index + + index += 2 # skip "==" + if index >= len(buf): + return index + binary_type = buf[index] + index += 1 + + if binary_type == 16: + index += 8 + elif binary_type == 14: + # dims + if index + 1 >= len(buf): + return index + index += 1 + if index >= len(buf): + return index + dims = buf[index] + index += 1 + + total_elements = 1 + for _ in range(dims): + if index + 4 > len(buf): + return index + dim_size = struct.unpack('= 15: # 5ms grace period. 
@@ -870,17 +963,17 @@ def test_auto_flush_interval(self): def _do_test_auto_flush_interval2(self): with HttpServer() as server, self.builder( 'http', - 'localhost', + '127.0.0.1', server.port, - auto_flush_interval=10, + auto_flush_interval=100, auto_flush_rows=False, auto_flush_bytes=False) as sender: sender.row('t', columns={'x': 1}, at=qi.ServerTimestamp) sender.row('t', columns={'x': 2}, at=qi.ServerTimestamp) - time.sleep(0.02) + time.sleep(0.2) sender.row('t', columns={'x': 3}, at=qi.ServerTimestamp) sender.row('t', columns={'x': 4}, at=qi.ServerTimestamp) - time.sleep(0.02) + time.sleep(0.2) sender.row('t', columns={'x': 5}, at=qi.ServerTimestamp) sender.row('t', columns={'x': 6}, at=qi.ServerTimestamp) return server.requests @@ -904,22 +997,22 @@ def test_auto_flush_interval2(self): self.assertEqual(len(requests), 3) def test_http_username_password(self): - with HttpServer() as server, self.builder('http', 'localhost', server.port, username='user', + with HttpServer() as server, self.builder('http', '127.0.0.1', server.port, username='user', password='pass') as sender: sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) self.assertEqual(len(server.requests), 1) self.assertEqual(server.requests[0], b'tbl1 x=42i\n') - self.assertEqual(server.headers[0]['Authorization'], 'Basic dXNlcjpwYXNz') + self.assertEqual(server.headers[1]['authorization'], 'Basic dXNlcjpwYXNz') def test_http_token(self): - with HttpServer() as server, self.builder('http', 'localhost', server.port, token='Yogi') as sender: + with HttpServer() as server, self.builder('http', '127.0.0.1', server.port, token='Yogi') as sender: sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) self.assertEqual(len(server.requests), 1) self.assertEqual(server.requests[0], b'tbl1 x=42i\n') - self.assertEqual(server.headers[0]['Authorization'], 'Bearer Yogi') + self.assertEqual(server.headers[1]['authorization'], 'Bearer Yogi') def test_max_buf_size(self): - with HttpServer() as server, 
self.builder('http', 'localhost', server.port, max_buf_size=1024, + with HttpServer() as server, self.builder('http', '127.0.0.1', server.port, max_buf_size=1024, auto_flush=False) as sender: while len(sender) < 1024: sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) @@ -929,7 +1022,7 @@ def test_max_buf_size(self): def test_http_err(self): with HttpServer() as server, self.builder( 'http', - 'localhost', + '127.0.0.1', server.port, retry_timeout=datetime.timedelta(milliseconds=1)) as sender: server.responses.append((0, 500, 'text/plain', b'Internal Server Error')) @@ -942,7 +1035,7 @@ def test_http_err_retry(self): exp_payload = b'tbl1 x=42i\n' with HttpServer() as server, self.builder( 'http', - 'localhost', + '127.0.0.1', server.port, retry_timeout=datetime.timedelta(seconds=1)) as sender: server.responses.append((0, 500, 'text/plain', b'retriable error')) @@ -956,9 +1049,10 @@ def test_http_err_retry(self): def test_http_request_min_throughput(self): with HttpServer(delay_seconds=2) as server, self.builder( 'http', - 'localhost', + '127.0.0.1', server.port, request_timeout=1000, + protocol_version='2', # request_timeout is sufficiently high since it's also used as a connect timeout and we want to # survive hiccups on CI. it should be lower than the server delay though to actually test the # effect of request_min_throughput. 
@@ -970,38 +1064,223 @@ def test_http_request_min_throughput(self): def test_http_request_min_throughput_timeout(self): with HttpServer() as server, self.builder( 'http', - 'localhost', + '127.0.0.1', server.port, auto_flush='off', - request_timeout=1, + request_timeout=100, retry_timeout=0, - request_min_throughput=100000000) as sender: - sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) - sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) - sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) - sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) - sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) - - # wait 5ms in the server to simulate a slow response - server.responses.append((5, 200, 'text/plain', b'OK')) - - with self.assertRaisesRegex(qi.IngressError, 'timed out reading response'): - sender.flush() + # effectively calculates a ~1ms timeout + request_min_throughput=100000000, + protocol_version=2) as sender: + buffer = sender.new_buffer() + buffer.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) + buffer.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) + buffer.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) + buffer.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) + buffer.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) + + # wait 50ms in the server to simulate a slow response + with self.assertRaisesRegex(qi.IngressError, 'timeout: per call') as cm: + for _ in range(10): + server.responses.append((500, 200, 'text/plain', b'OK')) + # We retry in case the network thread gets descheduled + # and is only rescheduled after the timeout elapsed. 
+ sender.flush(buffer, clear=False) def test_http_request_timeout(self): with HttpServer() as server, self.builder( 'http', - 'localhost', + '127.0.0.1', server.port, retry_timeout=0, request_min_throughput=0, # disable + protocol_version=2, request_timeout=datetime.timedelta(milliseconds=5)) as sender: - # wait for 10ms in the server to simulate a slow response - server.responses.append((20, 200, 'text/plain', b'OK')) + # wait for 50ms in the server to simulate a slow response + server.responses.append((50, 200, 'text/plain', b'OK')) sender.row('tbl1', columns={'x': 42}, at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, 'timed out reading response'): + with self.assertRaisesRegex(qi.IngressError, 'timeout: per call'): sender.flush() + def test_http_server_not_serve(self): + with self.assertRaisesRegex(qi.IngressError, 'Could not detect server\'s line protocol version, settings url: http://127.0.0.1:1234/settings'): + with self.builder( + 'http', + '127.0.0.1', + 1234, + protocol_version='auto') as sender: + sender.row('tbl1', columns={'x': 42}) + + def test_http_auto_protocol_version_only_v1(self): + self._test_sender_http_auto_protocol_version(SETTINGS_WITH_PROTOCOL_VERSION_V1, 1) + + def test_http_auto_protocol_version_only_v2(self): + self._test_sender_http_auto_protocol_version(SETTINGS_WITH_PROTOCOL_VERSION_V2, 2) + + def test_http_auto_protocol_version_v1_v2(self): + self._test_sender_http_auto_protocol_version(SETTINGS_WITH_PROTOCOL_VERSION_V1_V2, 2) + + def test_http_auto_protocol_version_without_version(self): + self._test_sender_http_auto_protocol_version(SETTINGS_WITHOUT_PROTOCOL_VERSION, 1) + + def _test_sender_http_auto_protocol_version(self, settings, expected_version: int): + with HttpServer(settings) as server, self.builder('http', '127.0.0.1', server.port) as sender: + self.assertEqual(sender.protocol_version, expected_version) + buffer = sender.new_buffer() + buffer.row( + 'line_sender_buffer_old_server2', + symbols={'id': 
'Hola'}, + columns={'price': '111222233333i', 'qty': 3.5}, + at=qi.TimestampNanos(111222233333)) + exp = b'line_sender_buffer_old_server2,id=Hola price="111222233333i",qty' + _float_binary_bytes( + 3.5, expected_version == 1) + b' 111222233333\n' + self.assertEqual(bytes(buffer), exp) + sender.flush(buffer) + self.assertEqual(len(server.requests), 1) + self.assertEqual(server.requests[0], exp) + + def test_http_auto_protocol_version_unsupported_client(self): + with self.assertRaisesRegex(qi.IngressError, 'Server does not support current client'): + with HttpServer(SETTINGS_WITH_PROTOCOL_VERSION_V3) as server, self.builder('http', '127.0.0.1', server.port) as sender: + sender.row('tbl1', columns={'x': 42}) + + def test_specify_line_protocol_explicitly(self): + with HttpServer() as server, self.builder('http', '127.0.0.1', server.port, protocol_version='1') as sender: + buffer = sender.new_buffer() + buffer.row( + 'line_sender_buffer', + symbols={'id': 'Hola'}, + columns={'qty': 3.5}, + at=qi.TimestampNanos(111222233333)) + exp = b'line_sender_buffer,id=Hola qty' + _float_binary_bytes( + 3.5, True) + b' 111222233333\n' + self.assertEqual(bytes(buffer), exp) + sender.flush(buffer) + self.assertEqual(len(server.requests), 1) + self.assertEqual(server.requests[0], exp) + + def test_line_protocol_version_on_tcp(self): + with Server() as server, self.builder('tcp', '127.0.0.1', server.port, protocol_version='1') as sender: + server.accept() + self.assertEqual(server.recv(), []) + buffer = sender.new_buffer() + buffer.row( + 'line_sender_buffer_tcp_v1', + symbols={'id': 'Hola'}, + columns={'qty': 3.5}, + at=qi.TimestampNanos(111222233333)) + exp = b'line_sender_buffer_tcp_v1,id=Hola qty=3.5 111222233333\n' + self.assertEqual(bytes(buffer), exp) + sender.flush(buffer) + self.assertEqual(server.recv()[0] + b'\n', exp) + + with Server() as server, self.builder('tcp', '127.0.0.1', server.port, protocol_version='2') as sender: + server.accept() + self.assertEqual(server.recv(), 
[]) + buffer = sender.new_buffer() + buffer.row( + 'line_sender_buffer_tcp_v1', + symbols={'id': 'Hola'}, + columns={'qty': 3.5}, + at=qi.TimestampNanos(111222233333)) + exp = b'line_sender_buffer_tcp_v1,id=Hola qty' + _float_binary_bytes(3.5) + b' 111222233333\n' + self.assertEqual(bytes(buffer), exp) + sender.flush(buffer) + self.assertEqual(server.recv()[0] + b'\n', exp) + + with Server() as server, self.builder('tcp', '127.0.0.1', server.port, protocol_version='auto') as sender: + server.accept() + self.assertEqual(server.recv(), []) + buffer = sender.new_buffer() + buffer.row( + 'line_sender_buffer_tcp_v1', + symbols={'id': 'Hola'}, + columns={'qty': 3.5}, + at=qi.TimestampNanos(111222233333)) + exp = b'line_sender_buffer_tcp_v1,id=Hola qty=3.5 111222233333\n' + self.assertEqual(bytes(buffer), exp) + sender.flush(buffer) + self.assertEqual(server.recv()[0] + b'\n', exp) + + def _test_array_basic(self, arr: np.ndarray): + # http + with HttpServer() as server, self.builder('http', '127.0.0.1', server.port) as sender: + sender.row( + 'array_test', + columns={'array': arr}, + at=qi.TimestampNanos(11111)) + exp = b'array_test array=' + _array_binary_bytes(arr) + b' 11111\n' + sender.flush() + self.assertEqual(len(server.requests), 1) + self.assertEqual(server.requests[0], exp) + + #tcp + with Server() as server, self.builder('tcp', '127.0.0.1', server.port, protocol_version=2) as sender: + server.accept() + self.assertEqual(server.recv(), []) + sender.row( + 'array_test', + columns={'array': arr}, + at=qi.TimestampNanos(11111)) + exp = b'array_test array=' + _array_binary_bytes(arr) + b' 11111\n' + self.assertEqual(bytes(sender), exp) + sender.flush() + self.assertEqual(server.recv()[0] + b'\n', exp) + + def test_array_basic(self): + self._test_array_basic(np.array([1.2345678901234567, 2.3456789012345678], dtype=np.float64)) + + def test_empty_array(self): + self._test_array_basic(np.array([], dtype=np.float64)) + + def test_non_contiguous_array(self): + base = 
np.arange(6, dtype=np.float64).reshape(2, 3) + non_contig_arr = base[:, ::2] + self._test_array_basic(non_contig_arr) + + def test_minus_stride_array(self): + self._test_array_basic(np.array([1.1, 2.2, 3.3], dtype=np.float64)[::-1]) + + def test_array_error_cases(self): + # zero dimensional array + with self.assertRaisesRegex(qi.IngressError, "Zero-dimensional arrays are not supported"): + scalar_arr = np.array(42.0, dtype=np.float64) + with HttpServer() as server, self.builder('http', '127.0.0.1', server.port) as sender: + sender.row( + 'array_test', + columns={'array': scalar_arr}, + at=qi.TimestampNanos(11111)) + + # not f64 dtype array + with self.assertRaisesRegex(qi.IngressError, "Only float64 numpy arrays are supported, got dtype: complex64"): + complex_arr = np.array([1 + 2j], dtype=np.complex64) + with HttpServer() as server, self.builder('http', '127.0.0.1', server.port) as sender: + sender.row( + 'array_test', + columns={'array': complex_arr}, + at=qi.TimestampNanos(11111)) + + # max dims + if NUMPY_VERSION >= (2,): + # Note: Older numpy versions don't support more than 32 dimensions. + with self.assertRaisesRegex(qi.IngressError, "Array dimension mismatch: expected at most 32 dimensions, but got 33"): + dims = (1,) * 33 + array = np.empty(dims, dtype=np.float64) + with Server() as server, self.builder('tcp', '127.0.0.1', server.port, protocol_version="2") as sender: + sender.row( + 'array_test', + columns={'array': array}, + at=qi.TimestampNanos(11111)) + + # default protocol version is v1, which does not support array datatype. 
+ with self.assertRaisesRegex(qi.IngressError, "Protocol version v1 does not support array datatype"): + array = np.zeros([1,2], dtype=np.float64) + with Server() as server, self.builder('tcp', '127.0.0.1', server.port) as sender: + sender.row( + 'array_test', + columns={'array': array}, + at=qi.TimestampNanos(11111)) + class Timestamp(unittest.TestCase): def test_from_int(self): ns = 1670857929778202000 @@ -1093,6 +1372,7 @@ def encode_int_or_off(v): 'auto_flush_rows': encode_int_or_off, 'auto_flush_bytes': encode_int_or_off, 'auto_flush_interval': encode_duration_or_off, + 'protocol_version': str, 'init_buf_size': str, 'max_name_len': str, } @@ -1157,6 +1437,16 @@ class TestSenderEnv(TestBases.TestSender): builder = Builder.ENV +class TestBufferProtocolVersionV1(TestBases.TestBuffer): + name = 'protocol version 1' + version = 1 + + +class TestBufferProtocolVersionV2(TestBases.TestBuffer): + name = 'protocol version 2' + version = 2 + + if __name__ == '__main__': if os.environ.get('TEST_QUESTDB_PROFILE') == '1': import cProfile diff --git a/test/test_dataframe.py b/test/test_dataframe.py index cbd082e0..a3104cba 100644 --- a/test/test_dataframe.py +++ b/test/test_dataframe.py @@ -8,6 +8,7 @@ import functools import tempfile import pathlib +from test_tools import _float_binary_bytes, _array_binary_bytes BROKEN_TIMEZONES = True @@ -32,10 +33,11 @@ fastparquet = None -def _dataframe(*args, **kwargs): - buf = qi.Buffer() +def _dataframe(protocol_version: int, *args, **kwargs): + buf = qi.Buffer(protocol_version=protocol_version) buf.dataframe(*args, **kwargs) - return str(buf) + return bytes(buf) + DF1 = pd.DataFrame({ 'A': [1.0, 2.0, 3.0], @@ -60,6 +62,24 @@ def _dataframe(*args, **kwargs): pd.Timestamp('20180311'), pd.Timestamp('20180312')]}) +DF3 = pd.DataFrame({ + 'T': ['t1', 't2', 't1'], + 'A': ['a1', 'a2', 'a3'], + 'B': ['b1', None, 'b3'], + 'C': pd.Series(['b1', None, 'b3'], dtype='string'), + 'D': pd.Series(['a1', 'a2', 'a3'], dtype='string'), + 'E': [1.0, 
2.0, 3.0], + 'F': [1, 2, 3], + "G": [ + np.array([1.0]), + np.array([10.0]), + np.array([100.0])], + 'H': [ + pd.Timestamp('20180310'), + pd.Timestamp('20180311'), + pd.Timestamp('20180312')]} +) + def with_tmp_dir(func): @functools.wraps(func) @@ -68,1524 +88,1584 @@ def wrapper(self, *args, **kwargs): return func(self, *args, pathlib.Path(tmpdir), **kwargs) return wrapper +class TestPandasBase: + class TestPandas(unittest.TestCase): + def test_mandatory_at_dataframe(self): + with self.assertRaisesRegex(TypeError, "needs keyword-only argument at"): + _dataframe(self.version, []) + with self.assertRaisesRegex(TypeError, "needs keyword-only argument at"): + buf = qi.Buffer(protocol_version=self.version) + buf.dataframe([]) + + buf = qi.Buffer(protocol_version=self.version) + buf.dataframe(pd.DataFrame(), at=qi.ServerTimestamp) + + def test_mandatory_at_row(self): + with self.assertRaisesRegex(TypeError, "needs keyword-only argument at"): + buf = qi.Buffer(protocol_version=self.version) + buf.row(table_name="test_buffer") + + buf = qi.Buffer(protocol_version=self.version) + buf.row(table_name="test_mandatory_at_row", at=qi.ServerTimestamp) + + def test_bad_dataframe(self): + with self.assertRaisesRegex(qi.IngressError, + 'Expected pandas'): + _dataframe(self.version, [], at=qi.ServerTimestamp) + + def test_no_table_name(self): + with self.assertRaisesRegex(qi.IngressError, + 'Must specify at least one of'): + _dataframe(self.version, DF1, at=qi.ServerTimestamp) + + def test_bad_table_name_type(self): + with self.assertRaisesRegex(TypeError, "'table_name' has incorrect type"): + _dataframe(self.version, DF1, table_name=1.5, at=qi.ServerTimestamp) -class TestPandas(unittest.TestCase): - def test_mandatory_at_dataframe(self): - with self.assertRaisesRegex(TypeError, "needs keyword-only argument at"): - _dataframe([]) - with self.assertRaisesRegex(TypeError, "needs keyword-only argument at"): - buf = qi.Buffer() - buf.dataframe([]) - - buf = qi.Buffer() - 
buf.dataframe(pd.DataFrame(), at=qi.ServerTimestamp) - - def test_mandatory_at_row(self): - with self.assertRaisesRegex(TypeError, "needs keyword-only argument at"): - buf = qi.Buffer() - buf.row(table_name="test_buffer") - - buf = qi.Buffer() - buf.row(table_name="test_mandatory_at_row", at=qi.ServerTimestamp) - - def test_bad_dataframe(self): - with self.assertRaisesRegex(qi.IngressError, - 'Expected pandas'): - _dataframe([], at=qi.ServerTimestamp) - - def test_no_table_name(self): - with self.assertRaisesRegex(qi.IngressError, - 'Must specify at least one of'): - _dataframe(DF1, at=qi.ServerTimestamp) - - def test_bad_table_name_type(self): - with self.assertRaisesRegex(TypeError, "'table_name' has incorrect type"): - _dataframe(DF1, table_name=1.5, at=qi.ServerTimestamp) - - def test_invalid_table_name(self): - with self.assertRaisesRegex(qi.IngressError, - '`table_name`: Bad string "."'): - _dataframe(DF1, table_name='.', at=qi.ServerTimestamp) - - def test_invalid_column_dtype(self): - with self.assertRaisesRegex(qi.IngressError, - '`table_name_col`: Bad dtype'): - _dataframe(DF1, table_name_col='B', at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - '`table_name_col`: Bad dtype'): - _dataframe(DF1, table_name_col=1, at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - '`table_name_col`: Bad dtype'): - _dataframe(DF1, table_name_col=-3, at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - '`table_name_col`: -5 index'): - _dataframe(DF1, table_name_col=-5, at=qi.ServerTimestamp) - - def test_bad_str_obj_col(self): - with self.assertRaisesRegex(qi.IngressError, - "`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): - _dataframe(DF1, table_name_col='D', at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - "`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): - _dataframe(DF1, table_name_col=3, at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - 
"`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): - _dataframe(DF1, table_name_col=-1, at=qi.ServerTimestamp) - - def test_bad_symbol(self): - with self.assertRaisesRegex(qi.IngressError, - '`symbols`.*bool.*tuple.*list'): - _dataframe(DF1, table_name='tbl1', symbols=0, at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - '`symbols`.*bool.*tuple.*list'): - _dataframe(DF1, table_name='tbl1', symbols={}, at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - '`symbols`.*bool.*tuple.*list'): - _dataframe(DF1, table_name='tbl1', symbols=None, at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - "`symbols`: Bad dtype `float64`.*'A'.*Must.*strings col"): - _dataframe(DF1, table_name='tbl1', symbols=(0,), at=qi.ServerTimestamp) - with self.assertRaisesRegex(qi.IngressError, - "`symbols`: Bad dtype `int64`.*'B'.*Must be a strings column."): - _dataframe(DF1, table_name='tbl1', symbols=[1], at=qi.ServerTimestamp) - - def test_bad_at(self): - with self.assertRaisesRegex(qi.IngressError, - '`at`.*2018.*not found in the'): - _dataframe(DF1, table_name='tbl1', at='2018-03-10T00:00:00Z') - with self.assertRaisesRegex(qi.IngressError, - '`at`.*float64.*be a datetime'): - _dataframe(DF1, table_name='tbl1', at='A') - with self.assertRaisesRegex(qi.IngressError, - '`at`.*int64.*be a datetime'): - _dataframe(DF1, table_name='tbl1', at=1) - with self.assertRaisesRegex(qi.IngressError, - '`at`.*object.*be a datetime'): - _dataframe(DF1, table_name='tbl1', at=-1) - - def test_empty_dataframe(self): - buf = _dataframe(pd.DataFrame(), table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual(buf, '') - - def test_zero_row_dataframe(self): - buf = _dataframe(pd.DataFrame(columns=['A', 'B']), table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual(buf, '') - - def test_zero_column_dataframe(self): - df = pd.DataFrame(index=[0, 1, 2]) - self.assertEqual(len(df), 3) - buf = _dataframe(df, table_name='tbl1', 
at=qi.ServerTimestamp) - self.assertEqual(buf, '') - - def test_basic(self): - buf = _dataframe( - DF2, - table_name_col='T', - symbols=['A', 'B', 'C', 'D'], - at=-1) - self.assertEqual( - buf, - 't1,A=a1,B=b1,C=b1,D=a1 E=1.0,F=1i 1520640000000000000\n' + - 't2,A=a2,D=a2 E=2.0,F=2i 1520726400000000000\n' + - 't1,A=a3,B=b3,C=b3,D=a3 E=3.0,F=3i 1520812800000000000\n') - - def test_named_dataframe(self): - df = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': ['a', 'b', 'c']}) - df.index.name = 'table_name' - buf = _dataframe(df, at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'table_name a=1i,b="a"\n' + - 'table_name a=2i,b="b"\n' + - 'table_name a=3i,b="c"\n') - - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n') - - buf = _dataframe(df, table_name_col='b', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'a a=1i\n' + - 'b a=2i\n' + - 'c a=3i\n') - - df.index.name = 42 # bad type, not str - with self.assertRaisesRegex(qi.IngressError, - 'Bad dataframe index name as table.*: Expected str, not.*int.'): - _dataframe(df, at=qi.ServerTimestamp) - - @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') - def test_at_good(self): - df = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': ['a', 'b', 'c']}) - df.index.name = 'test_at_good' - with self.assertRaisesRegex(qi.IngressError, - 'Bad argument `at`: Column .2018-03.* not found .* dataframe.'): - _dataframe(df, at='2018-03-10T00:00:00Z') - - # Same timestamp, specified in various ways. 
- t1_setup = dt.datetime(2018, 3, 10, 0, 0, 0, tzinfo=dt.timezone.utc) - t1 = t1_setup.astimezone(tz=None).replace(tzinfo=None) # naive, local - t2 = dt.datetime(2018, 3, 10, 0, 0, 0, tzinfo=dt.timezone.utc) - t3 = dt.datetime(2018, 3, 9, 19, 0, 0, tzinfo=_TZ) - t4 = qi.TimestampNanos(1520640000000000000) - t5 = qi.TimestampNanos.from_datetime(t1) - t6 = qi.TimestampNanos.from_datetime(t2) - t7 = qi.TimestampNanos.from_datetime(t3) - timestamps = [t1, t2, t3, t4, t5, t6, t7] - for ts in timestamps: - buf = _dataframe(df, table_name='tbl1', at=ts) + def test_invalid_table_name(self): + with self.assertRaisesRegex(qi.IngressError, + '`table_name`: Bad string "."'): + _dataframe(self.version, DF1, table_name='.', at=qi.ServerTimestamp) + + def test_invalid_column_dtype(self): + with self.assertRaisesRegex(qi.IngressError, + '`table_name_col`: Bad dtype'): + _dataframe(self.version, DF1, table_name_col='B', at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + '`table_name_col`: Bad dtype'): + _dataframe(self.version, DF1, table_name_col=1, at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + '`table_name_col`: Bad dtype'): + _dataframe(self.version, DF1, table_name_col=-3, at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + '`table_name_col`: -5 index'): + _dataframe(self.version, DF1, table_name_col=-5, at=qi.ServerTimestamp) + + def test_bad_str_obj_col(self): + with self.assertRaisesRegex(qi.IngressError, + "`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): + _dataframe(self.version, DF1, table_name_col='D', at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + "`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): + _dataframe(self.version, DF1, table_name_col=3, at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + "`table_name_col`: Bad.*`object`.*bool.*'D'.*Must.*strings"): + _dataframe(self.version, DF1, table_name_col=-1, at=qi.ServerTimestamp) + + 
def test_bad_symbol(self): + with self.assertRaisesRegex(qi.IngressError, + '`symbols`.*bool.*tuple.*list'): + _dataframe(self.version, DF1, table_name='tbl1', symbols=0, at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + '`symbols`.*bool.*tuple.*list'): + _dataframe(self.version, DF1, table_name='tbl1', symbols={}, at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + '`symbols`.*bool.*tuple.*list'): + _dataframe(self.version, DF1, table_name='tbl1', symbols=None, at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + "`symbols`: Bad dtype `float64`.*'A'.*Must.*strings col"): + _dataframe(self.version, DF1, table_name='tbl1', symbols=(0,), at=qi.ServerTimestamp) + with self.assertRaisesRegex(qi.IngressError, + "`symbols`: Bad dtype `int64`.*'B'.*Must be a strings column."): + _dataframe(self.version, DF1, table_name='tbl1', symbols=[1], at=qi.ServerTimestamp) + + def test_bad_at(self): + with self.assertRaisesRegex(qi.IngressError, + '`at`.*2018.*not found in the'): + _dataframe(self.version, DF1, table_name='tbl1', at='2018-03-10T00:00:00Z') + with self.assertRaisesRegex(qi.IngressError, + '`at`.*float64.*be a datetime'): + _dataframe(self.version, DF1, table_name='tbl1', at='A') + with self.assertRaisesRegex(qi.IngressError, + '`at`.*int64.*be a datetime'): + _dataframe(self.version, DF1, table_name='tbl1', at=1) + with self.assertRaisesRegex(qi.IngressError, + '`at`.*object.*be a datetime'): + _dataframe(self.version, DF1, table_name='tbl1', at=-1) + + def test_empty_dataframe(self): + buf = _dataframe(self.version, pd.DataFrame(), table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual(buf, b'') + + def test_zero_row_dataframe(self): + buf = _dataframe(self.version, pd.DataFrame(columns=['A', 'B']), table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual(buf, b'') + + def test_zero_column_dataframe(self): + df = pd.DataFrame(index=[0, 1, 2]) + self.assertEqual(len(df), 3) + buf = 
_dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual(buf, b'') + + def test_basic(self): + buf = _dataframe( + self.version, + DF2, + table_name_col='T', + symbols=['A', 'B', 'C', 'D'], + at=-1) self.assertEqual( buf, - 'tbl1 a=1i,b="a" 1520640000000000000\n' + - 'tbl1 a=2i,b="b" 1520640000000000000\n' + - 'tbl1 a=3i,b="c" 1520640000000000000\n') - - @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') - def test_at_neg(self): - n1 = dt.datetime(1965, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) - n2 = dt.datetime(1965, 1, 1, 0, 0, 0, tzinfo=_TZ) - n3 = dt.datetime(1965, 1, 1, 0, 0, 0) - neg_timestamps = [n1, n2, n3] - for ts in neg_timestamps: - with self.assertRaisesRegex(qi.IngressError, - 'Bad.*`at`: Cannot .* before the Unix epoch .1970-01-01.*'): - _dataframe(DF2, at=ts, table_name='test_at_neg') - - @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') - def test_at_ts_0(self): - df = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': ['a', 'b', 'c']}) - df.index.name = 'test_at_ts_0' - - # Epoch 0, specified in various ways. 
- e1_setup = dt.datetime(1970, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) - e1 = e1_setup.astimezone(tz=None).replace(tzinfo=None) # naive, local - e2 = dt.datetime(1970, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) - e3 = dt.datetime(1969, 12, 31, 19, 0, 0, tzinfo=_TZ) - e4 = qi.TimestampNanos(0) - e5 = qi.TimestampNanos.from_datetime(e1) - e6 = qi.TimestampNanos.from_datetime(e2) - e7 = qi.TimestampNanos.from_datetime(e3) - edge_timestamps = [e1, e2, e3, e4, e5, e6, e7] - - for ts in edge_timestamps: - buf = _dataframe(df, table_name='tbl1', at=ts) + b't1,A=a1,B=b1,C=b1,D=a1 E' + _float_binary_bytes(1.0, self.version == 1) + b',F=1i 1520640000000000000\n' + + b't2,A=a2,D=a2 E' + _float_binary_bytes(2.0, self.version == 1) + b',F=2i 1520726400000000000\n' + + b't1,A=a3,B=b3,C=b3,D=a3 E' + _float_binary_bytes(3.0, self.version == 1) + b',F=3i 1520812800000000000\n') + + def test_basic_with_arrays(self): + if self.version == 1: + self.skipTest('Protocol version v1 doesn\'t support arrays') + buf = _dataframe( + self.version, + DF3, + table_name_col='T', + symbols=['A', 'B', 'C', 'D'], + at=-1) + self.assertEqual( + buf, + b't1,A=a1,B=b1,C=b1,D=a1 E' + _float_binary_bytes(1.0, self.version == 1) + b',F=1i,G=' + _array_binary_bytes(np.array([1.0])) + b' 1520640000000000000\n' + + b't2,A=a2,D=a2 E' + _float_binary_bytes(2.0, self.version == 1) + b',F=2i,G=' + _array_binary_bytes(np.array([10.0])) + b' 1520726400000000000\n' + + b't1,A=a3,B=b3,C=b3,D=a3 E' + _float_binary_bytes(3.0, self.version == 1) + b',F=3i,G=' + _array_binary_bytes(np.array([100.0])) + b' 1520812800000000000\n') + + def test_named_dataframe(self): + df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': ['a', 'b', 'c']}) + df.index.name = 'table_name' + buf = _dataframe(self.version, df, at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'table_name a=1i,b="a"\n' + + b'table_name a=2i,b="b"\n' + + b'table_name a=3i,b="c"\n') + + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) 
self.assertEqual( buf, - 'tbl1 a=1i,b="a" 0\n' + - 'tbl1 a=2i,b="b" 0\n' + - 'tbl1 a=3i,b="c" 0\n') - - def test_single_at_col(self): - df = pd.DataFrame({'timestamp': pd.to_datetime(['2023-01-01'])}) - with self.assertRaisesRegex(qi.IngressError, - 'Bad dataframe row at index 0: All values are nulls.'): - _dataframe(df, table_name='tbl1', at='timestamp') - - def test_row_of_nulls(self): - df = pd.DataFrame({'a': ['a1', None, 'a3']}) - with self.assertRaisesRegex( - qi.IngressError, 'Bad dataframe row.*1: All values are nulls.'): - _dataframe(df, table_name='tbl1', symbols=['a'], at=qi.ServerTimestamp) - - def test_u8_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - 0, - 255], # u8 max - dtype='uint8')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i\n' + - 'tbl1 a=2i\n' + - 'tbl1 a=3i\n' + - 'tbl1 a=0i\n' + - 'tbl1 a=255i\n') - - def test_i8_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - -128, # i8 min - 127, # i8 max - 0], dtype='int8')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i\n' + - 'tbl1 a=2i\n' + - 'tbl1 a=3i\n' + - 'tbl1 a=-128i\n' + - 'tbl1 a=127i\n' + - 'tbl1 a=0i\n') - - def test_u16_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - 0, - 65535], # u16 max - dtype='uint16')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i\n' + - 'tbl1 a=2i\n' + - 'tbl1 a=3i\n' + - 'tbl1 a=0i\n' + - 'tbl1 a=65535i\n') - - def test_i16_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - -32768, # i16 min - 32767, # i16 max - 0], dtype='int16')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i\n' + - 'tbl1 a=2i\n' + - 'tbl1 a=3i\n' + - 'tbl1 a=-32768i\n' + - 'tbl1 a=32767i\n' + - 'tbl1 a=0i\n') - - def test_u32_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1, 
2, 3, - 0, - 4294967295], # u32 max - dtype='uint32')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i\n' + - 'tbl1 a=2i\n' + - 'tbl1 a=3i\n' + - 'tbl1 a=0i\n' + - 'tbl1 a=4294967295i\n') - - def test_i32_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - -2147483648, # i32 min - 0, - 2147483647], # i32 max - dtype='int32')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i\n' + - 'tbl1 a=2i\n' + - 'tbl1 a=3i\n' + - 'tbl1 a=-2147483648i\n' + - 'tbl1 a=0i\n' + - 'tbl1 a=2147483647i\n') - - def test_u64_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - 0, - 9223372036854775807], # i64 max - dtype='uint64')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i\n' + - 'tbl1 a=2i\n' + - 'tbl1 a=3i\n' + - 'tbl1 a=0i\n' + - 'tbl1 a=9223372036854775807i\n') - - buf = qi.Buffer() - buf.dataframe(pd.DataFrame({'b': [.5, 1.0, 1.5]}), table_name='tbl2', at=qi.ServerTimestamp) - exp1 = ( - 'tbl2 b=0.5\n' + - 'tbl2 b=1.0\n' + - 'tbl2 b=1.5\n') - self.assertEqual( - str(buf), - exp1) - df2 = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - 0, - 9223372036854775808], # i64 max + 1 - dtype='uint64')}) - with self.assertRaisesRegex( - qi.IngressError, - '.* serialize .* column .a. .* 4 .*9223372036854775808.*int64.*'): - buf.dataframe(df2, table_name='tbl1', at=qi.ServerTimestamp) - - self.assertEqual( - str(buf), - exp1) # No partial write of `df2`. 
- - def test_i64_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - -9223372036854775808, # i64 min - 0, - 9223372036854775807], # i64 max - dtype='int64')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i\n' + - 'tbl1 a=2i\n' + - 'tbl1 a=3i\n' + - 'tbl1 a=-9223372036854775808i\n' + - 'tbl1 a=0i\n' + - 'tbl1 a=9223372036854775807i\n') - - def test_f32_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1.0, 2.0, 3.0, - 0.0, - float('inf'), - float('-inf'), - float('nan'), - 3.4028234663852886e38], # f32 max - dtype='float32')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1.0\n' + - 'tbl1 a=2.0\n' + - 'tbl1 a=3.0\n' + - 'tbl1 a=0.0\n' + - 'tbl1 a=Infinity\n' + - 'tbl1 a=-Infinity\n' + - 'tbl1 a=NaN\n' + - 'tbl1 a=3.4028234663852886e38\n') - - def test_f64_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 1.0, 2.0, 3.0, - 0.0, - float('inf'), - float('-inf'), - float('nan'), - 1.7976931348623157e308], # f64 max - dtype='float64')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1.0\n' + - 'tbl1 a=2.0\n' + - 'tbl1 a=3.0\n' + - 'tbl1 a=0.0\n' + - 'tbl1 a=Infinity\n' + - 'tbl1 a=-Infinity\n' + - 'tbl1 a=NaN\n' + - 'tbl1 a=1.7976931348623157e308\n') - - def test_u8_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + b'tbl1 a=1i,b="a"\n' + + b'tbl1 a=2i,b="b"\n' + + b'tbl1 a=3i,b="c"\n') + + buf = _dataframe(self.version, df, table_name_col='b', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'a a=1i\n' + + b'b a=2i\n' + + b'c a=3i\n') + + df.index.name = 42 # bad type, not str + with self.assertRaisesRegex(qi.IngressError, + 'Bad dataframe index name as table.*: Expected str, not.*int.'): + _dataframe(self.version, df, at=qi.ServerTimestamp) + + @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') + def test_at_good(self): + df = pd.DataFrame({ + 
'a': [1, 2, 3], + 'b': ['a', 'b', 'c']}) + df.index.name = 'test_at_good' + with self.assertRaisesRegex(qi.IngressError, + 'Bad argument `at`: Column .2018-03.* not found .* dataframe.'): + _dataframe(self.version, df, at='2018-03-10T00:00:00Z') + + # Same timestamp, specified in various ways. + t1_setup = dt.datetime(2018, 3, 10, 0, 0, 0, tzinfo=dt.timezone.utc) + t1 = t1_setup.astimezone(tz=None).replace(tzinfo=None) # naive, local + t2 = dt.datetime(2018, 3, 10, 0, 0, 0, tzinfo=dt.timezone.utc) + t3 = dt.datetime(2018, 3, 9, 19, 0, 0, tzinfo=_TZ) + t4 = qi.TimestampNanos(1520640000000000000) + t5 = qi.TimestampNanos.from_datetime(t1) + t6 = qi.TimestampNanos.from_datetime(t2) + t7 = qi.TimestampNanos.from_datetime(t3) + timestamps = [t1, t2, t3, t4, t5, t6, t7] + for ts in timestamps: + buf = _dataframe(self.version, df, table_name='tbl1', at=ts) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a" 1520640000000000000\n' + + b'tbl1 a=2i,b="b" 1520640000000000000\n' + + b'tbl1 a=3i,b="c" 1520640000000000000\n') + + @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') + def test_at_neg(self): + n1 = dt.datetime(1965, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) + n2 = dt.datetime(1965, 1, 1, 0, 0, 0, tzinfo=_TZ) + n3 = dt.datetime(1965, 1, 1, 0, 0, 0) + neg_timestamps = [n1, n2, n3] + for ts in neg_timestamps: + with self.assertRaisesRegex(qi.IngressError, + 'Bad.*`at`: Cannot .* before the Unix epoch .1970-01-01.*'): + _dataframe(self.version, DF2, at=ts, table_name='test_at_neg') + + @unittest.skipIf(BROKEN_TIMEZONES, 'requires accurate timezones') + def test_at_ts_0(self): + df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': ['a', 'b', 'c']}) + df.index.name = 'test_at_ts_0' + + # Epoch 0, specified in various ways. 
+ e1_setup = dt.datetime(1970, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) + e1 = e1_setup.astimezone(tz=None).replace(tzinfo=None) # naive, local + e2 = dt.datetime(1970, 1, 1, 0, 0, 0, tzinfo=dt.timezone.utc) + e3 = dt.datetime(1969, 12, 31, 19, 0, 0, tzinfo=_TZ) + e4 = qi.TimestampNanos(0) + e5 = qi.TimestampNanos.from_datetime(e1) + e6 = qi.TimestampNanos.from_datetime(e2) + e7 = qi.TimestampNanos.from_datetime(e3) + edge_timestamps = [e1, e2, e3, e4, e5, e6, e7] + + for ts in edge_timestamps: + buf = _dataframe(self.version, df, table_name='tbl1', at=ts) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a" 0\n' + + b'tbl1 a=2i,b="b" 0\n' + + b'tbl1 a=3i,b="c" 0\n') + + def test_single_at_col(self): + df = pd.DataFrame({'timestamp': pd.to_datetime(['2023-01-01'])}) + with self.assertRaisesRegex(qi.IngressError, + 'Bad dataframe row at index 0: All values are nulls.'): + _dataframe(self.version, df, table_name='tbl1', at='timestamp') + + def test_row_of_nulls(self): + df = pd.DataFrame({'a': ['a1', None, 'a3']}) + with self.assertRaisesRegex( + qi.IngressError, 'Bad dataframe row.*1: All values are nulls.'): + _dataframe(self.version, df, table_name='tbl1', symbols=['a'], at=qi.ServerTimestamp) + + def test_u8_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1, 2, 3, 0, - None, 255], # u8 max - dtype=pd.UInt8Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n' + - 'tbl1 a=0i,b="d"\n' + - 'tbl1 b="e"\n' + - 'tbl1 a=255i,b="f"\n') - - def test_i8_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + dtype='uint8')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i\n' + + b'tbl1 a=2i\n' + + b'tbl1 a=3i\n' + + b'tbl1 a=0i\n' + + b'tbl1 a=255i\n') + + def test_i8_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1, 2, 3, -128, # i8 min 
- 0, - None, - 127], # i8 max - dtype=pd.Int8Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n' + - 'tbl1 a=-128i,b="d"\n' + - 'tbl1 a=0i,b="e"\n' + - 'tbl1 b="f"\n' + - 'tbl1 a=127i,b="g"\n') - - def test_u16_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + 127, # i8 max + 0], dtype='int8')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i\n' + + b'tbl1 a=2i\n' + + b'tbl1 a=3i\n' + + b'tbl1 a=-128i\n' + + b'tbl1 a=127i\n' + + b'tbl1 a=0i\n') + + def test_u16_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1, 2, 3, 0, - None, 65535], # u16 max - dtype=pd.UInt16Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n' + - 'tbl1 a=0i,b="d"\n' + - 'tbl1 b="e"\n' + - 'tbl1 a=65535i,b="f"\n') - - def test_i16_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + dtype='uint16')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i\n' + + b'tbl1 a=2i\n' + + b'tbl1 a=3i\n' + + b'tbl1 a=0i\n' + + b'tbl1 a=65535i\n') + + def test_i16_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1, 2, 3, -32768, # i16 min - 0, - None, - 32767], # i16 max - dtype=pd.Int16Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n' + - 'tbl1 a=-32768i,b="d"\n' + - 'tbl1 a=0i,b="e"\n' + - 'tbl1 b="f"\n' + - 'tbl1 a=32767i,b="g"\n') - - def test_u32_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + 32767, # i16 max + 0], dtype='int16')}) + buf = 
_dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i\n' + + b'tbl1 a=2i\n' + + b'tbl1 a=3i\n' + + b'tbl1 a=-32768i\n' + + b'tbl1 a=32767i\n' + + b'tbl1 a=0i\n') + + def test_u32_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1, 2, 3, 0, - None, 4294967295], # u32 max - dtype=pd.UInt32Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n' + - 'tbl1 a=0i,b="d"\n' + - 'tbl1 b="e"\n' + - 'tbl1 a=4294967295i,b="f"\n') - - def test_i32_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + dtype='uint32')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i\n' + + b'tbl1 a=2i\n' + + b'tbl1 a=3i\n' + + b'tbl1 a=0i\n' + + b'tbl1 a=4294967295i\n') + + def test_i32_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1, 2, 3, -2147483648, # i32 min 0, - None, 2147483647], # i32 max - dtype=pd.Int32Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n' + - 'tbl1 a=-2147483648i,b="d"\n' + - 'tbl1 a=0i,b="e"\n' + - 'tbl1 b="f"\n' + - 'tbl1 a=2147483647i,b="g"\n') - - def test_u64_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + dtype='int32')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i\n' + + b'tbl1 a=2i\n' + + b'tbl1 a=3i\n' + + b'tbl1 a=-2147483648i\n' + + b'tbl1 a=0i\n' + + b'tbl1 a=2147483647i\n') + + def test_u64_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1, 2, 3, 0, - None, 9223372036854775807], # i64 max - dtype=pd.UInt64Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) - buf = _dataframe(df, table_name='tbl1', 
at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n' + - 'tbl1 a=0i,b="d"\n' + - 'tbl1 b="e"\n' + - 'tbl1 a=9223372036854775807i,b="f"\n') - - df2 = pd.DataFrame({'a': pd.Series([ - 1, 2, 3, - 0, - 9223372036854775808], # i64 max + 1 - dtype=pd.UInt64Dtype())}) - with self.assertRaisesRegex( - qi.IngressError, - '.* serialize .* column .a. .* 4 .*9223372036854775808.*int64.*'): - _dataframe(df2, table_name='tbl1', at=qi.ServerTimestamp) - - def test_i64_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + dtype='uint64')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i\n' + + b'tbl1 a=2i\n' + + b'tbl1 a=3i\n' + + b'tbl1 a=0i\n' + + b'tbl1 a=9223372036854775807i\n') + + buf = qi.Buffer(protocol_version=self.version) + buf.dataframe(pd.DataFrame({'b': [.5, 1.0, 1.5]}), table_name='tbl2', at=qi.ServerTimestamp) + exp1 = ( + b'tbl2 b' + _float_binary_bytes(0.5, self.version == 1) + b'\n' + + b'tbl2 b' + _float_binary_bytes(1.0, self.version == 1) + b'\n' + + b'tbl2 b' + _float_binary_bytes(1.5, self.version == 1) + b'\n') + self.assertEqual( + bytes(buf), + exp1) + df2 = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + 0, + 9223372036854775808], # i64 max + 1 + dtype='uint64')}) + with self.assertRaisesRegex( + qi.IngressError, + '.* serialize .* column .a. .* 4 .*9223372036854775808.*int64.*'): + buf.dataframe(df2, table_name='tbl1', at=qi.ServerTimestamp) + + self.assertEqual( + bytes(buf), + exp1) # No partial write of `df2`. 
+ + def test_i64_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1, 2, 3, -9223372036854775808, # i64 min 0, - None, 9223372036854775807], # i64 max - dtype=pd.Int64Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1i,b="a"\n' + - 'tbl1 a=2i,b="b"\n' + - 'tbl1 a=3i,b="c"\n' + - 'tbl1 a=-9223372036854775808i,b="d"\n' + - 'tbl1 a=0i,b="e"\n' + - 'tbl1 b="f"\n' + - 'tbl1 a=9223372036854775807i,b="g"\n') - - def test_f32_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + dtype='int64')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i\n' + + b'tbl1 a=2i\n' + + b'tbl1 a=3i\n' + + b'tbl1 a=-9223372036854775808i\n' + + b'tbl1 a=0i\n' + + b'tbl1 a=9223372036854775807i\n') + + def test_f32_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1.0, 2.0, 3.0, 0.0, float('inf'), float('-inf'), float('nan'), - 3.4028234663852886e38, # f32 max - None], - dtype=pd.Float32Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1.0,b="a"\n' + - 'tbl1 a=2.0,b="b"\n' + - 'tbl1 a=3.0,b="c"\n' + - 'tbl1 a=0.0,b="d"\n' + - 'tbl1 a=Infinity,b="e"\n' + - 'tbl1 a=-Infinity,b="f"\n' + - 'tbl1 b="g"\n' + # This one is wierd: `nan` gets 0 in the bitmask. 
- 'tbl1 a=3.4028234663852886e38,b="h"\n' + - 'tbl1 b="i"\n') - - def test_f64_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ + 3.4028234663852886e38], # f32 max + dtype='float32')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a' + _float_binary_bytes(1.0, self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(2.0, self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(3.0, self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(0.0, self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(float('inf'), self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(float('-inf'), self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(float('NaN'), self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(3.4028234663852886e38, self.version == 1) + b'\n') + + def test_f64_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ 1.0, 2.0, 3.0, 0.0, float('inf'), float('-inf'), float('nan'), - 1.7976931348623157e308, # f64 max - None], - dtype=pd.Float64Dtype()), - 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1.0,b="a"\n' + - 'tbl1 a=2.0,b="b"\n' + - 'tbl1 a=3.0,b="c"\n' + - 'tbl1 a=0.0,b="d"\n' + - 'tbl1 a=Infinity,b="e"\n' + - 'tbl1 a=-Infinity,b="f"\n' + - 'tbl1 b="g"\n' + # This one is wierd: `nan` gets 0 in the bitmask. 
- 'tbl1 a=1.7976931348623157e308,b="h"\n' + - 'tbl1 b="i"\n') - - def test_bool_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - True, False, False, - False, True, False], - dtype='bool')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=t\n' + - 'tbl1 a=f\n' + - 'tbl1 a=f\n' + - 'tbl1 a=f\n' + - 'tbl1 a=t\n' + - 'tbl1 a=f\n') - - def test_bool_arrow_col(self): - df = pd.DataFrame({'a': pd.Series([ - True, False, False, - False, True, False, - True, True, True, - False, False, False], - dtype='boolean')}) # Note `boolean` != `bool`. - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=t\n' + - 'tbl1 a=f\n' + - 'tbl1 a=f\n' + - 'tbl1 a=f\n' + - 'tbl1 a=t\n' + - 'tbl1 a=f\n' + - 'tbl1 a=t\n' + - 'tbl1 a=t\n' + - 'tbl1 a=t\n' + - 'tbl1 a=f\n' + - 'tbl1 a=f\n' + - 'tbl1 a=f\n') - - df2 = pd.DataFrame({'a': pd.Series([ - True, False, False, - None, True, False], - dtype='boolean')}) - with self.assertRaisesRegex( - qi.IngressError, - 'Failed.*at row index 3 .*.: .*insert null .*boolean col'): - _dataframe(df2, table_name='tbl1', at=qi.ServerTimestamp) - - def test_bool_obj_col(self): - df = pd.DataFrame({'a': pd.Series([ - True, False, False, - False, True, False], - dtype='object')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=t\n' + - 'tbl1 a=f\n' + - 'tbl1 a=f\n' + - 'tbl1 a=f\n' + - 'tbl1 a=t\n' + - 'tbl1 a=f\n') - - df2 = pd.DataFrame({'a': pd.Series([ - True, False, 'false'], - dtype='object')}) - with self.assertRaisesRegex( - qi.IngressError, - 'serialize .* column .a. 
.* 2 .*false.*bool'): - _dataframe(df2, table_name='tbl1', at=qi.ServerTimestamp) - - df3 = pd.DataFrame({'a': pd.Series([ - None, True, False], - dtype='object')}) - with self.assertRaisesRegex( - qi.IngressError, - 'serialize.*\\(None\\): Cannot insert null.*boolean column'): - _dataframe(df3, table_name='tbl1', at=qi.ServerTimestamp) - - def test_datetime64_numpy_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ - pd.Timestamp('2019-01-01 00:00:00'), - pd.Timestamp('2019-01-01 00:00:01'), - pd.Timestamp('2019-01-01 00:00:02'), - pd.Timestamp('2019-01-01 00:00:03'), - pd.Timestamp('2019-01-01 00:00:04'), - pd.Timestamp('2019-01-01 00:00:05'), - None, - float('nan'), - pd.NA], - dtype='datetime64[ns]'), - 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=1546300800000000t,b="a"\n' + - 'tbl1 a=1546300801000000t,b="b"\n' + - 'tbl1 a=1546300802000000t,b="c"\n' + - 'tbl1 a=1546300803000000t,b="d"\n' + - 'tbl1 a=1546300804000000t,b="e"\n' + - 'tbl1 a=1546300805000000t,b="f"\n' + - 'tbl1 b="g"\n' + - 'tbl1 b="h"\n' + - 'tbl1 b="i"\n') - - df = pd.DataFrame({'a': pd.Series([ - pd.Timestamp('1970-01-01 00:00:00'), - pd.Timestamp('1970-01-01 00:00:01'), - pd.Timestamp('1970-01-01 00:00:02')])}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a=0t\n' + - 'tbl1 a=1000000t\n' + - 'tbl1 a=2000000t\n') - - def test_datetime64_tz_arrow_col(self): - df = pd.DataFrame({ - 'a': [ - pd.Timestamp( - year=2019, month=1, day=1, - hour=0, minute=0, second=0, tz=_TZ), - pd.Timestamp( - year=2019, month=1, day=1, - hour=0, minute=0, second=1, tz=_TZ), - None, - pd.Timestamp( - year=2019, month=1, day=1, - hour=0, minute=0, second=3, tz=_TZ)], - 'b': ['sym1', 'sym2', 'sym3', 'sym4']}) - buf = _dataframe(df, table_name='tbl1', symbols=['b'], at=qi.ServerTimestamp) - self.assertEqual( - buf, - # Note how these are 5hr offset from 
`test_datetime64_numpy_col`. - 'tbl1,b=sym1 a=1546318800000000t\n' + - 'tbl1,b=sym2 a=1546318801000000t\n' + - 'tbl1,b=sym3\n' + - 'tbl1,b=sym4 a=1546318803000000t\n') - - # Not epoch 0. - df = pd.DataFrame({ - 'a': [ - pd.Timestamp( - year=1970, month=1, day=1, - hour=0, minute=0, second=0, tz=_TZ), - pd.Timestamp( - year=1970, month=1, day=1, - hour=0, minute=0, second=1, tz=_TZ), - pd.Timestamp( - year=1970, month=1, day=1, - hour=0, minute=0, second=2, tz=_TZ)], - 'b': ['sym1', 'sym2', 'sym3']}) - buf = _dataframe(df, table_name='tbl1', symbols=['b'], at=qi.ServerTimestamp) - self.assertEqual( - buf, - # Note how these are 5hr offset from `test_datetime64_numpy_col`. - 'tbl1,b=sym1 a=18000000000t\n' + - 'tbl1,b=sym2 a=18001000000t\n' + - 'tbl1,b=sym3 a=18002000000t\n') - - # Actual epoch 0. - df = pd.DataFrame({ - 'a': [ - pd.Timestamp( - year=1969, month=12, day=31, - hour=19, minute=0, second=0, tz=_TZ), - pd.Timestamp( - year=1969, month=12, day=31, - hour=19, minute=0, second=1, tz=_TZ), - pd.Timestamp( - year=1969, month=12, day=31, - hour=19, minute=0, second=2, tz=_TZ)], - 'b': ['sym1', 'sym2', 'sym3']}) - buf = _dataframe(df, table_name='tbl1', symbols=['b'], at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1,b=sym1 a=0t\n' + - 'tbl1,b=sym2 a=1000000t\n' + - 'tbl1,b=sym3 a=2000000t\n') - - df2 = pd.DataFrame({ - 'a': [ - pd.Timestamp( - year=1900, month=1, day=1, - hour=0, minute=0, second=0, tz=_TZ)], - 'b': ['sym1']}) - buf = _dataframe(df2, table_name='tbl1', symbols=['b'], at=qi.ServerTimestamp) - - # Accounting for different datatime library differences. - # Mostly, here assert that negative timestamps are allowed. 
- self.assertIn( - buf, - ['tbl1,b=sym1 a=-2208970800000000t\n', - 'tbl1,b=sym1 a=-2208971040000000t\n']) - - def test_datetime64_numpy_at(self): - df = pd.DataFrame({ - 'a': pd.Series([ - pd.Timestamp('2019-01-01 00:00:00'), - pd.Timestamp('2019-01-01 00:00:01'), - pd.Timestamp('2019-01-01 00:00:02'), - pd.Timestamp('2019-01-01 00:00:03'), - pd.Timestamp('2019-01-01 00:00:04'), - pd.Timestamp('2019-01-01 00:00:05'), - float('nan'), - None, - pd.NaT], - dtype='datetime64[ns]'), - 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - buf = _dataframe(df, table_name='tbl1', at='a') - self.assertEqual( - buf, - 'tbl1 b=1i 1546300800000000000\n' + - 'tbl1 b=2i 1546300801000000000\n' + - 'tbl1 b=3i 1546300802000000000\n' + - 'tbl1 b=4i 1546300803000000000\n' + - 'tbl1 b=5i 1546300804000000000\n' + - 'tbl1 b=6i 1546300805000000000\n' + - 'tbl1 b=7i\n' + - 'tbl1 b=8i\n' + - 'tbl1 b=9i\n') - - df = pd.DataFrame({ - 'a': pd.Series([ + 1.7976931348623157e308], # f64 max + dtype='float64')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a' + _float_binary_bytes(1.0, self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(2.0, self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(3.0, self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(0.0, self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(float('inf'), self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(float('-inf'), self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(float('NAN'), self.version == 1) + b'\n' + + b'tbl1 a' + _float_binary_bytes(1.7976931348623157e308, self.version == 1) + b'\n') + + def test_u8_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + 0, + None, + 255], # u8 max + dtype=pd.UInt8Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a"\n' + + b'tbl1 
a=2i,b="b"\n' + + b'tbl1 a=3i,b="c"\n' + + b'tbl1 a=0i,b="d"\n' + + b'tbl1 b="e"\n' + + b'tbl1 a=255i,b="f"\n') + + def test_i8_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + -128, # i8 min + 0, + None, + 127], # i8 max + dtype=pd.Int8Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a"\n' + + b'tbl1 a=2i,b="b"\n' + + b'tbl1 a=3i,b="c"\n' + + b'tbl1 a=-128i,b="d"\n' + + b'tbl1 a=0i,b="e"\n' + + b'tbl1 b="f"\n' + + b'tbl1 a=127i,b="g"\n') + + def test_u16_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + 0, + None, + 65535], # u16 max + dtype=pd.UInt16Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + ('tbl1 a=1i,b="a"\n' + + 'tbl1 a=2i,b="b"\n' + + 'tbl1 a=3i,b="c"\n' + + 'tbl1 a=0i,b="d"\n' + + 'tbl1 b="e"\n' + + 'tbl1 a=65535i,b="f"\n').encode('utf-8')) + + def test_i16_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + -32768, # i16 min + 0, + None, + 32767], # i16 max + dtype=pd.Int16Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a"\n' + + b'tbl1 a=2i,b="b"\n' + + b'tbl1 a=3i,b="c"\n' + + b'tbl1 a=-32768i,b="d"\n' + + b'tbl1 a=0i,b="e"\n' + + b'tbl1 b="f"\n' + + b'tbl1 a=32767i,b="g"\n') + + def test_u32_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + 0, + None, + 4294967295], # u32 max + dtype=pd.UInt32Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a"\n' + + b'tbl1 a=2i,b="b"\n' + + b'tbl1 a=3i,b="c"\n' + + b'tbl1 a=0i,b="d"\n' + + b'tbl1 b="e"\n' + + b'tbl1 a=4294967295i,b="f"\n') + + def 
test_i32_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + -2147483648, # i32 min + 0, + None, + 2147483647], # i32 max + dtype=pd.Int32Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a"\n' + + b'tbl1 a=2i,b="b"\n' + + b'tbl1 a=3i,b="c"\n' + + b'tbl1 a=-2147483648i,b="d"\n' + + b'tbl1 a=0i,b="e"\n' + + b'tbl1 b="f"\n' + + b'tbl1 a=2147483647i,b="g"\n') + + def test_u64_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + 0, + None, + 9223372036854775807], # i64 max + dtype=pd.UInt64Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a"\n' + + b'tbl1 a=2i,b="b"\n' + + b'tbl1 a=3i,b="c"\n' + + b'tbl1 a=0i,b="d"\n' + + b'tbl1 b="e"\n' + + b'tbl1 a=9223372036854775807i,b="f"\n') + + df2 = pd.DataFrame({'a': pd.Series([ + 1, 2, 3, + 0, + 9223372036854775808], # i64 max + 1 + dtype=pd.UInt64Dtype())}) + with self.assertRaisesRegex( + qi.IngressError, + '.* serialize .* column .a. 
.* 4 .*9223372036854775808.*int64.*'): + _dataframe(self.version, df2, table_name='tbl1', at=qi.ServerTimestamp) + + def test_i64_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, + -9223372036854775808, # i64 min + 0, + None, + 9223372036854775807], # i64 max + dtype=pd.Int64Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1i,b="a"\n' + + b'tbl1 a=2i,b="b"\n' + + b'tbl1 a=3i,b="c"\n' + + b'tbl1 a=-9223372036854775808i,b="d"\n' + + b'tbl1 a=0i,b="e"\n' + + b'tbl1 b="f"\n' + + b'tbl1 a=9223372036854775807i,b="g"\n') + + def test_f32_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1.0, 2.0, 3.0, + 0.0, + float('inf'), + float('-inf'), + float('nan'), + 3.4028234663852886e38, # f32 max + None], + dtype=pd.Float32Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a' + _float_binary_bytes(1.0, self.version == 1) + b',b="a"\n' + + b'tbl1 a' + _float_binary_bytes(2.0, self.version == 1) + b',b="b"\n' + + b'tbl1 a' + _float_binary_bytes(3.0, self.version == 1) + b',b="c"\n' + + b'tbl1 a' + _float_binary_bytes(0.0, self.version == 1) + b',b="d"\n' + + b'tbl1 a' + _float_binary_bytes(float('inf'), self.version == 1) + b',b="e"\n' + + b'tbl1 a' + _float_binary_bytes(float('-inf'), self.version == 1) + b',b="f"\n' + + b'tbl1 b="g"\n' + # This one is weird: `nan` gets 0 in the bitmask. 
+ b'tbl1 a' + _float_binary_bytes(3.4028234663852886e38, self.version == 1) + b',b="h"\n' + + b'tbl1 b="i"\n') + + def test_f64_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 1.0, 2.0, 3.0, + 0.0, + float('inf'), + float('-inf'), + float('nan'), + 1.7976931348623157e308, # f64 max + None], + dtype=pd.Float64Dtype()), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a' + _float_binary_bytes(1.0, self.version == 1) + b',b="a"\n' + + b'tbl1 a' + _float_binary_bytes(2.0, self.version == 1) + b',b="b"\n' + + b'tbl1 a' + _float_binary_bytes(3.0, self.version == 1) + b',b="c"\n' + + b'tbl1 a' + _float_binary_bytes(0.0, self.version == 1) + b',b="d"\n' + + b'tbl1 a' + _float_binary_bytes(float('inf'), self.version == 1) + b',b="e"\n' + + b'tbl1 a' + _float_binary_bytes(float('-inf'), self.version == 1) + b',b="f"\n' + + b'tbl1 b="g"\n' + # This one is weird: `nan` gets 0 in the bitmask. + b'tbl1 a' + _float_binary_bytes(1.7976931348623157e308, self.version == 1) + b',b="h"\n' + + b'tbl1 b="i"\n') + + def test_bool_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + True, False, False, + False, True, False], + dtype='bool')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=t\n' + + b'tbl1 a=f\n' + + b'tbl1 a=f\n' + + b'tbl1 a=f\n' + + b'tbl1 a=t\n' + + b'tbl1 a=f\n') + + def test_bool_arrow_col(self): + df = pd.DataFrame({'a': pd.Series([ + True, False, False, + False, True, False, + True, True, True, + False, False, False], + dtype='boolean')}) # Note `boolean` != `bool`. 
+ buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=t\n' + + b'tbl1 a=f\n' + + b'tbl1 a=f\n' + + b'tbl1 a=f\n' + + b'tbl1 a=t\n' + + b'tbl1 a=f\n' + + b'tbl1 a=t\n' + + b'tbl1 a=t\n' + + b'tbl1 a=t\n' + + b'tbl1 a=f\n' + + b'tbl1 a=f\n' + + b'tbl1 a=f\n') + + df2 = pd.DataFrame({'a': pd.Series([ + True, False, False, + None, True, False], + dtype='boolean')}) + with self.assertRaisesRegex( + qi.IngressError, + 'Failed.*at row index 3 .*.: .*insert null .*boolean col'): + _dataframe(self.version, df2, table_name='tbl1', at=qi.ServerTimestamp) + + def test_bool_obj_col(self): + df = pd.DataFrame({'a': pd.Series([ + True, False, False, + False, True, False], + dtype='object')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=t\n' + + b'tbl1 a=f\n' + + b'tbl1 a=f\n' + + b'tbl1 a=f\n' + + b'tbl1 a=t\n' + + b'tbl1 a=f\n') + + df2 = pd.DataFrame({'a': pd.Series([ + True, False, 'false'], + dtype='object')}) + with self.assertRaisesRegex( + qi.IngressError, + 'serialize .* column .a. 
.* 2 .*false.*bool'): + _dataframe(self.version, df2, table_name='tbl1', at=qi.ServerTimestamp) + + df3 = pd.DataFrame({'a': pd.Series([ + None, True, False], + dtype='object')}) + with self.assertRaisesRegex( + qi.IngressError, + 'serialize.*\\(None\\): Cannot insert null.*boolean column'): + _dataframe(self.version, df3, table_name='tbl1', at=qi.ServerTimestamp) + + def test_datetime64_numpy_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + pd.Timestamp('2019-01-01 00:00:00'), + pd.Timestamp('2019-01-01 00:00:01'), + pd.Timestamp('2019-01-01 00:00:02'), + pd.Timestamp('2019-01-01 00:00:03'), + pd.Timestamp('2019-01-01 00:00:04'), + pd.Timestamp('2019-01-01 00:00:05'), + None, + float('nan'), + pd.NA], + dtype='datetime64[ns]'), + 'b': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=1546300800000000t,b="a"\n' + + b'tbl1 a=1546300801000000t,b="b"\n' + + b'tbl1 a=1546300802000000t,b="c"\n' + + b'tbl1 a=1546300803000000t,b="d"\n' + + b'tbl1 a=1546300804000000t,b="e"\n' + + b'tbl1 a=1546300805000000t,b="f"\n' + + b'tbl1 b="g"\n' + + b'tbl1 b="h"\n' + + b'tbl1 b="i"\n') + + df = pd.DataFrame({'a': pd.Series([ pd.Timestamp('1970-01-01 00:00:00'), pd.Timestamp('1970-01-01 00:00:01'), - pd.Timestamp('1970-01-01 00:00:02')], - dtype='datetime64[ns]'), - 'b': [1, 2, 3]}) - buf = _dataframe(df, table_name='tbl1', at='a') - self.assertEqual( - buf, - 'tbl1 b=1i 0\n' + - 'tbl1 b=2i 1000000000\n' + - 'tbl1 b=3i 2000000000\n') - - def test_datetime64_tz_arrow_at(self): - df = pd.DataFrame({ - 'a': [ - pd.Timestamp( - year=2019, month=1, day=1, - hour=0, minute=0, second=0, tz=_TZ), - pd.Timestamp( - year=2019, month=1, day=1, - hour=0, minute=0, second=1, tz=_TZ), - None, - pd.Timestamp( - year=2019, month=1, day=1, - hour=0, minute=0, second=3, tz=_TZ)], - 'b': ['sym1', 'sym2', 'sym3', 'sym4']}) - buf = _dataframe(df, table_name='tbl1', symbols=['b'], at='a') 
- self.assertEqual( - buf, - # Note how these are 5hr offset from `test_datetime64_numpy_col`. - 'tbl1,b=sym1 1546318800000000000\n' + - 'tbl1,b=sym2 1546318801000000000\n' + - 'tbl1,b=sym3\n' + - 'tbl1,b=sym4 1546318803000000000\n') - - df2 = pd.DataFrame({ - 'a': [ - pd.Timestamp( - year=1900, month=1, day=1, - hour=0, minute=0, second=0, tz=_TZ)], - 'b': ['sym1']}) - with self.assertRaisesRegex( - qi.IngressError, "Failed.*'a'.*-220897.* is neg"): - _dataframe(df2, table_name='tbl1', symbols=['b'], at='a') - - def _test_pyobjstr_table(self, dtype): - df = pd.DataFrame({ - '../bad col name/../it does not matter...': - pd.Series([ + pd.Timestamp('1970-01-01 00:00:02')])}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=0t\n' + + b'tbl1 a=1000000t\n' + + b'tbl1 a=2000000t\n') + + def test_datetime64_tz_arrow_col(self): + df = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ), + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=1, tz=_TZ), + None, + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=3, tz=_TZ)], + 'b': ['sym1', 'sym2', 'sym3', 'sym4']}) + buf = _dataframe(self.version, df, table_name='tbl1', symbols=['b'], at=qi.ServerTimestamp) + self.assertEqual( + buf, + # Note how these are 5hr offset from `test_datetime64_numpy_col`. + b'tbl1,b=sym1 a=1546318800000000t\n' + + b'tbl1,b=sym2 a=1546318801000000t\n' + + b'tbl1,b=sym3\n' + + b'tbl1,b=sym4 a=1546318803000000t\n') + + # Not epoch 0. 
+ df = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=1970, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ), + pd.Timestamp( + year=1970, month=1, day=1, + hour=0, minute=0, second=1, tz=_TZ), + pd.Timestamp( + year=1970, month=1, day=1, + hour=0, minute=0, second=2, tz=_TZ)], + 'b': ['sym1', 'sym2', 'sym3']}) + buf = _dataframe(self.version, df, table_name='tbl1', symbols=['b'], at=qi.ServerTimestamp) + self.assertEqual( + buf, + # Note how these are 5hr offset from `test_datetime64_numpy_col`. + b'tbl1,b=sym1 a=18000000000t\n' + + b'tbl1,b=sym2 a=18001000000t\n' + + b'tbl1,b=sym3 a=18002000000t\n') + + # Actual epoch 0. + df = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=1969, month=12, day=31, + hour=19, minute=0, second=0, tz=_TZ), + pd.Timestamp( + year=1969, month=12, day=31, + hour=19, minute=0, second=1, tz=_TZ), + pd.Timestamp( + year=1969, month=12, day=31, + hour=19, minute=0, second=2, tz=_TZ)], + 'b': ['sym1', 'sym2', 'sym3']}) + buf = _dataframe(self.version, df, table_name='tbl1', symbols=['b'], at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1,b=sym1 a=0t\n' + + b'tbl1,b=sym2 a=1000000t\n' + + b'tbl1,b=sym3 a=2000000t\n') + + df2 = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=1900, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ)], + 'b': ['sym1']}) + buf = _dataframe(self.version, df2, table_name='tbl1', symbols=['b'], at=qi.ServerTimestamp) + + # Accounting for different datetime library differences. + # Mostly, here assert that negative timestamps are allowed. 
+ self.assertIn( + buf, + [b'tbl1,b=sym1 a=-2208970800000000t\n', + b'tbl1,b=sym1 a=-2208971040000000t\n']) + + def test_datetime64_numpy_at(self): + df = pd.DataFrame({ + 'a': pd.Series([ + pd.Timestamp('2019-01-01 00:00:00'), + pd.Timestamp('2019-01-01 00:00:01'), + pd.Timestamp('2019-01-01 00:00:02'), + pd.Timestamp('2019-01-01 00:00:03'), + pd.Timestamp('2019-01-01 00:00:04'), + pd.Timestamp('2019-01-01 00:00:05'), + float('nan'), + None, + pd.NaT], + dtype='datetime64[ns]'), + 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + buf = _dataframe(self.version, df, table_name='tbl1', at='a') + self.assertEqual( + buf, + b'tbl1 b=1i 1546300800000000000\n' + + b'tbl1 b=2i 1546300801000000000\n' + + b'tbl1 b=3i 1546300802000000000\n' + + b'tbl1 b=4i 1546300803000000000\n' + + b'tbl1 b=5i 1546300804000000000\n' + + b'tbl1 b=6i 1546300805000000000\n' + + b'tbl1 b=7i\n' + + b'tbl1 b=8i\n' + + b'tbl1 b=9i\n') + + df = pd.DataFrame({ + 'a': pd.Series([ + pd.Timestamp('1970-01-01 00:00:00'), + pd.Timestamp('1970-01-01 00:00:01'), + pd.Timestamp('1970-01-01 00:00:02')], + dtype='datetime64[ns]'), + 'b': [1, 2, 3]}) + buf = _dataframe(self.version, df, table_name='tbl1', at='a') + self.assertEqual( + buf, + b'tbl1 b=1i 0\n' + + b'tbl1 b=2i 1000000000\n' + + b'tbl1 b=3i 2000000000\n') + + def test_datetime64_tz_arrow_at(self): + df = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ), + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=1, tz=_TZ), + None, + pd.Timestamp( + year=2019, month=1, day=1, + hour=0, minute=0, second=3, tz=_TZ)], + 'b': ['sym1', 'sym2', 'sym3', 'sym4']}) + buf = _dataframe(self.version, df, table_name='tbl1', symbols=['b'], at='a') + self.assertEqual( + buf, + # Note how these are 5hr offset from `test_datetime64_numpy_col`. 
+ b'tbl1,b=sym1 1546318800000000000\n' + + b'tbl1,b=sym2 1546318801000000000\n' + + b'tbl1,b=sym3\n' + + b'tbl1,b=sym4 1546318803000000000\n') + + df2 = pd.DataFrame({ + 'a': [ + pd.Timestamp( + year=1900, month=1, day=1, + hour=0, minute=0, second=0, tz=_TZ)], + 'b': ['sym1']}) + with self.assertRaisesRegex( + qi.IngressError, "Failed.*'a'.*-220897.* is neg"): + _dataframe(self.version, df2, table_name='tbl1', symbols=['b'], at='a') + + def _test_pyobjstr_table(self, dtype): + df = pd.DataFrame({ + '../bad col name/../it does not matter...': + pd.Series([ + 'a', # ASCII + 'b' * 127, # Max table name length. + 'q❤️p', # Mixed ASCII and UCS-2 + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. + dtype=dtype), + 'b': [1, 2, 3, 4, 5]}) + buf = _dataframe(self.version, df, table_name_col=0, at=qi.ServerTimestamp) + self.assertEqual( + buf, + ('a b=1i\n' + + ('b' * 127) + ' b=2i\n' + + 'q❤️p b=3i\n' + + '嚜꓂ b=4i\n' + + '💩🦞 b=5i\n').encode("utf-8")) + + with self.assertRaisesRegex( + qi.IngressError, "Too long"): + _dataframe(self.version, + pd.DataFrame({'a': pd.Series(['b' * 128], dtype=dtype)}), + table_name_col='a', at=qi.ServerTimestamp) + + with self.assertRaisesRegex( + qi.IngressError, 'Failed.*Expected a table name, got a null.*'): + _dataframe(self.version, + pd.DataFrame({ + '.': pd.Series(['x', None], dtype=dtype), + 'b': [1, 2]}), + table_name_col='.', at=qi.ServerTimestamp) + + with self.assertRaisesRegex( + qi.IngressError, 'Failed.*Expected a table name, got a null.*'): + _dataframe(self.version, + pd.DataFrame({ + '.': pd.Series(['x', float('nan')], dtype=dtype), + 'b': [1, 2]}), + table_name_col='.', at=qi.ServerTimestamp) + + with self.assertRaisesRegex( + qi.IngressError, 'Failed.*Expected a table name, got a null.*'): + _dataframe(self.version, + pd.DataFrame({ + '.': pd.Series(['x', pd.NA], dtype=dtype), + 'b': [1, 2]}), + table_name_col='.', at=qi.ServerTimestamp) + + with self.assertRaisesRegex( + qi.IngressError, "''.*must 
have a non-zero length"): + _dataframe(self.version, + pd.DataFrame({ + '/': pd.Series([''], dtype=dtype), + 'b': [1]}), + table_name_col='/', at=qi.ServerTimestamp) + + with self.assertRaisesRegex( + qi.IngressError, "'tab..1'.*invalid dot `\\.` at position 4"): + _dataframe(self.version, + pd.DataFrame({ + '/': pd.Series(['tab..1'], dtype=dtype), + 'b': [1]}), + table_name_col='/', at=qi.ServerTimestamp) + + def test_obj_str_table(self): + self._test_pyobjstr_table('object') + + with self.assertRaisesRegex( + qi.IngressError, 'table name .*got an object of type int'): + _dataframe(self.version, + pd.DataFrame({ + '.': pd.Series(['x', 42], dtype='object'), + 'z': [1, 2]}), + table_name_col='.', at=qi.ServerTimestamp) + + def test_obj_string_table(self): + self._test_pyobjstr_table('string') + + self.assertEqual( + _dataframe(self.version, + pd.DataFrame({ + '.': pd.Series(['x', 42], dtype='string'), + 'z': [1, 2]}), + table_name_col='.', at=qi.ServerTimestamp), + b'x z=1i\n' + + b'42 z=2i\n') + + def _test_pyobjstr_numpy_symbol(self, dtype): + df = pd.DataFrame({'a': pd.Series([ 'a', # ASCII - 'b' * 127, # Max table name length. 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 1200, # Over the 1024 buffer prealloc. + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string '嚜꓂', # UCS-2, 3 bytes for UTF-8. '💩🦞'], # UCS-4, 4 bytes for UTF-8. 
- dtype=dtype), - 'b': [1, 2, 3, 4, 5]}) - buf = _dataframe(df, table_name_col=0, at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'a b=1i\n' + - ('b' * 127) + ' b=2i\n' + - 'q❤️p b=3i\n' + - '嚜꓂ b=4i\n' + - '💩🦞 b=5i\n') - - with self.assertRaisesRegex( - qi.IngressError, "Too long"): - _dataframe( - pd.DataFrame({'a': pd.Series(['b' * 128], dtype=dtype)}), - table_name_col='a', at=qi.ServerTimestamp) - - with self.assertRaisesRegex( - qi.IngressError, 'Failed.*Expected a table name, got a null.*'): - _dataframe( - pd.DataFrame({ - '.': pd.Series(['x', None], dtype=dtype), - 'b': [1, 2]}), - table_name_col='.', at=qi.ServerTimestamp) - - with self.assertRaisesRegex( - qi.IngressError, 'Failed.*Expected a table name, got a null.*'): - _dataframe( - pd.DataFrame({ - '.': pd.Series(['x', float('nan')], dtype=dtype), - 'b': [1, 2]}), - table_name_col='.', at=qi.ServerTimestamp) - - with self.assertRaisesRegex( - qi.IngressError, 'Failed.*Expected a table name, got a null.*'): - _dataframe( - pd.DataFrame({ - '.': pd.Series(['x', pd.NA], dtype=dtype), - 'b': [1, 2]}), - table_name_col='.', at=qi.ServerTimestamp) - - with self.assertRaisesRegex( - qi.IngressError, "''.*must have a non-zero length"): - _dataframe( - pd.DataFrame({ - '/': pd.Series([''], dtype=dtype), - 'b': [1]}), - table_name_col='/', at=qi.ServerTimestamp) - - with self.assertRaisesRegex( - qi.IngressError, "'tab..1'.*invalid dot `\\.` at position 4"): - _dataframe( - pd.DataFrame({ - '/': pd.Series(['tab..1'], dtype=dtype), - 'b': [1]}), - table_name_col='/', at=qi.ServerTimestamp) - - def test_obj_str_table(self): - self._test_pyobjstr_table('object') - - with self.assertRaisesRegex( - qi.IngressError, 'table name .*got an object of type int'): - _dataframe( - pd.DataFrame({ - '.': pd.Series(['x', 42], dtype='object'), - 'z': [1, 2]}), - table_name_col='.', at=qi.ServerTimestamp) - - def test_obj_string_table(self): - self._test_pyobjstr_table('string') - - self.assertEqual( - _dataframe( - 
pd.DataFrame({ - '.': pd.Series(['x', 42], dtype='string'), - 'z': [1, 2]}), - table_name_col='.', at=qi.ServerTimestamp), - 'x z=1i\n' + - '42 z=2i\n') - - def _test_pyobjstr_numpy_symbol(self, dtype): - df = pd.DataFrame({'a': pd.Series([ - 'a', # ASCII - 'q❤️p', # Mixed ASCII and UCS-2 - '❤️' * 1200, # Over the 1024 buffer prealloc. - 'Questo è un qualcosa', # Non-ASCII UCS-1 - 'щось', # UCS-2, 2 bytes for UTF-8. - '', # Empty string - '嚜꓂', # UCS-2, 3 bytes for UTF-8. - '💩🦞'], # UCS-4, 4 bytes for UTF-8. - dtype=dtype)}) - buf = _dataframe(df, table_name='tbl1', symbols=True, at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1,a=a\n' + - 'tbl1,a=q❤️p\n' + - 'tbl1,a=' + ('❤️' * 1200) + '\n' + - 'tbl1,a=Questo\\ è\\ un\\ qualcosa\n' + - 'tbl1,a=щось\n' + - 'tbl1,a=\n' + - 'tbl1,a=嚜꓂\n' + - 'tbl1,a=💩🦞\n') - - for null_obj in (None, float('nan'), pd.NA): + dtype=dtype)}) + buf = _dataframe(self.version, df, table_name='tbl1', symbols=True, at=qi.ServerTimestamp) + self.assertEqual( + buf, + ('tbl1,a=a\n' + + 'tbl1,a=q❤️p\n' + + 'tbl1,a=' + ('❤️' * 1200) + '\n' + + 'tbl1,a=Questo\\ è\\ un\\ qualcosa\n' + + 'tbl1,a=щось\n' + + 'tbl1,a=\n' + + 'tbl1,a=嚜꓂\n' + + 'tbl1,a=💩🦞\n').encode("utf-8")) + + for null_obj in (None, float('nan'), pd.NA): + self.assertEqual( + _dataframe( + self.version, + pd.DataFrame({ + 'x': pd.Series(['a', null_obj], dtype=dtype), + 'y': [1, 2]}), + table_name='tbl1', symbols=[0], at=qi.ServerTimestamp), + b'tbl1,x=a y=1i\n' + + b'tbl1 y=2i\n') + + def test_obj_str_numpy_symbol(self): + self._test_pyobjstr_numpy_symbol('object') + + with self.assertRaisesRegex( + qi.IngressError, 'Expected a string, got an .* type int'): + _dataframe( + self.version, + pd.DataFrame({ + 'x': pd.Series(['x', 42], dtype='object'), + 'y': [1, 2]}), + table_name='tbl1', symbols=[0], at=qi.ServerTimestamp) + + def test_obj_string_numpy_symbol(self): + self._test_pyobjstr_numpy_symbol('string') + self.assertEqual( _dataframe( + self.version, pd.DataFrame({ - 'x': 
pd.Series(['a', null_obj], dtype=dtype), + 'x': pd.Series(['x', 42], dtype='string'), 'y': [1, 2]}), table_name='tbl1', symbols=[0], at=qi.ServerTimestamp), - 'tbl1,x=a y=1i\n' + - 'tbl1 y=2i\n') - - def test_obj_str_numpy_symbol(self): - self._test_pyobjstr_numpy_symbol('object') - - with self.assertRaisesRegex( - qi.IngressError, 'Expected a string, got an .* type int'): - _dataframe( - pd.DataFrame({ - 'x': pd.Series(['x', 42], dtype='object'), - 'y': [1, 2]}), - table_name='tbl1', symbols=[0], at=qi.ServerTimestamp) - - def test_obj_string_numpy_symbol(self): - self._test_pyobjstr_numpy_symbol('string') - - self.assertEqual( - _dataframe( - pd.DataFrame({ - 'x': pd.Series(['x', 42], dtype='string'), - 'y': [1, 2]}), - table_name='tbl1', symbols=[0], at=qi.ServerTimestamp), - 'tbl1,x=x y=1i\n' + - 'tbl1,x=42 y=2i\n') - - def test_str_numpy_col(self): - df = pd.DataFrame({'a': pd.Series([ - 'a', # ASCII - 'q❤️p', # Mixed ASCII and UCS-2 - '❤️' * 1200, # Over the 1024 buffer prealloc. - 'Questo è un qualcosa', # Non-ASCII UCS-1 - 'щось', # UCS-2, 2 bytes for UTF-8. - '', # Empty string - '嚜꓂', # UCS-2, 3 bytes for UTF-8. - '💩🦞'], # UCS-4, 4 bytes for UTF-8. - dtype='str')}) - buf = _dataframe(df, table_name='tbl1', at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a="a"\n' + - 'tbl1 a="q❤️p"\n' + - 'tbl1 a="' + ('❤️' * 1200) + '"\n' + - 'tbl1 a="Questo è un qualcosa"\n' + - 'tbl1 a="щось"\n' + - 'tbl1 a=""\n' + - 'tbl1 a="嚜꓂"\n' + - 'tbl1 a="💩🦞"\n') - - def test_str_arrow_table(self): - df = pd.DataFrame({ - '../bad col name/../it does not matter...': pd.Series([ - 'a', # ASCII - 'b' * 127, # Max table name length. - 'q❤️p', # Mixed ASCII and UCS-2 - '嚜꓂', # UCS-2, 3 bytes for UTF-8. - '💩🦞'], # UCS-4, 4 bytes for UTF-8. 
- dtype='string[pyarrow]'), - 'b': [1, 2, 3, 4, 5]}) - buf = _dataframe(df, table_name_col=0, at=qi.ServerTimestamp) - self.assertEqual( - buf, - 'a b=1i\n' + - ('b' * 127) + ' b=2i\n' + - 'q❤️p b=3i\n' + - '嚜꓂ b=4i\n' + - '💩🦞 b=5i\n') - - with self.assertRaisesRegex( - qi.IngressError, "Too long"): - _dataframe( - pd.DataFrame({ - 'a': pd.Series(['b' * 128], dtype='string[pyarrow]')}), - table_name_col='a', at = qi.ServerTimestamp) - - with self.assertRaisesRegex( - qi.IngressError, "Failed .*.*Table name cannot be null"): - _dataframe( - pd.DataFrame({ - '.': pd.Series(['x', None], dtype='string[pyarrow]'), - 'b': [1, 2]}), - table_name_col='.', at = qi.ServerTimestamp) - - with self.assertRaisesRegex( - qi.IngressError, "''.*must have a non-zero length"): - _dataframe( - pd.DataFrame({ - '/': pd.Series([''], dtype='string[pyarrow]')}), - table_name_col='/', at = qi.ServerTimestamp) - - with self.assertRaisesRegex( - qi.IngressError, "'tab..1'.*invalid dot `\\.` at position 4"): - _dataframe( - pd.DataFrame({ - '/': pd.Series(['tab..1'], dtype='string[pyarrow]')}), - table_name_col='/', at = qi.ServerTimestamp) - - def test_str_arrow_symbol(self): - df = pd.DataFrame({ - 'a': pd.Series([ - 'a', # ASCII - 'q❤️p', # Mixed ASCII and UCS-2 - '❤️' * 1200, # Over the 1024 buffer prealloc. - 'Questo è un qualcosa', # Non-ASCII UCS-1 - 'щось', # UCS-2, 2 bytes for UTF-8. - '', # Empty string - None, - '嚜꓂', # UCS-2, 3 bytes for UTF-8. - '💩🦞'], # UCS-4, 4 bytes for UTF-8. 
- dtype='string[pyarrow]'), - 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - buf = _dataframe(df, table_name='tbl1', symbols=True, at = qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1,a=a b=1i\n' + - 'tbl1,a=q❤️p b=2i\n' + - 'tbl1,a=' + ('❤️' * 1200) + ' b=3i\n' + - 'tbl1,a=Questo\\ è\\ un\\ qualcosa b=4i\n' + - 'tbl1,a=щось b=5i\n' + - 'tbl1,a= b=6i\n' + - 'tbl1 b=7i\n' + - 'tbl1,a=嚜꓂ b=8i\n' + - 'tbl1,a=💩🦞 b=9i\n') - - def test_str_arrow_col(self): - df = pd.DataFrame({ - 'a': pd.Series([ - 'a', # ASCII - 'q❤️p', # Mixed ASCII and UCS-2 - '❤️' * 1200, # Over the 1024 buffer prealloc. - 'Questo è un qualcosa', # Non-ASCII UCS-1 - 'щось', # UCS-2, 2 bytes for UTF-8. - '', # Empty string - None, - '嚜꓂', # UCS-2, 3 bytes for UTF-8. - '💩🦞'], # UCS-4, 4 bytes for UTF-8. - dtype='string[pyarrow]'), - 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) - buf = _dataframe(df, table_name='tbl1', symbols=False, at = qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 a="a",b=1i\n' + - 'tbl1 a="q❤️p",b=2i\n' + - 'tbl1 a="' + ('❤️' * 1200) + '",b=3i\n' + - 'tbl1 a="Questo è un qualcosa",b=4i\n' + - 'tbl1 a="щось",b=5i\n' + - 'tbl1 a="",b=6i\n' + - 'tbl1 b=7i\n' + - 'tbl1 a="嚜꓂",b=8i\n' + - 'tbl1 a="💩🦞",b=9i\n') - - def test_pyobj_int_col(self): - int64_min = -2**63 - int64_max = 2**63 - 1 - self.assertEqual( - _dataframe( - pd.DataFrame({ - 'a': pd.Series([ - 1, 2, 3, None, float('nan'), pd.NA, 7, - 0, - int64_min, - int64_max], dtype='object'), - 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}), - table_name='tbl1', at = qi.ServerTimestamp), - 'tbl1 a=1i,b=1i\n' + - 'tbl1 a=2i,b=2i\n' + - 'tbl1 a=3i,b=3i\n' + - 'tbl1 b=4i\n' + - 'tbl1 b=5i\n' + - 'tbl1 b=6i\n' + - 'tbl1 a=7i,b=7i\n' + - 'tbl1 a=0i,b=8i\n' + - 'tbl1 a=' + str(int64_min) + 'i,b=9i\n' + - 'tbl1 a=' + str(int64_max) + 'i,b=10i\n') - - with self.assertRaisesRegex( - qi.IngressError, "1 \\('STRING'\\): .*type int, got.*str\\."): - _dataframe( - pd.DataFrame({ - 'a': pd.Series([1, 'STRING'], dtype='object'), - 'b': [1, 2]}), - 
table_name='tbl1', at = qi.ServerTimestamp) - - out_of_range = [int64_min - 1, int64_max + 1] - for num in out_of_range: + b'tbl1,x=x y=1i\n' + + b'tbl1,x=42 y=2i\n') + + def test_str_numpy_col(self): + df = pd.DataFrame({'a': pd.Series([ + 'a', # ASCII + 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 1200, # Over the 1024 buffer prealloc. + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. + dtype='str')}) + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + ('tbl1 a="a"\n' + + 'tbl1 a="q❤️p"\n' + + 'tbl1 a="' + ('❤️' * 1200) + '"\n' + + 'tbl1 a="Questo è un qualcosa"\n' + + 'tbl1 a="щось"\n' + + 'tbl1 a=""\n' + + 'tbl1 a="嚜꓂"\n' + + 'tbl1 a="💩🦞"\n').encode("utf-8")) + + def test_str_arrow_table(self): + df = pd.DataFrame({ + '../bad col name/../it does not matter...': pd.Series([ + 'a', # ASCII + 'b' * 127, # Max table name length. + 'q❤️p', # Mixed ASCII and UCS-2 + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. 
+ dtype='string[pyarrow]'), + 'b': [1, 2, 3, 4, 5]}) + buf = _dataframe(self.version, df, table_name_col=0, at=qi.ServerTimestamp) + self.assertEqual( + buf, + ('a b=1i\n' + + ('b' * 127) + ' b=2i\n' + + 'q❤️p b=3i\n' + + '嚜꓂ b=4i\n' + + '💩🦞 b=5i\n').encode("utf-8")) + + with self.assertRaisesRegex( + qi.IngressError, "Too long"): + _dataframe( + self.version, + pd.DataFrame({ + 'a': pd.Series(['b' * 128], dtype='string[pyarrow]')}), + table_name_col='a', at = qi.ServerTimestamp) + + with self.assertRaisesRegex( + qi.IngressError, "Failed .*.*Table name cannot be null"): + _dataframe( + self.version, + pd.DataFrame({ + '.': pd.Series(['x', None], dtype='string[pyarrow]'), + 'b': [1, 2]}), + table_name_col='.', at = qi.ServerTimestamp) + + with self.assertRaisesRegex( + qi.IngressError, "''.*must have a non-zero length"): + _dataframe( + self.version, + pd.DataFrame({ + '/': pd.Series([''], dtype='string[pyarrow]')}), + table_name_col='/', at = qi.ServerTimestamp) + + with self.assertRaisesRegex( + qi.IngressError, "'tab..1'.*invalid dot `\\.` at position 4"): + _dataframe( + self.version, + pd.DataFrame({ + '/': pd.Series(['tab..1'], dtype='string[pyarrow]')}), + table_name_col='/', at = qi.ServerTimestamp) + + def test_str_arrow_symbol(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 'a', # ASCII + 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 1200, # Over the 1024 buffer prealloc. + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string + None, + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. 
+ dtype='string[pyarrow]'), + 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + buf = _dataframe(self.version, df, table_name='tbl1', symbols=True, at = qi.ServerTimestamp) + self.assertEqual( + buf, + ('tbl1,a=a b=1i\n' + + 'tbl1,a=q❤️p b=2i\n' + + 'tbl1,a=' + ('❤️' * 1200) + ' b=3i\n' + + 'tbl1,a=Questo\\ è\\ un\\ qualcosa b=4i\n' + + 'tbl1,a=щось b=5i\n' + + 'tbl1,a= b=6i\n' + + 'tbl1 b=7i\n' + + 'tbl1,a=嚜꓂ b=8i\n' + + 'tbl1,a=💩🦞 b=9i\n').encode('utf-8')) + + def test_str_arrow_col(self): + df = pd.DataFrame({ + 'a': pd.Series([ + 'a', # ASCII + 'q❤️p', # Mixed ASCII and UCS-2 + '❤️' * 1200, # Over the 1024 buffer prealloc. + 'Questo è un qualcosa', # Non-ASCII UCS-1 + 'щось', # UCS-2, 2 bytes for UTF-8. + '', # Empty string + None, + '嚜꓂', # UCS-2, 3 bytes for UTF-8. + '💩🦞'], # UCS-4, 4 bytes for UTF-8. + dtype='string[pyarrow]'), + 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) + buf = _dataframe(self.version, df, table_name='tbl1', symbols=False, at = qi.ServerTimestamp) + self.assertEqual( + buf, + ('tbl1 a="a",b=1i\n' + + 'tbl1 a="q❤️p",b=2i\n' + + 'tbl1 a="' + ('❤️' * 1200) + '",b=3i\n' + + 'tbl1 a="Questo è un qualcosa",b=4i\n' + + 'tbl1 a="щось",b=5i\n' + + 'tbl1 a="",b=6i\n' + + 'tbl1 b=7i\n' + + 'tbl1 a="嚜꓂",b=8i\n' + + 'tbl1 a="💩🦞",b=9i\n').encode('utf-8')) + + def test_pyobj_int_col(self): + int64_min = -2**63 + int64_max = 2**63 - 1 + self.assertEqual( + _dataframe( + self.version, + pd.DataFrame({ + 'a': pd.Series([ + 1, 2, 3, None, float('nan'), pd.NA, 7, + 0, + int64_min, + int64_max], dtype='object'), + 'b': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}), + table_name='tbl1', at = qi.ServerTimestamp), + ('tbl1 a=1i,b=1i\n' + + 'tbl1 a=2i,b=2i\n' + + 'tbl1 a=3i,b=3i\n' + + 'tbl1 b=4i\n' + + 'tbl1 b=5i\n' + + 'tbl1 b=6i\n' + + 'tbl1 a=7i,b=7i\n' + + 'tbl1 a=0i,b=8i\n' + + 'tbl1 a=' + str(int64_min) + 'i,b=9i\n' + + 'tbl1 a=' + str(int64_max) + 'i,b=10i\n').encode('utf-8')) + + with self.assertRaisesRegex( + qi.IngressError, "1 \\('STRING'\\): .*type int, got.*str\\."): + _dataframe( + 
self.version, + pd.DataFrame({ + 'a': pd.Series([1, 'STRING'], dtype='object'), + 'b': [1, 2]}), + table_name='tbl1', at = qi.ServerTimestamp) + + out_of_range = [int64_min - 1, int64_max + 1] + for num in out_of_range: + with self.assertRaisesRegex( + qi.IngressError, "index 1 .*922337203685477.*int too big"): + _dataframe( + self.version, + pd.DataFrame({ + 'a': pd.Series([1, num], dtype='object'), + 'b': [1, 2]}), + table_name='tbl1', at = qi.ServerTimestamp) + + def test_pyobj_float_col(self): + self.assertEqual( + _dataframe( + self.version, + pd.DataFrame({ + 'a': pd.Series( + [1.0, 2.0, 3.0, None, float('nan'), pd.NA, 7.0], + dtype='object'), + 'b': [1, 2, 3, 4, 5, 6, 7]}), + table_name='tbl1', at = qi.ServerTimestamp), + b'tbl1 a' + _float_binary_bytes(1.0, self.version == 1) + b',b=1i\n' + + b'tbl1 a' + _float_binary_bytes(2.0, self.version == 1) + b',b=2i\n' + + b'tbl1 a' + _float_binary_bytes(3.0, self.version == 1) + b',b=3i\n' + + b'tbl1 b=4i\n' + + b'tbl1 a' + _float_binary_bytes(float('NaN'), self.version == 1) + b',b=5i\n' + + b'tbl1 b=6i\n' + + b'tbl1 a' + _float_binary_bytes(7.0, self.version == 1) + b',b=7i\n') + with self.assertRaisesRegex( - qi.IngressError, "index 1 .*922337203685477.*int too big"): + qi.IngressError, "1 \\('STRING'\\): .*type float, got.*str\\."): _dataframe( + self.version, pd.DataFrame({ - 'a': pd.Series([1, num], dtype='object'), + 'a': pd.Series([1.0, 'STRING'], dtype='object'), 'b': [1, 2]}), table_name='tbl1', at = qi.ServerTimestamp) - def test_pyobj_float_col(self): - self.assertEqual( - _dataframe( - pd.DataFrame({ - 'a': pd.Series( - [1.0, 2.0, 3.0, None, float('nan'), pd.NA, 7.0], - dtype='object'), - 'b': [1, 2, 3, 4, 5, 6, 7]}), - table_name='tbl1', at = qi.ServerTimestamp), - 'tbl1 a=1.0,b=1i\n' + - 'tbl1 a=2.0,b=2i\n' + - 'tbl1 a=3.0,b=3i\n' + - 'tbl1 b=4i\n' + - 'tbl1 a=NaN,b=5i\n' + - 'tbl1 b=6i\n' + - 'tbl1 a=7.0,b=7i\n') - - with self.assertRaisesRegex( - qi.IngressError, "1 \\('STRING'\\): .*type float, 
got.*str\\."): - _dataframe( - pd.DataFrame({ - 'a': pd.Series([1.0, 'STRING'], dtype='object'), - 'b': [1, 2]}), - table_name='tbl1', at = qi.ServerTimestamp) - - def test_bad_category(self): - # We only support string categories - # (unless anyone asks for additional ones). - # We want to test others are rejected. - with self.assertRaisesRegex( - qi.IngressError, "Bad column 'a'.*got a category of .*int64"): - _dataframe( - pd.DataFrame({'a': pd.Series([1, 2, 3, 2], dtype='category')}), - table_name='tbl1', at = qi.ServerTimestamp) - - def _test_cat_table(self, count): - slist = [f's{i}' for i in range(count)] - - df = pd.DataFrame({ - 'a': pd.Series(slist, dtype='category'), - 'b': list(range(len(slist)))}) - - buf = _dataframe(df, table_name_col=0, at = qi.ServerTimestamp) - exp = ''.join( - f'{s} b={i}i\n' - for i, s in enumerate(slist)) - self.assertEqual(buf, exp) - - slist[2] = None - df2 = pd.DataFrame({ - 'a': pd.Series(slist, dtype='category'), - 'b': list(range(len(slist)))}) - with self.assertRaisesRegex( - qi.IngressError, 'Table name cannot be null'): - _dataframe(df2, table_name_col=0, at = qi.ServerTimestamp) - - def test_cat_i8_table(self): - self._test_cat_table(30) - self._test_cat_table(127) - - def test_cat_i16_table(self): - self._test_cat_table(128) - self._test_cat_table(4000) - self._test_cat_table(32767) - - def test_cat_i32_table(self): - self._test_cat_table(32768) - self._test_cat_table(40000) - - def _test_cat_symbol(self, count): - slist = [f's{i}' for i in range(count)] - - df = pd.DataFrame({ - 'a': pd.Series(slist, dtype='category'), - 'b': list(range(len(slist)))}) - - buf = _dataframe(df, table_name='tbl1', symbols=True, at = qi.ServerTimestamp) - exp = ''.join( - f'tbl1,a={s} b={i}i\n' - for i, s in enumerate(slist)) - self.assertEqual(buf, exp) - - slist[2] = None - df2 = pd.DataFrame({ - 'a': pd.Series(slist, dtype='category'), - 'b': list(range(len(slist)))}) - - exp2 = exp.replace('tbl1,a=s2 b=2i\n', 'tbl1 b=2i\n') - buf2 = 
_dataframe(df2, table_name='tbl1', symbols=True, at = qi.ServerTimestamp) - self.assertEqual(buf2, exp2) - - def test_cat_i8_symbol(self): - self._test_cat_symbol(30) - self._test_cat_symbol(127) - - def test_cat_i16_symbol(self): - self._test_cat_symbol(128) - self._test_cat_symbol(4000) - self._test_cat_symbol(32767) - - def test_cat_i32_symbol(self): - self._test_cat_symbol(32768) - self._test_cat_symbol(40000) - - def _test_cat_str(self, count): - slist = [f's{i}' for i in range(count)] - - df = pd.DataFrame({ - 'a': pd.Series(slist, dtype='category'), - 'b': list(range(len(slist)))}) - - buf = _dataframe(df, table_name='tbl1', symbols=False, at = qi.ServerTimestamp) - exp = ''.join( - f'tbl1 a="{s}",b={i}i\n' - for i, s in enumerate(slist)) - self.assertEqual(buf, exp) - - slist[2] = None - df2 = pd.DataFrame({ - 'a': pd.Series(slist, dtype='category'), - 'b': list(range(len(slist)))}) - - exp2 = exp.replace('tbl1 a="s2",b=2i\n', 'tbl1 b=2i\n') - buf2 = _dataframe(df2, table_name='tbl1', symbols=False, at = qi.ServerTimestamp) - self.assertEqual(buf2, exp2) - - def test_cat_i8_str(self): - self._test_cat_str(30) - self._test_cat_str(127) - - def test_cat_i16_str(self): - self._test_cat_str(128) - self._test_cat_str(4000) - self._test_cat_str(32767) - - def test_cat_i32_str(self): - self._test_cat_str(32768) - self._test_cat_str(40000) - - def test_all_nulls_pyobj_col(self): - df = pd.DataFrame({ - 'a': [None, pd.NA, float('nan')], - 'b': [1, 2, 3]}) - buf = _dataframe(df, table_name='tbl1', at = qi.ServerTimestamp) - self.assertEqual( - buf, - 'tbl1 b=1i\n' + - 'tbl1 b=2i\n' + - 'tbl1 b=3i\n') - - def test_strided_numpy_column(self): - two_d = np.array([ - [1, 10], - [2, 20], - [3, 30]], dtype='int64') - col2 = two_d[:, 1] - col2.flags['WRITEABLE'] = False - - # Checking our test case setup. 
- mv = memoryview(col2) - self.assertEqual(mv.contiguous, False) - self.assertEqual(mv.strides, (16,)) - - df = pd.DataFrame(col2, copy=False) - df.columns = ['a'] - - with self.assertRaisesRegex( - qi.IngressError, "Bad column 'a': .*not.*contiguous"): - _dataframe(df, table_name='tbl1', at = qi.ServerTimestamp) - - def test_serializing_in_chunks(self): - df = pd.DataFrame({ - 'a': pd.Series(np.arange(30), dtype='int64'), - 'b': pd.Series(np.arange(30), dtype='Int64')}) - parts = [ - df.iloc[:10], - df.iloc[10:20], - df.iloc[20:]] - for index, part in enumerate(parts): - buf = _dataframe(part, table_name='tbl1', at = qi.ServerTimestamp) + def test_bad_category(self): + # We only support string categories + # (unless anyone asks for additional ones). + # We want to test others are rejected. + with self.assertRaisesRegex( + qi.IngressError, "Bad column 'a'.*got a category of .*int64"): + _dataframe( + self.version, + pd.DataFrame({'a': pd.Series([1, 2, 3, 2], dtype='category')}), + table_name='tbl1', at = qi.ServerTimestamp) + + def _test_cat_table(self, count): + slist = [f's{i}' for i in range(count)] + + df = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + buf = _dataframe(self.version, df, table_name_col=0, at = qi.ServerTimestamp) + exp = ''.join( + f'{s} b={i}i\n' + for i, s in enumerate(slist)) + self.assertEqual(buf, exp.encode("utf-8")) + + slist[2] = None + df2 = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + with self.assertRaisesRegex( + qi.IngressError, 'Table name cannot be null'): + _dataframe(self.version, df2, table_name_col=0, at = qi.ServerTimestamp) + + def test_cat_i8_table(self): + self._test_cat_table(30) + self._test_cat_table(127) + + def test_cat_i16_table(self): + self._test_cat_table(128) + self._test_cat_table(4000) + self._test_cat_table(32767) + + def test_cat_i32_table(self): + self._test_cat_table(32768) + self._test_cat_table(40000) + + def 
_test_cat_symbol(self, count): + slist = [f's{i}' for i in range(count)] + + df = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + buf = _dataframe(self.version, df, table_name='tbl1', symbols=True, at = qi.ServerTimestamp) + exp = ''.join( + f'tbl1,a={s} b={i}i\n' + for i, s in enumerate(slist)) + self.assertEqual(buf, exp.encode("utf-8")) + + slist[2] = None + df2 = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + exp2 = exp.replace('tbl1,a=s2 b=2i\n', 'tbl1 b=2i\n') + buf2 = _dataframe(self.version, df2, table_name='tbl1', symbols=True, at = qi.ServerTimestamp) + self.assertEqual(buf2, exp2.encode("utf-8")) + + def test_cat_i8_symbol(self): + self._test_cat_symbol(30) + self._test_cat_symbol(127) + + def test_cat_i16_symbol(self): + self._test_cat_symbol(128) + self._test_cat_symbol(4000) + self._test_cat_symbol(32767) + + def test_cat_i32_symbol(self): + self._test_cat_symbol(32768) + self._test_cat_symbol(40000) + + def _test_cat_str(self, count): + slist = [f's{i}' for i in range(count)] + + df = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + buf = _dataframe(self.version, df, table_name='tbl1', symbols=False, at = qi.ServerTimestamp) exp = ''.join( - f'tbl1 a={i}i,b={i}i\n' - for i in range(index * 10, (index + 1) * 10)) + f'tbl1 a="{s}",b={i}i\n' + for i, s in enumerate(slist)) + self.assertEqual(buf, exp.encode("utf-8")) + + slist[2] = None + df2 = pd.DataFrame({ + 'a': pd.Series(slist, dtype='category'), + 'b': list(range(len(slist)))}) + + exp2 = exp.replace('tbl1 a="s2",b=2i\n', 'tbl1 b=2i\n') + buf2 = _dataframe(self.version, df2, table_name='tbl1', symbols=False, at = qi.ServerTimestamp) + self.assertEqual(buf2, exp2.encode("utf-8")) + + def test_cat_i8_str(self): + self._test_cat_str(30) + self._test_cat_str(127) + + def test_cat_i16_str(self): + self._test_cat_str(128) + self._test_cat_str(4000) + 
self._test_cat_str(32767) + + def test_cat_i32_str(self): + self._test_cat_str(32768) + self._test_cat_str(40000) + + def test_all_nulls_pyobj_col(self): + df = pd.DataFrame({ + 'a': [None, pd.NA, float('nan')], + 'b': [1, 2, 3]}) + buf = _dataframe(self.version, df, table_name='tbl1', at = qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 b=1i\n' + + b'tbl1 b=2i\n' + + b'tbl1 b=3i\n') + + def test_strided_numpy_column(self): + two_d = np.array([ + [1, 10], + [2, 20], + [3, 30]], dtype='int64') + col2 = two_d[:, 1] + col2.flags['WRITEABLE'] = False + + # Checking our test case setup. + mv = memoryview(col2) + self.assertEqual(mv.contiguous, False) + self.assertEqual(mv.strides, (16,)) + + df = pd.DataFrame(col2, copy=False) + df.columns = ['a'] + + with self.assertRaisesRegex( + qi.IngressError, "Bad column 'a': .*not.*contiguous"): + _dataframe(self.version, df, table_name='tbl1', at = qi.ServerTimestamp) + + def test_serializing_in_chunks(self): + df = pd.DataFrame({ + 'a': pd.Series(np.arange(30), dtype='int64'), + 'b': pd.Series(np.arange(30), dtype='Int64')}) + parts = [ + df.iloc[:10], + df.iloc[10:20], + df.iloc[20:]] + for index, part in enumerate(parts): + buf = _dataframe(self.version, part, table_name='tbl1', at = qi.ServerTimestamp) + exp = ''.join( + f'tbl1 a={i}i,b={i}i\n' + for i in range(index * 10, (index + 1) * 10)) + self.assertEqual(buf, exp.encode("utf-8")) + + def test_arrow_chunked_array(self): + # We build a table with chunked arrow arrays as columns. 
+ chunks_a = [ + pa.array([1, 2, 3], type=pa.int16()), + pa.array([4, 5, 6], type=pa.int16()), + pa.array([], type=pa.int16()), + pa.array([7, 8, 9], type=pa.int16())] + chunked_a = pa.chunked_array(chunks_a) + chunks_b = [ + pa.array([10, 20], type=pa.int32()), + pa.array([], type=pa.int32()), + pa.array([30, 40, 50, 60], type=pa.int32()), + pa.array([70, 80, 90], type=pa.int32())] + chunked_b = pa.chunked_array(chunks_b) + arr_tab = pa.Table.from_arrays([chunked_a, chunked_b], names=['a', 'b']) + + # NOTE! + # This does *not* preserve the chunking of the arrow arrays. + df = arr_tab.to_pandas() + buf = _dataframe(self.version, df, table_name='tbl1', at = qi.ServerTimestamp) + exp = ( + b'tbl1 a=1i,b=10i\n' + + b'tbl1 a=2i,b=20i\n' + + b'tbl1 a=3i,b=30i\n' + + b'tbl1 a=4i,b=40i\n' + + b'tbl1 a=5i,b=50i\n' + + b'tbl1 a=6i,b=60i\n' + + b'tbl1 a=7i,b=70i\n' + + b'tbl1 a=8i,b=80i\n' + + b'tbl1 a=9i,b=90i\n') self.assertEqual(buf, exp) - def test_arrow_chunked_array(self): - # We build a table with chunked arrow arrays as columns. - chunks_a = [ - pa.array([1, 2, 3], type=pa.int16()), - pa.array([4, 5, 6], type=pa.int16()), - pa.array([], type=pa.int16()), - pa.array([7, 8, 9], type=pa.int16())] - chunked_a = pa.chunked_array(chunks_a) - chunks_b = [ - pa.array([10, 20], type=pa.int32()), - pa.array([], type=pa.int32()), - pa.array([30, 40, 50, 60], type=pa.int32()), - pa.array([70, 80, 90], type=pa.int32())] - chunked_b = pa.chunked_array(chunks_b) - arr_tab = pa.Table.from_arrays([chunked_a, chunked_b], names=['a', 'b']) - - # NOTE! - # This does *not* preserve the chunking of the arrow arrays. 
- df = arr_tab.to_pandas() - buf = _dataframe(df, table_name='tbl1', at = qi.ServerTimestamp) - exp = ( - 'tbl1 a=1i,b=10i\n' + - 'tbl1 a=2i,b=20i\n' + - 'tbl1 a=3i,b=30i\n' + - 'tbl1 a=4i,b=40i\n' + - 'tbl1 a=5i,b=50i\n' + - 'tbl1 a=6i,b=60i\n' + - 'tbl1 a=7i,b=70i\n' + - 'tbl1 a=8i,b=80i\n' + - 'tbl1 a=9i,b=90i\n') - self.assertEqual(buf, exp) - - if not hasattr(pd, 'ArrowDtype'): - # We don't have pandas ArrowDtype, so we can't test the rest. - return - - # To preserve the chunking we need to use a special pandas type: - pandarrow_a = pd.array(chunked_a, dtype='int16[pyarrow]') - pandarrow_b = pd.array(chunked_b, dtype='int32[pyarrow]') - df = pd.DataFrame({'a': pandarrow_a, 'b': pandarrow_b}) - - # Note that this dtype is experimental (currently), - # so we don't support it yet.. but we have everything in place should we - # need to, so - as for now - we just test that we raise a nice error. - with self.assertRaisesRegex( - qi.IngressError, - "Unsupported dtype int16\[pyarrow\] for column 'a'.*github"): - _dataframe(df, table_name='tbl1', at = qi.ServerTimestamp) - - @unittest.skipIf(not fastparquet, 'fastparquet not installed') - @with_tmp_dir - def test_parquet_roundtrip(self, tmpdir): - pa_parquet_path = tmpdir / 'test_pa.parquet' - fp_parquet_path = tmpdir / 'test_fp.parquet' - df = pd.DataFrame({ - 's': pd.Categorical(['a', 'b', 'a', 'c', 'a']), - 'a': pd.Series([1, 2, 3, 4, 5], dtype='int16'), - 'b': pd.Series([10, 20, 30, None, 50], dtype='UInt8'), - 'c': [0.5, float('nan'), 2.5, 3.5, None]}) - df.to_parquet(pa_parquet_path, engine='pyarrow') - df.to_parquet(fp_parquet_path, engine='fastparquet') - pa2pa_df = pd.read_parquet(pa_parquet_path, engine='pyarrow') - pa2fp_df = pd.read_parquet(pa_parquet_path, engine='fastparquet') - fp2pa_df = pd.read_parquet(fp_parquet_path, engine='pyarrow') - fp2fp_df = pd.read_parquet(fp_parquet_path, engine='fastparquet') - - exp_dtypes = ['category', 'int16', 'UInt8', 'float64'] - self.assertEqual(list(df.dtypes), 
exp_dtypes) - - def df_eq(exp_df, deser_df, exp_dtypes): - self.assertEqual(list(deser_df.dtypes), exp_dtypes) - if not exp_df.equals(deser_df): - print('\nexp_df:') - print(exp_df) - print('\ndeser_df:') - print(deser_df) - self.assertTrue(exp_df.equals(deser_df)) - - # fastparquet doesn't roundtrip with pyarrow parquet properly. - # It decays categories to object and UInt8 to float64. - # We need to set up special case expected results for that. - fallback_exp_dtypes = [ - np.dtype('O'), - np.dtype('int16'), - np.dtype('float64'), - np.dtype('float64')] - fallback_df = df.astype({'s': 'object', 'b': 'float64'}) - - df_eq(df, pa2pa_df, exp_dtypes) - df_eq(df, pa2fp_df, exp_dtypes) - df_eq(fallback_df, fp2pa_df, fallback_exp_dtypes) - df_eq(df, fp2fp_df, exp_dtypes) - - exp = ( - 'tbl1,s=a a=1i,b=10i,c=0.5\n' + - 'tbl1,s=b a=2i,b=20i,c=NaN\n' + - 'tbl1,s=a a=3i,b=30i,c=2.5\n' + - 'tbl1,s=c a=4i,c=3.5\n' + - 'tbl1,s=a a=5i,b=50i,c=NaN\n') - - fallback_exp = ( - 'tbl1 s="a",a=1i,b=10.0,c=0.5\n' + - 'tbl1 s="b",a=2i,b=20.0,c=NaN\n' + - 'tbl1 s="a",a=3i,b=30.0,c=2.5\n' + - 'tbl1 s="c",a=4i,b=NaN,c=3.5\n' + - 'tbl1 s="a",a=5i,b=50.0,c=NaN\n') - - self.assertEqual(_dataframe(df, table_name='tbl1', at=qi.ServerTimestamp), exp) - self.assertEqual(_dataframe(pa2pa_df, table_name='tbl1', at=qi.ServerTimestamp), exp) - self.assertEqual(_dataframe(pa2fp_df, table_name='tbl1', at=qi.ServerTimestamp), exp) - self.assertEqual(_dataframe(fp2pa_df, table_name='tbl1', at=qi.ServerTimestamp), fallback_exp) - self.assertEqual(_dataframe(fp2fp_df, table_name='tbl1', at=qi.ServerTimestamp), exp) + if not hasattr(pd, 'ArrowDtype'): + # We don't have pandas ArrowDtype, so we can't test the rest. 
+ return + + # To preserve the chunking we need to use a special pandas type: + pandarrow_a = pd.array(chunked_a, dtype='int16[pyarrow]') + pandarrow_b = pd.array(chunked_b, dtype='int32[pyarrow]') + df = pd.DataFrame({'a': pandarrow_a, 'b': pandarrow_b}) + + # Note that this dtype is experimental (currently), + # so we don't support it yet.. but we have everything in place should we + # need to, so - as for now - we just test that we raise a nice error. + with self.assertRaisesRegex( + qi.IngressError, + "Unsupported dtype int16\[pyarrow\] for column 'a'.*github"): + _dataframe(self.version, df, table_name='tbl1', at = qi.ServerTimestamp) + + @unittest.skipIf(not fastparquet, 'fastparquet not installed') + @with_tmp_dir + def test_parquet_roundtrip(self, tmpdir): + pa_parquet_path = tmpdir / 'test_pa.parquet' + fp_parquet_path = tmpdir / 'test_fp.parquet' + df = pd.DataFrame({ + 's': pd.Categorical(['a', 'b', 'a', 'c', 'a']), + 'a': pd.Series([1, 2, 3, 4, 5], dtype='int16'), + 'b': pd.Series([10, 20, 30, None, 50], dtype='UInt8'), + 'c': [0.5, float('nan'), 2.5, 3.5, None]}) + df.to_parquet(pa_parquet_path, engine='pyarrow') + df.to_parquet(fp_parquet_path, engine='fastparquet') + pa2pa_df = pd.read_parquet(pa_parquet_path, engine='pyarrow') + pa2fp_df = pd.read_parquet(pa_parquet_path, engine='fastparquet') + fp2pa_df = pd.read_parquet(fp_parquet_path, engine='pyarrow') + fp2fp_df = pd.read_parquet(fp_parquet_path, engine='fastparquet') + + exp_dtypes = ['category', 'int16', 'UInt8', 'float64'] + self.assertEqual(list(df.dtypes), exp_dtypes) + + def df_eq(exp_df, deser_df, exp_dtypes): + self.assertEqual(list(deser_df.dtypes), exp_dtypes) + if not exp_df.equals(deser_df): + print('\nexp_df:') + print(exp_df) + print('\ndeser_df:') + print(deser_df) + self.assertTrue(exp_df.equals(deser_df)) + + # fastparquet doesn't roundtrip with pyarrow parquet properly. + # It decays categories to object and UInt8 to float64. 
+ # We need to set up special case expected results for that. + fallback_exp_dtypes = [ + np.dtype('O'), + np.dtype('int16'), + np.dtype('float64'), + np.dtype('float64')] + fallback_df = df.astype({'s': 'object', 'b': 'float64'}) + + df_eq(df, pa2pa_df, exp_dtypes) + df_eq(df, pa2fp_df, exp_dtypes) + df_eq(fallback_df, fp2pa_df, fallback_exp_dtypes) + df_eq(df, fp2fp_df, exp_dtypes) + + exp = ( + b'tbl1,s=a a=1i,b=10i,c' + _float_binary_bytes(0.5, self.version == 1) + b'\n' + + b'tbl1,s=b a=2i,b=20i,c' + _float_binary_bytes(float('NaN'), self.version == 1) + b'\n' + + b'tbl1,s=a a=3i,b=30i,c' + _float_binary_bytes(2.5, self.version == 1) + b'\n' + + b'tbl1,s=c a=4i,c' + _float_binary_bytes(3.5, self.version == 1) + b'\n' + + b'tbl1,s=a a=5i,b=50i,c' + _float_binary_bytes(float('NaN'), self.version == 1) + b'\n') + + fallback_exp = ( + b'tbl1 s="a",a=1i,b' + _float_binary_bytes(10.0, self.version == 1) + b',c' + + _float_binary_bytes(0.5, self.version == 1) + b'\n' + + b'tbl1 s="b",a=2i,b' + _float_binary_bytes(20.0, self.version == 1) + b',c' + + _float_binary_bytes(float('NaN'), self.version == 1) + b'\n' + + b'tbl1 s="a",a=3i,b' + _float_binary_bytes(30.0, self.version == 1) + b',c' + + _float_binary_bytes(2.5, self.version == 1) + b'\n' + + b'tbl1 s="c",a=4i,b' + _float_binary_bytes(float('NaN'), self.version == 1) + b',c' + + _float_binary_bytes(3.5, self.version == 1) + b'\n' + + b'tbl1 s="a",a=5i,b' + _float_binary_bytes(50.0, self.version == 1) + b',c' + + _float_binary_bytes(float('NaN'), self.version == 1) + b'\n') + + self.assertEqual(_dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp), exp) + self.assertEqual(_dataframe(self.version, pa2pa_df, table_name='tbl1', at=qi.ServerTimestamp), exp) + self.assertEqual(_dataframe(self.version, pa2fp_df, table_name='tbl1', at=qi.ServerTimestamp), exp) + self.assertEqual(_dataframe(self.version, fp2pa_df, table_name='tbl1', at=qi.ServerTimestamp), fallback_exp) + 
self.assertEqual(_dataframe(self.version, fp2fp_df, table_name='tbl1', at=qi.ServerTimestamp), exp) + + def test_f64_np_array(self): + df = pd.DataFrame({ + 'a': [np.array([1.0], np.float64), np.array([2.0], np.float64), np.array([3.0], np.float64)]}) + + if self.version == 1: + with self.assertRaisesRegex( + qi.IngressError, + "Protocol version v1 does not support array datatype"): + _ = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + else: + buf = _dataframe(self.version, df, table_name='tbl1', at=qi.ServerTimestamp) + self.assertEqual( + buf, + b'tbl1 a=' + _array_binary_bytes(np.array([1.0], np.float64)) + b'\n' + + b'tbl1 a=' + _array_binary_bytes(np.array([2.0], np.float64)) + b'\n' + + b'tbl1 a=' + _array_binary_bytes(np.array([3.0], np.float64)) + b'\n') + +class TestPandasProtocolVersionV1(TestPandasBase.TestPandas): + name = 'protocol version 1' + version = 1 + + +class TestPandasProtocolVersionV2(TestPandasBase.TestPandas): + name = 'protocol version 2' + version = 2 if __name__ == '__main__': diff --git a/test/test_dataframe_fuzz.py b/test/test_dataframe_fuzz.py index 292b313c..045acdc8 100644 --- a/test/test_dataframe_fuzz.py +++ b/test/test_dataframe_fuzz.py @@ -149,7 +149,7 @@ def test_dataframe(input_bytes): df, table_name, table_name_col, symbols, at = params try: - BUF = qi.Buffer() + BUF = qi.Buffer(protocol_version=2) BUF.clear() try: BUF.dataframe( diff --git a/test/test_dataframe_leaks.py b/test/test_dataframe_leaks.py index 15b2a229..eab23314 100644 --- a/test/test_dataframe_leaks.py +++ b/test/test_dataframe_leaks.py @@ -12,7 +12,7 @@ def get_rss(): def serialize_and_cleanup(): - # qi.Buffer().row( + # qi.Buffer(protocol_version=2).row( # 'table_name', # symbols={'x': 'a', 'y': 'b'}, # columns={'a': 1, 'b': 2, 'c': 3}) @@ -20,7 +20,7 @@ def serialize_and_cleanup(): 'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], 'b': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'c': [7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22]}) - qi.Buffer().dataframe(df, table_name='test', at=qi.ServerTimestamp) + qi.Buffer(protocol_version=2).dataframe(df, table_name='test', at=qi.ServerTimestamp) def main(): diff --git a/test/test_tools.py b/test/test_tools.py new file mode 100644 index 00000000..69da3ae2 --- /dev/null +++ b/test/test_tools.py @@ -0,0 +1,53 @@ + +import struct +import numpy as np + +ARRAY_TYPE_TAGS = { + np.float64: 10, +} + +import math +import struct + +def _float_binary_bytes(value: float, text_format: bool = False) -> bytes: + if text_format: + if math.isnan(value): + return b'=NaN' + elif math.isinf(value): + return f'={"-Infinity" if value < 0 else "Infinity"}'.encode('utf-8') + else: + return f'={value}'.encode('utf-8').replace(b'+', b'') + else: + return b'==' + struct.pack(' bytes: + header = b'=' + format_type = struct.pack('