Skip to content

ENH: Dukascopy tick data #235

Open
Open
@femtotrader

Description

@femtotrader

Dukascopy (also #153) provides ticks data

http://www.dukascopy.com/datafeed/EURUSD/2016/02/14/20h_ticks.bi5
is ticks for 2016-02-14 (datetime.datetime(2016, 2, 14)) from 08PM to 09PM
This is LZMA compressed data.

https://github.com/thalesians/pythalesians provides some Python code to download and process data.
https://github.com/thalesians/pythalesians/blob/4974a26c58fde1b4a86e6b683494a7ccd8fb6e2e/pythalesians/market/loaders/lowlevel/brokers/loaderdukascopy.py
they are using Struct https://docs.python.org/2/library/struct.html
but Numpy fromfile http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.fromfile.html
or Construct http://construct.readthedocs.org/ might also be used

see also https://github.com/ninety47/dukascopy
http://eareview.net/tick-data/dukascopy-php-scripts
https://github.com/FX31337/FX-BT-Scripts
http://stackoverflow.com/questions/14035808/reading-data-from-dukascopy-tick-binary-file
http://stackoverflow.com/questions/30389417/not-sure-how-to-uncompress-read-result-from-binary-file

WIP:

import click

# import struct
import datetime

try:
    import lzma
except ImportError:
    # pip install backports.lzma
    from backports import lzma

import numpy as np
import pandas as pd
from collections import OrderedDict

import requests
import requests_cache

import warnings
from pandas_datareader._utils import RemoteDataError, SymbolWarning


def _init_session(session):
    """Return *session* unchanged, or a fresh ``requests.Session`` when None."""
    return requests.Session() if session is None else session


# def chunks(lst, n):
#     if n < 1:
#         n = 1
#     return [lst[i:i + n] for i in range(0, len(lst), n)]


def _sanitize_dates(start, end):
    """
    Return (datetime_start, datetime_end) tuple
    if start is None - default is 2016/02/04
    if end is None - default is today
    """
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)

    lst_not_a_time = [None, pd.NaT]

    if start in lst_not_a_time:
        start = datetime.datetime(2016, 2, 4)
    else:
        start = datetime.datetime(start.year, start.month, start.day,
                                  start.hour)

    if end in lst_not_a_time:
        end = start
    else:
        end = datetime.datetime(end.year, end.month, end.day, end.hour)

    return start, end


def _sanitize_symbol(symb):
    return symb.replace("/", "").upper()


def read(symbols, start, stop=None, session=None):
    """Fetch tick data for every symbol in *symbols* between *start* and *stop*."""
    return _read_several_several_symbols(
        _read_one_symbol, symbols, start, stop, session)


def _read_several_several_symbols(fct_read, symbols, start, stop=None, session=None):
    if len(symbols) == 1:
        df = fct_read(symbols[0], start, stop, session)
        return df
    else:
        d_df_symb = OrderedDict()
        failed = []
        for symb in symbols:
            try:
                d_df_symb[symb] = fct_read(symb, start, stop, session)
            except RemoteDataError:
                msg = 'Failed to read symbol: {0!r}'
                warnings.warn(msg.format(symb), SymbolWarning)
                failed.append(symb)
        return pd.Panel(d_df_symb).swapaxes('items', 'minor')


def _read_one_symbol(symb, start, stop, session):
    """Download tick data for one symbol, one hourly chunk at a time.

    Returns a DataFrame indexed by tick timestamp ("Date") covering every
    hour in [start, end]; raises ``RemoteDataError`` (from ``_read_chunk``)
    when a chunk cannot be downloaded.
    """
    start, end = _sanitize_dates(start, stop)

    # BUG FIX: use the sanitized `end` here.  The original passed the raw
    # `stop`, which broke when stop was None (default) or not hour-aligned.
    dt_chunks = pd.date_range(start, end, freq="1H")

    lst = [_read_chunk(symb, dt_chunk, session) for dt_chunk in dt_chunks]
    if not lst:
        # Empty hour range (end < start): pd.concat([]) would raise, so
        # return an empty frame with the expected shape instead.
        columns = ["Ask", "Bid", "AskVolume", "BidVolume"]
        return pd.DataFrame(columns=columns,
                            index=pd.DatetimeIndex([], name="Date"))
    return pd.concat(lst, axis=0)


def _read_chunk(symb, dt_chunk, session=None):
    """Download and decode one hourly ``.bi5`` tick file from Dukascopy.

    Parameters
    ----------
    symb : str
        Symbol, e.g. "EUR/USD" or "EURUSD" (sanitized here).
    dt_chunk : datetime-like
        Hour to fetch; year/month/day/hour select the remote file.
    session : requests.Session, optional
        Reused/cached HTTP session; a plain one is created when None.

    Returns a DataFrame indexed by "Date" with Ask/Bid prices and volumes;
    raises ``RemoteDataError`` on a non-200 response.
    """
    session = _init_session(session)
    symb = _sanitize_symbol(symb)

    base_url = "http://www.dukascopy.com/datafeed"
    endpoint = "/%s/%04d/%02d/%02d/%02dh_ticks.bi5" \
        % (symb, dt_chunk.year, dt_chunk.month, dt_chunk.day, dt_chunk.hour)
    url = base_url + endpoint
    # Example:
    # http://www.dukascopy.com/datafeed/EURUSD/2016/02/14/20h_ticks.bi5

    response = session.get(url)

    if response.status_code != 200:
        raise RemoteDataError("Can't download %r using %r" % (symb, url))

    compressed_data = response.content
    columns = ["Date", "Ask", "Bid", "AskVolume", "BidVolume"]

    if len(compressed_data) == 0:
        # Dukascopy serves an empty body for hours with no ticks.
        return pd.DataFrame(columns=columns).set_index("Date")

    raw_data = lzma.decompress(compressed_data)
    # Each 20-byte record is big-endian (struct format ">LLLff"):
    #   Date                 uint32, ms offset within the hour
    #   Ask, Bid             uint32, prices as scaled integers ("points")
    #   AskVolume, BidVolume float32
    record_dtype = np.dtype([
        ('Date', '>u4'),
        ('Ask', '>u4'),
        ('Bid', '>u4'),
        ('AskVolume', '>f4'),
        ('BidVolume', '>f4'),
    ])
    # np.fromstring is deprecated for binary input; np.frombuffer is the
    # supported (zero-copy) replacement.
    data = np.frombuffer(raw_data, record_dtype)

    df = pd.DataFrame(data, columns=columns)
    # JPY-quoted pairs use 3 decimal places; all others 5.
    p_digits = 3 if symb[3:] == "JPY" else 5
    scale = 10 ** p_digits
    for p in ["Ask", "Bid"]:
        df[p] = df[p] / scale
    df["Date"] = dt_chunk + pd.to_timedelta(df["Date"], unit="ms")
    return df.set_index("Date")


@click.command()
@click.option('--symb', default="EURUSD", help='Symbol.')
@click.option('--start', default="2016-02-15 00:00:00", help='Start.')
@click.option('--stop', default="2016-02-15 23:00:00", help='Stop.')
def main(symb, start, stop):
    """CLI entry point: fetch tick data and print the resulting frame."""
    pd.set_option("max_rows", 10)

    symbols = symb.split(",")

    # Cache HTTP responses on disk for 3 days to avoid re-downloading.
    cache_ttl = datetime.timedelta(days=3)
    session = requests_cache.CachedSession(cache_name='cache',
                                           backend='sqlite',
                                           expire_after=cache_ttl)

    print(read(symbols, start, stop, session=session))


# Run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions