Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 120 additions & 29 deletions import_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -970,10 +970,10 @@ def _create_parser(self):
parser.add_argument(
'--w3c-field-regex', action=StoreDictKeyPair, metavar='KEY=VAL', default={}, dest="w3c_field_regexes", type=str,
help="Specify a regex for a field in your W3C extended log file. You can use this option to parse fields the "
"importer does not natively recognize and then use one of the --regex-group-to-XXX-cvar options to track "
"the field in a custom variable. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) "
"--regex-group-to-page-cvar=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field "
"in the 'Windows Status Code' custom variable. Regexes must contain a named group."
"importer does not natively recognize and then use one of the --regex-group-to-XXX-cdim options to track "
"the field in a custom dimension. For example, specifying --w3c-field-regex=sc-win32-status=(?P<win32_status>\\S+) "
"--regex-group-to-page-cdim=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field "
"in the 'Windows Status Code' custom dimension. Regexes must contain a named group."
)
parser.add_argument(
'--title-category-delimiter', dest='title_category_delimiter', default='/',
Expand All @@ -994,22 +994,29 @@ def _create_parser(self):
"disable normal user id tracking. See documentation for --log-format-regex for list of available "
"regex groups."
)

parser.add_argument(
'--regex-group-to-visit-cvar', action=StoreDictKeyPair, metavar='KEY=VAL',dest='regex_group_to_visit_cvars_map', default={},
help="Track an attribute through a custom variable with visit scope instead of through Matomo's normal "
"approach. For example, to track usernames as a custom variable instead of through the uid tracking "
"parameter, supply --regex-group-to-visit-cvar=\"userid=User Name\". This will track usernames in a "
"custom variable named 'User Name'. The list of available regex groups can be found in the documentation "
'--regex-group-to-visit-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_visit_cvars_map', default={},
help="DEPRECATED"
)
parser.add_argument(
'--regex-group-to-page-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_page_cvars_map', default={},
help="DEPRECATED"
)
parser.add_argument(
'--regex-group-to-visit-cdim', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_visit_cdims_map', default={},
help="Track an attribute through a custom dimension with visit scope instead of through Matomo's normal "
"approach. For example, to track usernames as a custom dimension instead of through the uid tracking "
"parameter, supply --regex-group-to-visit-cdim=\"userid=User Name\". This will track usernames in a "
"custom dimension named 'User Name'. The list of available regex groups can be found in the documentation "
"for --log-format-regex (additional regex groups you may have defined "
"in --log-format-regex can also be used)."
)
parser.add_argument(
'--regex-group-to-page-cvar', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_page_cvars_map', default={},
help="Track an attribute through a custom variable with page scope instead of through Matomo's normal "
"approach. For example, to track usernames as a custom variable instead of through the uid tracking "
"parameter, supply --regex-group-to-page-cvar=\"userid=User Name\". This will track usernames in a "
"custom variable named 'User Name'. The list of available regex groups can be found in the documentation "
'--regex-group-to-action-cdim', action=StoreDictKeyPair, metavar='KEY=VAL', dest='regex_group_to_action_cdims_map', default={},
help="Track an attribute through a custom dimension with action scope instead of through Matomo's normal "
"approach. For example, to track usernames as a custom dimension instead of through the uid tracking "
"parameter, supply --regex-group-to-action-cdim=\"userid=User Name\". This will track usernames in a "
"custom dimension named 'User Name'. The list of available regex groups can be found in the documentation "
"for --log-format-regex (additional regex groups you may have defined "
"in --log-format-regex can also be used)."
)
Expand Down Expand Up @@ -1942,6 +1949,49 @@ def check_format(self, format):
"specify the Matomo site ID with the --idsite argument"
)


class CustomDimensions:
    """
    Utility to manage custom dimensions.

    Keeps an in-memory cache that maps ``(scope, name)`` to the numeric custom
    dimension ID for every site that has been looked up, and creates missing
    dimensions through the Matomo API on demand.
    """

    def __init__(self):
        # Reentrant lock: get_custom_dimension_id() holds it while calling
        # pull_dimensions() / create_new_dimension(), which acquire it again.
        self.lock = threading.RLock()
        # Per-instance cache: {site_id (int): {(scope, name): dimension_id (int)}}.
        # Kept on the instance so separate instances never share state.
        self.dimensions = {}

    def pull_dimensions(self, site_id):
        """
        Fetch the custom dimensions configured for ``site_id`` and cache the
        active ones under ``self.dimensions[int(site_id)]``.

        Existing cache entries for the site are kept; entries returned by the
        API overwrite entries with the same ``(scope, name)`` key.
        """
        with self.lock:
            configured = matomo.call_api(
                'CustomDimensions.getConfiguredCustomDimensions', idSite=site_id
            )
            # Register the site even when it has no active dimension, so that
            # the "site not loaded yet" check does not trigger another fetch.
            site_dimensions = self.dimensions.setdefault(int(site_id), {})
            for dimension in configured:
                if dimension['active']:
                    cache_key = (dimension['scope'], dimension['name'])
                    site_dimensions[cache_key] = int(dimension['idcustomdimension'])

    def create_new_dimension(self, site_id, scope, name):
        """
        Ask Matomo to configure a new, active custom dimension.

        Returns the raw API response of
        ``CustomDimensions.configureNewCustomDimension``; the caller in this
        class reads the new ID from its ``'value'`` entry.
        """
        with self.lock:
            return matomo.call_api(
                'CustomDimensions.configureNewCustomDimension',
                idSite=site_id, scope=scope, name=name, active=1
            )

    def get_custom_dimension_id(self, site_id, scope, name):
        """
        Return the ID of the custom dimension ``name`` with the given ``scope``
        for ``site_id``, creating the dimension if it is not configured yet.

        ``site_id`` may be an int or a numeric string; it is normalised with
        ``int()`` for cache lookups.
        """
        site_key = int(site_id)
        cache_key = (scope, name)

        # Fast path: an already cached ID needs no API call and no locking.
        dimension_id = self.dimensions.get(site_key, {}).get(cache_key)
        if dimension_id:
            return dimension_id

        with self.lock:
            if self.dimensions.get(site_key) is None:
                self.pull_dimensions(site_id)

            # Look again while holding the lock: another thread may have
            # loaded or created this dimension in the meantime, and creating
            # it a second time would configure a duplicate dimension.
            dimension_id = self.dimensions.get(site_key, {}).get(cache_key)
            if dimension_id:
                return dimension_id

            dimension_id = self.create_new_dimension(site_id, scope, name)['value']
            # Refresh the cache so later lookups find the new dimension.
            self.pull_dimensions(site_id)
            return dimension_id


class Recorder:
"""
A Recorder fetches hits from the Queue and inserts them into Matomo using
Expand Down Expand Up @@ -2070,11 +2120,11 @@ def _get_hit_args(self, hit):
# handle custom variables before generating args dict
if config.options.enable_bots:
if hit.is_robot:
hit.add_visit_custom_var("Bot", hit.user_agent)
hit.add_visit_custom_dimension(site_id, "Bot", hit.user_agent)
else:
hit.add_visit_custom_var("Not-Bot", hit.user_agent)
hit.add_visit_custom_dimension(site_id, "Not-Bot", hit.user_agent)

hit.add_page_custom_var("HTTP-code", hit.status)
hit.add_action_custom_dimension(site_id, "HTTP-code", hit.status)

args = {
'rec': '1',
Expand Down Expand Up @@ -2254,6 +2304,22 @@ def get_visitor_id_hash(self):

return abs(hash(visitor_id))

def add_action_custom_dimension(self, site_id, key, value):
    """
    Adds an action-scoped custom dimension to this Hit.

    Thin wrapper around _add_custom_dimension() with the scope fixed
    to 'action'.
    """
    self._add_custom_dimension(site_id, key, value, scope='action')

def add_visit_custom_dimension(self, site_id, key, value):
    """
    Adds a visit-scoped custom dimension to this Hit.

    Thin wrapper around _add_custom_dimension() with the scope fixed
    to 'visit'.
    """
    self._add_custom_dimension(site_id, key, value, scope='visit')

def _add_custom_dimension(self, site_id, key, value, scope):
    """
    Stores ``value`` in this Hit's args under the 'dimension<ID>' entry.

    The ID is looked up through the module-level ``custom_dimensions``
    helper, using ``key`` as the dimension name and ``scope`` as its scope.
    """
    resolved_id = custom_dimensions.get_custom_dimension_id(site_id, scope, key)
    arg_name = 'dimension%s' % resolved_id
    self.args[arg_name] = value

def add_page_custom_var(self, key, value):
"""
Adds a page custom variable to this Hit.
Expand Down Expand Up @@ -2597,23 +2663,16 @@ def filtered_line(line, reason):
args={},
)

if config.options.regex_groups_to_ignore:
format.remove_ignored_groups(config.options.regex_groups_to_ignore)

# FIXME: custom variables are deprecated...
if config.options.regex_group_to_page_cvars_map:
self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_page_cvars_map, True)

if config.options.regex_group_to_visit_cvars_map:
self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_visit_cvars_map, False)

if config.options.regex_groups_to_ignore:
format.remove_ignored_groups(config.options.regex_groups_to_ignore)

# Add http method page cvar
try:
httpmethod = format.get('method')
if config.options.track_http_method and httpmethod != '-':
hit.add_page_custom_var('HTTP-method', httpmethod)
except:
pass

try:
hit.query_string = format.get('query_string')
hit.path = hit.full_path
Expand Down Expand Up @@ -2729,6 +2788,22 @@ def filtered_line(line, reason):
invalid_line(line, 'invalid timezone')
continue

site_id, main_url = resolver.resolve(hit)

if config.options.regex_group_to_action_cdims_map:
self._add_custom_dimension_from_regex_groups(site_id, hit, format, config.options.regex_group_to_action_cdims_map, 'action')

if config.options.regex_group_to_visit_cdims_map:
self._add_custom_dimension_from_regex_groups(site_id, hit, format, config.options.regex_group_to_visit_cdims_map, 'visit')

# Add the HTTP method as an action-scoped custom dimension
try:
httpmethod = format.get('method')
if config.options.track_http_method and httpmethod != '-':
hit.add_action_custom_dimension(site_id, 'HTTP-method', httpmethod)
except:
pass

if config.options.replay_tracking:
# we need a query string and we only consider requests with piwik.php
if not hit.query_string or not self.is_hit_for_tracker(hit):
Expand Down Expand Up @@ -2787,6 +2862,21 @@ def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var):
else:
hit.add_visit_custom_var(custom_var_name, value)

def _add_custom_dimension_from_regex_groups(self, site_id, hit, format, groups, scope):
    """
    Track matched regex groups of the current log line as custom dimensions.

    ``groups`` maps a regex group name to the custom dimension name it should
    be tracked under. Groups that the format did not provide, and groups
    whose value is the '-' placeholder, are skipped. With ``scope`` equal to
    'action' the values are added as action dimensions; any other scope adds
    them as visit dimensions.
    """
    # The set of available groups does not change while iterating, so it is
    # fetched once instead of once per configured group.
    available_groups = format.get_all()

    if scope == 'action':
        add_dimension = hit.add_action_custom_dimension
    else:
        add_dimension = hit.add_visit_custom_dimension

    for group_name, custom_dim_name in groups.items():
        if group_name not in available_groups:
            continue

        value = format.get(group_name)

        # don't track the '-' empty placeholder value
        if value == '-':
            continue

        add_dimension(site_id, custom_dim_name, value)


def main():
"""
Start the importing process.
Expand Down Expand Up @@ -2834,6 +2924,7 @@ def fatal_error(error, filename=None, lineno=None):
stats = Statistics()
resolver = config.get_resolver()
parser = Parser()
custom_dimensions = CustomDimensions()
main()
sys.exit(0)
except KeyboardInterrupt:
Expand Down
49 changes: 32 additions & 17 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ def __init__(self):
self.w3c_field_regexes = {}
self.regex_group_to_visit_cvars_map = {}
self.regex_group_to_page_cvars_map = {}
self.regex_group_to_visit_cdims_map = {}
self.regex_group_to_action_cdims_map = {}
self.regex_groups_to_ignore = None
self.replay_tracking_expected_tracker_file = 'piwik.php'
self.debug_request_limit = None
Expand All @@ -202,6 +204,9 @@ class Resolver(object):
def check_format(self, format_):
pass

def resolve(self, hit):
    """Mock resolution: every hit maps to site 1 at https://example.org/."""
    site_id = 1
    main_url = "https://example.org/"
    return site_id, main_url

class Recorder(object):
"""Mock recorder which collects hits but doesn't put them in the database."""
recorders = []
Expand All @@ -210,6 +215,16 @@ class Recorder(object):
def add_hits(cls, hits):
cls.recorders.extend(hits)

# Seed the dimension-ID cache for site 1 so that lookups of these
# (scope, name) pairs are answered from the cache during the tests.
_seeded_dimensions = import_logs.CustomDimensions()
_seeded_dimensions.dimensions[1] = {
    ('visit', 'User Name'): 1,
    ('visit', 'The Date'): 2,
    ('action', 'Generation Time'): 3,
    ('action', 'The Referrer'): 4,
    ('action', 'HTTP-method'): 5,
}
import_logs.custom_dimensions = _seeded_dimensions


def test_replay_tracking_seconds_to_add_to_date():
"""Test data parsing from sample log file."""
file_ = 'logs/logs_to_tests.log'
Expand Down Expand Up @@ -553,7 +568,7 @@ def test_iis_custom_format():
assert hits[0]['extension'] == '/products/theproduct'
assert hits[0]['is_download'] == False
assert hits[0]['referrer'] == 'http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en'
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[0]['args'] == {'dimension5': 'GET'}
assert hits[0]['generation_time_milli'] == 109
assert hits[0]['host'] == 'foo'
assert hits[0]['filename'] == 'logs/iis_custom.log'
Expand All @@ -572,7 +587,7 @@ def test_iis_custom_format():
assert hits[1]['extension'] == '/topic/hw43061'
assert hits[1]['is_download'] == False
assert hits[1]['referrer'] == ''
assert hits[1]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[1]['args'] == {'dimension5': 'GET'}
assert hits[1]['generation_time_milli'] == 0
assert hits[1]['host'] == 'foo'
assert hits[1]['filename'] == 'logs/iis_custom.log'
Expand All @@ -591,7 +606,7 @@ def test_iis_custom_format():
assert hits[2]['extension'] == '/hello/world/6,681965'
assert hits[2]['is_download'] == False
assert hits[2]['referrer'] == ''
assert hits[2]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[2]['args'] == {'dimension5': 'GET'}
assert hits[2]['generation_time_milli'] == 359
assert hits[2]['host'] == 'foo'
assert hits[2]['filename'] == 'logs/iis_custom.log'
Expand Down Expand Up @@ -629,7 +644,7 @@ def test_netscaler_parsing():
assert hits[0]['extension'] == 'jsp'
assert hits[0]['is_download'] == False
assert hits[0]['referrer'] == ''
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[0]['args'] == {'dimension5': 'GET'}
assert hits[0]['generation_time_milli'] == 1000
assert hits[0]['host'] == 'foo'
assert hits[0]['filename'] == 'logs/netscaler.log'
Expand Down Expand Up @@ -827,7 +842,7 @@ def test_amazon_cloudfront_web_parsing():
assert hits[0]['extension'] == 'html'
assert hits[0]['is_download'] == False
assert hits[0]['referrer'] == 'https://example.com/'
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}}
assert hits[0]['args'] == {'dimension5': 'GET'}
assert hits[0]['generation_time_milli'] == 1.0
assert hits[0]['host'] == 'foo'
assert hits[0]['filename'] == 'logs/amazon_cloudfront_web.log'
Expand Down Expand Up @@ -975,7 +990,7 @@ def test_incapsulaw3c_parsing():
assert hits[0]['extension'] == 'php'
assert hits[0]['is_download'] == False
assert hits[0]['referrer'] == u''
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', u'"GET"']}}
assert hits[0]['args'] == {'dimension5': '"GET"'}
assert hits[0]['length'] == 10117
assert hits[0]['generation_time_milli'] == 0
assert hits[0]['host'] == 'www.example.com'
Expand All @@ -996,7 +1011,7 @@ def test_incapsulaw3c_parsing():
assert hits[1]['extension'] == '/rss/news'
assert hits[1]['is_download'] == False
assert hits[1]['referrer'] == u''
assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', u'"GET"']}}
assert hits[0]['args'] == {'dimension5': '"GET"'}
assert hits[1]['length'] == 0
assert hits[1]['generation_time_milli'] == 0
assert hits[1]['host'] == 'www.example.com'
Expand Down Expand Up @@ -1121,8 +1136,8 @@ def test_ignore_groups_option_removes_groups():
assert hits[0]['userid'] == None
assert hits[0]['generation_time_milli'] == 0

def test_regex_group_to_custom_var_options():
"""Test that the --regex-group-to-visit-cvar and --regex-group-to-page-cvar track regex groups to custom vars."""
def test_regex_group_to_custom_dimensions_options():
"""Test that the --regex-group-to-visit-cdim and --regex-group-to-action-cdim options track regex groups to custom dimensions."""

file_ = 'logs/iis.log'

Expand All @@ -1136,22 +1151,22 @@ def test_regex_group_to_custom_var_options():
import_logs.config.options.replay_tracking = False
import_logs.config.options.w3c_time_taken_in_millisecs = True
import_logs.config.options.regex_groups_to_ignore = set()
import_logs.config.options.regex_group_to_visit_cvars_map = {
import_logs.config.options.regex_group_to_visit_cdims_map = {
'userid': "User Name",
'date': "The Date"
}
import_logs.config.options.regex_group_to_page_cvars_map = {
import_logs.config.options.regex_group_to_action_cdims_map = {
'generation_time_milli': 'Generation Time',
'referrer': 'The Referrer'
}
import_logs.parser.parse(file_)

hits = [hit.__dict__ for hit in Recorder.recorders]

assert ['The Date', '2012-04-01 00:00:13'] in hits[0]['args']['_cvar'].values()
assert ['User Name', 'theuser'] in hits[0]['args']['_cvar'].values()
assert ['Generation Time', '1687'] in hits[0]['args']['cvar'].values()
assert ['HTTP-method', 'GET'] in hits[0]['args']['cvar'].values()
assert hits[0]['args']['dimension1'] == 'theuser'
assert hits[0]['args']['dimension2'] == '2012-04-01 00:00:13'
assert hits[0]['args']['dimension3'] == '1687'
assert hits[0]['args']['dimension5'] == 'GET'

assert hits[0]['userid'] == 'theuser'
assert hits[0]['date'] == datetime.datetime(2012, 4, 1, 0, 0, 13)
Expand Down Expand Up @@ -1202,8 +1217,8 @@ def test_custom_log_date_format_option():
Recorder.recorders = []
import_logs.parser = import_logs.Parser()
import_logs.config.options.w3c_field_regexes = None
import_logs.config.options.regex_group_to_visit_cvars_map = None
import_logs.config.options.regex_group_to_page_cvars_map = None
import_logs.config.options.regex_group_to_visit_cdims_map = None
import_logs.config.options.regex_group_to_action_cdims_map = None
import_logs.config.options.log_format_regex = (
r'(?P<ip>\S+)\s+\S+\s+\S+\s+\[(?P<date>.*?)\]\s+'
r'"\S+\s+(?P<path>.*?)\s+\S+"\s+(?P<status>\S+)\s+(?P<length>\S+)'
Expand Down
Loading