diff --git a/import_logs.py b/import_logs.py index 4b5073a..623a6ca 100755 --- a/import_logs.py +++ b/import_logs.py @@ -753,10 +753,10 @@ def _create_parser(self): option_parser.add_option( '--w3c-field-regex', action='callback', callback=functools.partial(self._set_option_map, 'w3c_field_regexes'), type='string', help="Specify a regex for a field in your W3C extended log file. You can use this option to parse fields the " - "importer does not natively recognize and then use one of the --regex-group-to-XXX-cvar options to track " - "the field in a custom variable. For example, specifying --w3c-field-regex=sc-win32-status=(?P\\S+) " - "--regex-group-to-page-cvar=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field " - "in the 'Windows Status Code' custom variable. Regexes must contain a named group." + "importer does not natively recognize and then use one of the --regex-group-to-XXX-cdim options to track " + "the field in a custom dimension. For example, specifying --w3c-field-regex=sc-win32-status=(?P\\S+) " + "--regex-group-to-page-cdim=\"win32_status=Windows Status Code\" will track the sc-win32-status IIS field " + "in the 'Windows Status Code' custom dimension. Regexes must contain a named group." ) option_parser.add_option( '--title-category-delimiter', dest='title_category_delimiter', default='/', @@ -777,22 +777,29 @@ def _create_parser(self): "disable normal user id tracking. See documentation for --log-format-regex for list of available " "regex groups." ) - option_parser.add_option( '--regex-group-to-visit-cvar', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_visit_cvars_map'), type='string', - help="Track an attribute through a custom variable with visit scope instead of through Matomo's normal " - "approach. For example, to track usernames as a custom variable instead of through the uid tracking " - "parameter, supply --regex-group-to-visit-cvar=\"userid=User Name\". 
This will track usernames in a " - "custom variable named 'User Name'. The list of available regex groups can be found in the documentation " + help="DEPRECATED" + ) + option_parser.add_option( + '--regex-group-to-page-cvar', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_page_cvars_map'), type='string', + help="DEPRECATED" + ) + option_parser.add_option( + '--regex-group-to-visit-cdim', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_visit_cdims_map'), type='string', + help="Track an attribute through a custom dimension with visit scope instead of through Matomo's normal " + "approach. For example, to track usernames as a custom dimension instead of through the uid tracking " + "parameter, supply --regex-group-to-visit-cdim=\"userid=User Name\". This will track usernames in a " + "custom dimension named 'User Name'. The list of available regex groups can be found in the documentation " "for --log-format-regex (additional regex groups you may have defined " "in --log-format-regex can also be used)." ) option_parser.add_option( - '--regex-group-to-page-cvar', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_page_cvars_map'), type='string', - help="Track an attribute through a custom variable with page scope instead of through Matomo's normal " - "approach. For example, to track usernames as a custom variable instead of through the uid tracking " - "parameter, supply --regex-group-to-page-cvar=\"userid=User Name\". This will track usernames in a " - "custom variable named 'User Name'. The list of available regex groups can be found in the documentation " + '--regex-group-to-action-cdim', action='callback', callback=functools.partial(self._set_option_map, 'regex_group_to_action_cdims_map'), type='string', + help="Track an attribute through a custom dimension with action scope instead of through Matomo's normal " + "approach. 
class CustomDimensions(object):
    """
    Utility to manage custom dimensions.

    Keeps a per-site cache that maps ``(scope, name)`` to the id of the
    matching active custom dimension, filled from the
    ``CustomDimensions.getConfiguredCustomDimensions`` API method. Dimensions
    that are requested but not cached are created through
    ``CustomDimensions.configureNewCustomDimension``.
    """
    # Retained so that code reading ``CustomDimensions.dimensions`` keeps
    # working; every instance shadows it with its own cache in __init__ so
    # that instances no longer share (and corrupt) one mutable dict.
    dimensions = {}

    def __init__(self):
        # Re-entrant because get_custom_dimension_id() calls the other
        # methods while already holding the lock.
        self.lock = threading.RLock()
        self.dimensions = {}

    def pull_dimensions(self, site_id):
        """
        Refresh the cache of active custom dimensions for ``site_id``.

        The site gets a cache entry even when it has no active dimension, so
        an empty site is not queried again on every lookup.
        """
        with self.lock:
            configured = matomo.call_api(
                'CustomDimensions.getConfiguredCustomDimensions', idSite=site_id
            )
            site_cache = self.dimensions.setdefault(int(site_id), {})
            for dimension in configured:
                if dimension['active']:
                    cache_key = (dimension['scope'], dimension['name'])
                    site_cache[cache_key] = int(dimension['idcustomdimension'])

    def create_new_dimension(self, site_id, scope, name):
        """
        Create an active custom dimension and return the raw API response.
        """
        with self.lock:
            return matomo.call_api(
                'CustomDimensions.configureNewCustomDimension',
                idSite=site_id, scope=scope, name=name, active=1
            )

    def _cached_dimension_id(self, site_id, scope, name):
        """
        Return the cached id for ``(scope, name)`` on the site, or None.
        """
        return self.dimensions.get(int(site_id), {}).get((scope, name))

    def get_custom_dimension_id(self, site_id, scope, name):
        """
        Return the id of the custom dimension ``name`` with the given scope.

        The dimension is created when the site does not have it yet. In that
        case the returned id is the ``'value'`` entry of the creation
        response.
        """
        # Fast path: no locking needed once the dimension is cached.
        dimension_id = self._cached_dimension_id(site_id, scope, name)
        if dimension_id is not None:
            return dimension_id

        with self.lock:
            if self.dimensions.get(int(site_id)) is None:
                self.pull_dimensions(site_id)

            # Look again while holding the lock: another thread may have
            # pulled or created this dimension in the meantime, and creating
            # it a second time would leave a duplicate on the site.
            dimension_id = self._cached_dimension_id(site_id, scope, name)
            if dimension_id is not None:
                return dimension_id

            dimension_id = self.create_new_dimension(site_id, scope, name)['value']
            self.pull_dimensions(site_id)
            return dimension_id
def add_action_custom_dimension(self, site_id, key, value):
    """
    Adds an action custom dimension to this Hit.
    """
    self._add_custom_dimension(site_id, key, value, 'action')


def add_visit_custom_dimension(self, site_id, key, value):
    """
    Adds a visit custom dimension to this Hit.
    """
    self._add_custom_dimension(site_id, key, value, 'visit')


def _add_custom_dimension(self, site_id, key, value, scope):
    """
    Stores value in this Hit's args under 'dimension<id>', where <id> is
    looked up for the site, scope and dimension name (key).
    """
    arg_name = 'dimension%s' % custom_dimensions.get_custom_dimension_id(site_id, scope, key)
    self.args[arg_name] = value
if config.options.regex_group_to_page_cvars_map: self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_page_cvars_map, True) if config.options.regex_group_to_visit_cvars_map: self._add_custom_vars_from_regex_groups(hit, format, config.options.regex_group_to_visit_cvars_map, False) - if config.options.regex_groups_to_ignore: - format.remove_ignored_groups(config.options.regex_groups_to_ignore) - - # Add http method page cvar - try: - httpmethod = format.get('method') - if config.options.track_http_method and httpmethod != '-': - hit.add_page_custom_var('HTTP-method', httpmethod) - except: - pass - try: hit.query_string = format.get('query_string') hit.path = hit.full_path @@ -2520,6 +2585,22 @@ def filtered_line(line, reason): if timezone: hit.date -= datetime.timedelta(hours=timezone/100) + site_id, main_url = resolver.resolve(hit) + + if config.options.regex_group_to_action_cdims_map: + self._add_custom_dimension_from_regex_groups(site_id, hit, format, config.options.regex_group_to_action_cdims_map, 'action') + + if config.options.regex_group_to_visit_cdims_map: + self._add_custom_dimension_from_regex_groups(site_id, hit, format, config.options.regex_group_to_visit_cdims_map, 'visit') + + # Add http method page custom dimension + try: + httpmethod = format.get('method') + if config.options.track_http_method and httpmethod != '-': + hit.add_action_custom_dimension(site_id, 'HTTP-method', httpmethod) + except: + pass + if config.options.replay_tracking: # we need a query string and we only consider requests with piwik.php if not hit.query_string or not hit.path.lower().endswith(config.options.replay_tracking_expected_tracker_file): @@ -2566,6 +2647,21 @@ def _add_custom_vars_from_regex_groups(self, hit, format, groups, is_page_var): else: hit.add_visit_custom_var(custom_var_name, value) + def _add_custom_dimension_from_regex_groups(self, site_id, hit, format, groups, scope): + for group_name, custom_dim_name in groups.iteritems(): + if 
group_name in format.get_all(): + value = format.get(group_name) + + # don't track the '-' empty placeholder value + if value == '-': + continue + + if scope == 'action': + hit.add_action_custom_dimension(site_id, custom_dim_name, value) + else: + hit.add_visit_custom_dimension(site_id, custom_dim_name, value) + + def main(): """ Start the importing process. @@ -2613,6 +2709,7 @@ def fatal_error(error, filename=None, lineno=None): stats = Statistics() resolver = config.get_resolver() parser = Parser() + custom_dimensions = CustomDimensions() main() sys.exit(0) except KeyboardInterrupt: diff --git a/tests/tests.py b/tests/tests.py index 2b7c089..f78b31e 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -74,7 +74,7 @@ def _test(format_name, log_file = None): def _test_junk(format_name, log_file = None): if log_file is None: log_file = 'logs/%s.log' % format_name - + tmp_path = add_junk_to_file(log_file) file = open(tmp_path) @@ -185,6 +185,8 @@ class Options(object): w3c_field_regexes = {} regex_group_to_visit_cvars_map = {} regex_group_to_page_cvars_map = {} + regex_group_to_visit_cdims_map = {} + regex_group_to_action_cdims_map = {} regex_groups_to_ignore = None replay_tracking_expected_tracker_file = 'piwik.php' debug_request_limit = None @@ -206,6 +208,9 @@ class Resolver(object): def check_format(self, format_): pass + def resolve(self, hit): + return 1, "https://example.org/" + class Recorder(object): """Mock recorder which collects hits but doesn't put their in database.""" recorders = [] @@ -214,6 +219,16 @@ class Recorder(object): def add_hits(cls, hits): cls.recorders.extend(hits) +import_logs.custom_dimensions = import_logs.CustomDimensions() +import_logs.custom_dimensions.dimensions[1] = { + ('visit', 'User Name'): 1, + ('visit', 'The Date'): 2, + ('action', 'Generation Time'): 3, + ('action', 'The Referrer'): 4, + ('action', 'HTTP-method'): 5 +} + + def test_replay_tracking_arguments(): """Test data parsing from sample log file.""" file_ = 
'logs/logs_to_tests.log' @@ -456,7 +471,7 @@ def test_iis_custom_format(): assert hits[0]['extension'] == u'/products/theproduct' assert hits[0]['is_download'] == False assert hits[0]['referrer'] == u'http://example.com/Search/SearchResults.pg?informationRecipient.languageCode.c=en' - assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[0]['args'] == {'dimension5': 'GET'} assert hits[0]['generation_time_milli'] == 109 assert hits[0]['host'] == 'foo' assert hits[0]['filename'] == 'logs/iis_custom.log' @@ -475,7 +490,7 @@ def test_iis_custom_format(): assert hits[1]['extension'] == u'/topic/hw43061' assert hits[1]['is_download'] == False assert hits[1]['referrer'] == '' - assert hits[1]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[1]['args'] == {'dimension5': 'GET'} assert hits[1]['generation_time_milli'] == 0 assert hits[1]['host'] == 'foo' assert hits[1]['filename'] == 'logs/iis_custom.log' @@ -494,7 +509,7 @@ def test_iis_custom_format(): assert hits[2]['extension'] == u'/hello/world/6,681965' assert hits[2]['is_download'] == False assert hits[2]['referrer'] == '' - assert hits[2]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[2]['args'] == {'dimension5': 'GET'} assert hits[2]['generation_time_milli'] == 359 assert hits[2]['host'] == 'foo' assert hits[2]['filename'] == 'logs/iis_custom.log' @@ -532,7 +547,7 @@ def test_netscaler_parsing(): assert hits[0]['extension'] == u'jsp' assert hits[0]['is_download'] == False assert hits[0]['referrer'] == '' - assert hits[0]['args'] == {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[0]['args'] == {'dimension5': 'GET'} assert hits[0]['generation_time_milli'] == 1000 assert hits[0]['host'] == 'foo' assert hits[0]['filename'] == 'logs/netscaler.log' @@ -650,7 +665,7 @@ def test_amazon_cloudfront_web_parsing(): assert hits[0]['extension'] == u'html' assert hits[0]['is_download'] == False assert hits[0]['referrer'] == u'www.displaymyfiles.com' - assert hits[0]['args'] 
== {'cvar': {1: ['HTTP-method', 'GET']}} + assert hits[0]['args'] == {'dimension5': 'GET'} assert hits[0]['generation_time_milli'] == 1.0 assert hits[0]['host'] == 'foo' assert hits[0]['filename'] == 'logs/amazon_cloudfront_web.log' @@ -798,8 +813,8 @@ def test_ignore_groups_option_removes_groups(): assert hits[0]['userid'] == None assert hits[0]['generation_time_milli'] == 0 -def test_regex_group_to_custom_var_options(): - """Test that the --regex-group-to-visit-cvar and --regex-group-to-page-cvar track regex groups to custom vars.""" +def test_regex_group_to_custom_dimensions_options(): + """Test that the --regex-group-to-visit-cdim and --regex-group-to-action-cdim track regex groups to custom vars.""" file_ = 'logs/iis.log' @@ -813,20 +828,22 @@ def test_regex_group_to_custom_var_options(): import_logs.config.options.replay_tracking = False import_logs.config.options.w3c_time_taken_in_millisecs = True import_logs.config.options.regex_groups_to_ignore = set() - import_logs.config.options.regex_group_to_visit_cvars_map = { + import_logs.config.options.regex_group_to_visit_cdims_map = { 'userid': "User Name", 'date': "The Date" } - import_logs.config.options.regex_group_to_page_cvars_map = { - 'generation_time_milli': 'Geneartion Time', + import_logs.config.options.regex_group_to_action_cdims_map = { + 'generation_time_milli': 'Generation Time', 'referrer': 'The Referrer' } import_logs.parser.parse(file_) hits = [hit.__dict__ for hit in Recorder.recorders] - assert hits[0]['args']['_cvar'] == {1: ['The Date', '2012-04-01 00:00:13'], 2: ['User Name', 'theuser']} # check visit custom vars - assert hits[0]['args']['cvar'] == {1: ['Geneartion Time', '1687'], 2: ['HTTP-method', 'GET']} # check page custom vars + assert hits[0]['args']['dimension1'] == 'theuser' + assert hits[0]['args']['dimension2'] == '2012-04-01 00:00:13' + assert hits[0]['args']['dimension3'] == '1687' + assert hits[0]['args']['dimension5'] == 'GET' assert hits[0]['userid'] == 'theuser' assert 
hits[0]['date'] == datetime.datetime(2012, 4, 1, 0, 0, 13) @@ -877,8 +894,8 @@ def test_custom_log_date_format_option(): Recorder.recorders = [] import_logs.parser = import_logs.Parser() import_logs.config.options.w3c_field_regexes = None - import_logs.config.options.regex_group_to_visit_cvars_map = None - import_logs.config.options.regex_group_to_page_cvars_map = None + import_logs.config.options.regex_group_to_visit_cdims_map = None + import_logs.config.options.regex_group_to_action_cdims_map = None import_logs.config.options.log_format_regex = ( '(?P\S+)\s+\S+\s+\S+\s+\[(?P.*?)\]\s+' '"\S+\s+(?P.*?)\s+\S+"\s+(?P\S+)\s+(?P\S+)'