diff --git a/MatomoTracker.php b/MatomoTracker.php
index b40606b..c2bbd28 100644
--- a/MatomoTracker.php
+++ b/MatomoTracker.php
@@ -32,6 +32,21 @@ class MatomoTracker
      */
     static public $URL = '';

+    /**
+     * Substrings that mark a user agent as belonging to a known AI bot.
+     * isUserAgentAIBot() looks for them case-insensitively via stripos().
+     */
+    public const AI_BOT_USER_AGENT_SUBSTRINGS = [
+        'ChatGPT-User',
+        'MistralAI-User',
+        'Gemini-Deep-Research',
+        'Claude-User',
+        'Perplexity-User',
+        'Google-NotebookLM',
+        'GPTBot',
+        'Devin',
+    ];
+
     /**
      * API Version
      *
@@ -161,11 +171,11 @@ class MatomoTracker

     // Visitor Ids in order
     public $userId = false;
-    
+
     public $forcedVisitorId = false;
-    
+
     public $cookieVisitorId = false;
-    
+
     public $randomVisitorId = false;

     public $configCookiesDisabled = false;
@@ -186,11 +196,11 @@ class MatomoTracker

     // Allow debug while blocking the request
     public $requestTimeout = 600;
-    
+
     public $requestConnectTimeout = 300;
-    
+
     public $doBulkRequests = false;
-    
+
     public $storedTrackingActions = [];

     public $sendImageResponse = true;
@@ -238,7 +248,7 @@ public function __construct(int $idSite, string $apiUrl = '')

         $this->currentTs = time();
         $this->createTs = $this->currentTs;
-        
+
         $this->visitorCustomVar = $this->getCustomVariablesFromCookie();
     }

@@ -735,7 +745,7 @@ public function enableBulkTracking(): void
     }

     /**
-     * Disables the bulk request feature. Make sure to call `doBulkTrack()` before disabling it if you have stored 
+     * Disables the bulk request feature. Make sure to call `doBulkTrack()` before disabling it if you have stored
      * tracking actions previously as this method won't be sending any previously stored actions before disabling it.
      */
     public function disableBulkTracking(): void
@@ -833,6 +843,24 @@ public function doTrackPageView(string $documentTitle)
         return $this->sendRequest($url);
     }

+    /**
+     * If the current user agent belongs to an AI agent bot, tracks a pageview action.
+     *
+     * This method should be used server side to track AI bots that do not execute
+     * JavaScript.
+     *
+     * @return mixed Response string, true if using bulk requests, or null if the user agent is not an AI bot.
+     */
+    public function doTrackPageViewIfAIBot(?int $httpStatus = null, ?int $responseSizeBytes = null, ?int $serverTimeMs = null, ?string $source = null)
+    {
+        if (!self::isUserAgentAIBot($this->userAgent)) {
+            return null;
+        }
+
+        $url = $this->getUrlTrackAIBot($httpStatus, $responseSizeBytes, $serverTimeMs, $source);
+        return $this->sendRequest($url);
+    }
+
     /**
      * Override PageView id for every use of `doTrackPageView()`. Do not use this if you call `doTrackPageView()`
      * multiple times during tracking (if, for example, you are tracking a single page application).
@@ -847,7 +875,7 @@ public function setPageviewId(string $idPageview): void
      * Returns the PageView id. If the id was manually set using `setPageViewId()`, that id will be returned.
      * If the id was not set manually, the id that was automatically generated in last `doTrackPageView()` will
      * be returned. If there was no last page view, this will be false.
-     * 
+     *
      * @return string|false The PageView id as string or false if there is none yet.
      */
     public function getPageviewId()
@@ -891,7 +919,7 @@ public function doTrackEvent(
     public function doTrackContentImpression(
         string $contentName,
         string $contentPiece = 'Unknown',
-        $contentTarget = false 
+        $contentTarget = false
     ) {
         $url = $this->getUrlTrackContentImpression($contentName, $contentPiece, $contentTarget);

@@ -1215,6 +1243,40 @@ private function forceDotAsSeparatorForDecimalPoint($value): string
         return str_replace(',', '.', $value);
     }

+    /**
+     * Builds a URL to track a request from an AI bot. Arguments left as null are omitted from the URL; 0 is sent as-is.
+     *
+     * @param int|null $httpStatus the request's HTTP status code, if it is known.
+     * @param int|null $responseSizeBytes the size of the response sent to the AI bot, if known.
+     * @param int|null $serverTimeMs the number of milliseconds it took to process the request, if known.
+     * @param string|null $source value for the `source` tracking parameter (URL-encoded here); null or '' omits it.
+     * @return string the tracking URL, always carrying `recMode=1` plus whichever optional parameters were given.
+     */
+    public function getUrlTrackAIBot(?int $httpStatus = null, ?int $responseSizeBytes = null, ?int $serverTimeMs = null, ?string $source = null): string
+    {
+        $url = $this->getRequest($this->idSite);
+
+        $url .= '&recMode=1';
+
+        if ($httpStatus !== null) {
+            $url .= '&http_status=' . $httpStatus;
+        }
+
+        if ($responseSizeBytes !== null) {
+            $url .= '&bw_bytes=' . $responseSizeBytes;
+        }
+
+        if ($serverTimeMs !== null) {
+            $url .= '&pf_srv=' . $serverTimeMs;
+        }
+
+        if ($source !== null && $source !== '') {
+            $url .= '&source=' . rawurlencode($source);
+        }
+
+        return $url;
+    }
+
     /**
      * Returns URL used to track Ecommerce Cart updates
      * Calling this function will reinitializes the property ecommerceItems to empty array
@@ -1362,7 +1424,7 @@ public function getUrlTrackEvent(
     public function getUrlTrackContentImpression(
         string $contentName,
         string $contentPiece,
-        $contentTarget 
+        $contentTarget
     ): string {
         $url = $this->getRequest($this->idSite);

@@ -1876,7 +1938,7 @@ public function setRequestTimeout(int $timeout)

         return $this;
     }
-    
+
     /**
      * Returns the maximum number of seconds the tracker will spend trying to connect to Matomo.
      * Defaults to 300 seconds.
@@ -1904,7 +1966,7 @@ public function setRequestConnectTimeout(int $timeout)
         return $this;
     }

-     /**
+    /**
      * Sets the request method to POST, which is recommended when using setTokenAuth()
      * to prevent the token from being recorded in server logs. Avoid using redirects
      * when using POST to prevent the loss of POST values.
When using Log Analytics, @@ -1957,7 +2019,7 @@ private function getProxy(): ?string protected function prepareCurlOptions( string $url, string $method, - $data, + $data, bool $forcePostUrlEncoded ): array { $options = [ @@ -2374,7 +2436,7 @@ protected static function getCurrentScriptName(): string if (empty($url) && isset($_SERVER['SCRIPT_NAME'])) { $url = $_SERVER['SCRIPT_NAME']; } elseif (empty($url)) { - $url = '/'; + $url = '/'; } if (!empty($url) && $url[0] !== '/') { @@ -2443,9 +2505,9 @@ protected static function getCurrentQueryString(): string protected static function getCurrentUrl(): string { return self::getCurrentScheme() . '://' - . self::getCurrentHost() - . self::getCurrentScriptName() - . self::getCurrentQueryString(); + . self::getCurrentHost() + . self::getCurrentScriptName() + . self::getCurrentQueryString(); } /** @@ -2576,6 +2638,26 @@ protected function parseIncomingCookies(array $headers): void } } } + + /** + * Returns true if the given user agent belongs to a known AI bot. 
+     *
+     * @param string $userAgent
+     * @return bool
+     */
+    public static function isUserAgentAIBot(string $userAgent): bool
+    {
+        if (empty($userAgent)) {
+            return false;
+        }
+
+        foreach (self::AI_BOT_USER_AGENT_SUBSTRINGS as $substring) {
+            if (stripos($userAgent, $substring) !== false) {
+                return true;
+            }
+        }
+        return false;
+    }
 }

 /**
diff --git a/tests/Unit/MatomoTrackerTest.php b/tests/Unit/MatomoTrackerTest.php
index 75b1669..a607598 100644
--- a/tests/Unit/MatomoTrackerTest.php
+++ b/tests/Unit/MatomoTrackerTest.php
@@ -85,6 +85,98 @@ public function test_setApiUrl()
         $this->assertSame(substr($url, 0, strlen($newApiUrl)), $newApiUrl);
     }

+    /**
+     * @dataProvider getTestDataForIsUserAgentAIBot
+     */
+    public function test_isUserAgentAIBot(string $userAgent, bool $expected)
+    {
+        $this->assertSame($expected, \MatomoTracker::isUserAgentAIBot($userAgent));
+    }
+
+    public static function getTestDataForIsUserAgentAIBot(): array
+    {
+        return [
+            ['', false],
+
+            ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.3', false],
+            ['Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.3', false],
+
+            ['Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot', true],
+            ['Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.1; +https://openai.com/gptbot', true],
+            ['Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; MistralAI-User/1.0; +https://docs.mistral.ai/robots)', true],
+            ['Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Gemini-Deep-Research; +https://gemini.google/overview/deep-research/) Chrome/135.0.0.0 Safari/537.36', true],
+            ['Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Claude-User/1.0; +Claude-User@anthropic.com)', true],
+            ['Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Perplexity-User/1.0; +https://perplexity.ai/perplexity-user)', true],
+            ['Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36; Devin/1.0; +devin.ai', true],
+        ];
+    }
+
+    /**
+     * @dataProvider getTestDataForGetUrlTrackAIBot
+     */
+    public function test_getUrlTrackAIBot(?int $httpStatus, ?int $responseSizeBytes, ?int $serverTimeMs, ?string $source, string $expected)
+    {
+        $_SERVER['HTTP_USER_AGENT'] = 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot';
+
+        $tracker = new \MatomoTracker(1, $apiUrl = self::TEST_URL);
+        $tracker->setUrl('https://example.com/page');
+        $tracker->setVisitorId('abcdef01234517ab');
+
+        $actual = $tracker->getUrlTrackAIBot($httpStatus, $responseSizeBytes, $serverTimeMs, $source);
+        $actual = $this->normalizeTrackingUrl($actual);
+
+        $this->assertSame($expected, $actual);
+    }
+
+    public static function getTestDataForGetUrlTrackAIBot(): array
+    {
+        return [
+            [
+                200,
+                34567,
+                123,
+                'wordpress',
+                'http://mymatomo.com/matomo.php?idsite=1&rec=1&apiv=1&r=&r=&cid=abcdef01234517ab&url=https%3A%2F%2Fexample.com%2Fpage&urlref=&recMode=1&http_status=200&bw_bytes=34567&pf_srv=123&source=wordpress',
+            ],
+
+            [
+                null,
+                34567,
+                null,
+                'something else',
+                'http://mymatomo.com/matomo.php?idsite=1&rec=1&apiv=1&r=&r=&cid=abcdef01234517ab&url=https%3A%2F%2Fexample.com%2Fpage&urlref=&recMode=1&bw_bytes=34567&source=something%20else',
+            ],
+
+            [
+                null,
+                null,
+                null,
+                null,
+                'http://mymatomo.com/matomo.php?idsite=1&rec=1&apiv=1&r=&r=&cid=abcdef01234517ab&url=https%3A%2F%2Fexample.com%2Fpage&urlref=&recMode=1',
+            ],
+        ];
+    }
+
+    /**
+     * Blanks out the values of tracking parameters that differ between runs so URLs can be compared.
+     *
+     * Every matched parameter, including `_idts`, is rewritten to `&r=`; the expected URLs in
+     * getTestDataForGetUrlTrackAIBot() are written against that output.
+     */
+    private function normalizeTrackingUrl(string $url): string
+    {
+        $nonDeterministicParams = [
+            'r',
+            '_idts',
+        ];
+
+        foreach ($nonDeterministicParams as $param) {
+            $url = preg_replace('/&' . preg_quote($param, '/') . '=[^&]+/', '&r=', $url);
+        }
+
+        return $url;
+    }
+
     public function testUsageApiUrl(): void
     {
         $newApiUrl = 'https://NEW-API-URL.com';