Create bm_regex_generic.py #87

Status: Open. Wants to merge 1 commit into main.
pyperformance/benchmarks/bm_regex_generic.py (new file: 114 additions, 0 deletions)

"""Benchmarks for Python's regex engine.

Benchmarks for checking begining and end of strings. Should have reasonable times after https://bugs.python.org/issue42885 is fixed.

Real life use case is e.g. comparing 500.000 filenames and paths each with thousands of regex of known malicous files, https://github.com/Neo23x0/signature-base/blob/master/iocs/filename-iocs.txt
(^ and $ mostly not yet included because it wouldn't give a speed advantage in https://github.com/Neo23x0/Loki)

Adopted from bm_regex_effbot.py by Arnim Rupp

Feel free to add other generic regex.

"""

# Python imports
import re

# Local imports
import pyperf

# These are the regular expressions to be tested.

def gen_regex_table():
    return [
        re.compile(r'^c:'),          # very slow if false
        re.compile(r'\Ac:'),         # very slow if false
        re.compile(r'evil\.exe$'),
        re.compile(r'evil\.exe\Z'),
        re.compile(r'\Ad:'),
        re.compile(r'evil\.dll$'),   # very slow if true
        re.compile(r'evil\.dll\Z'),  # very slow if true
    ]


def gen_string_table(n):
    """Generates the list of strings that will be used in the benchmarks."""
    strings = []
    strings.append('d:' + ('A' * (10 ** n)) + 'evil.dll')
    return strings


def init_benchmarks(n_values=None):
    """Initialize the strings we'll run the regexes against.

    The generated strings are stored in the string_tables dict, which is
    indexed by n.

    Returns:
        A list of (compiled regex, string table) pairs.
    """
    if n_values is None:
        n_values = (3, 5, 7)

    string_tables = {n: gen_string_table(n) for n in n_values}
    regexs = gen_regex_table()

    data = []
    for n in n_values:
        for regex in regexs:
            string = string_tables[n]
            data.append((regex, string))
    return data


def bench_regex_generic(loops):
    if bench_regex_generic.data is None:
        bench_regex_generic.data = init_benchmarks()
    data = bench_regex_generic.data

    range_it = range(loops)
    search = re.search
    t0 = pyperf.perf_counter()

    for _ in range_it:
        # Run every (regex, string table) pair for every value of n.
        for regex, string in data:
            # search 10 times against the single generated string
            search(regex, string[0])
            search(regex, string[0])
            search(regex, string[0])
            search(regex, string[0])
            search(regex, string[0])
            search(regex, string[0])
            search(regex, string[0])
            search(regex, string[0])
            search(regex, string[0])
            search(regex, string[0])

    return pyperf.perf_counter() - t0


# cached data, generated at the first call
bench_regex_generic.data = None


def add_cmdline_args(cmd, args):
    pass
    # if args.force_bytes:
    #     cmd.append("--force_bytes")


if __name__ == '__main__':
    runner = pyperf.Runner(add_cmdline_args=add_cmdline_args)
    runner.metadata['description'] = ("Test the performance of regexps "
                                      "with long strings.")
    options = runner.parse_args()

    # inner_loops=10 matches the 10 unrolled search() calls per data item above.
    runner.bench_time_func('regex_generic', bench_regex_generic, inner_loops=10)
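
# Usage sketch (an illustration, not part of this patch): with pyperf installed,
# the file can be run standalone and accepts the standard pyperf runner options
# (see `python bm_regex_generic.py --help`), e.g.
#
#     python bm_regex_generic.py -o bench_regex_generic.json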