Skip to content

feat(strings): add professional suffix array and LCP implementation #12817

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions strings/suffix_array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
suffix_array.py

Professional implementation of Suffix Array and LCP (Longest Common Prefix) array in Python.

Features:
- O(n log^2 n) construction using the prefix-doubling method
  (one comparison sort per doubling round)
- Kasai's algorithm for LCP array in O(n)
- Detailed docstrings and complexity analysis
- Standalone usage example and simple unit tests

Author: Idris Ibrahim Erten
License: MIT
"""


def build_suffix_array(s: str) -> list[int]:
    """Return the suffix array of *s* using prefix doubling.

    The suffix array lists the starting indices of all suffixes of *s* in
    ascending lexicographic order (characters compared by code point).

    Each round sorts suffixes by their first ``2 * k`` characters, using the
    ranks from the previous round as a two-part key, then doubles ``k``.
    There are at most ``log2(n)`` rounds with one comparison sort each, so
    the total cost is O(n log^2 n) time and O(n) extra space.

    Args:
        s: The text to index. May be empty.

    Returns:
        A list of ``len(s)`` indices; an empty list for an empty string.

    >>> build_suffix_array("banana")
    [5, 3, 1, 0, 4, 2]
    >>> build_suffix_array("")
    []
    """
    # Sentinel suffix: it is a prefix of every suffix that starts with "\0",
    # and a missing second key is ranked -1, so it always sorts first.
    text = s + "\0"
    n = len(text)

    ranks = [ord(ch) for ch in text]  # round 0: rank by first character
    sa = list(range(n))
    tmp = [0] * n  # scratch buffer for the next round's ranks

    k = 1
    while k < n:
        # Build each suffix's (rank, rank k positions later) pair once per
        # round; it is used both as the sort key and for re-ranking below.
        keys = [
            (ranks[i], ranks[i + k] if i + k < n else -1)
            for i in range(n)
        ]
        sa.sort(key=keys.__getitem__)

        # Suffixes with equal keys share a rank; each new key bumps the rank.
        tmp[sa[0]] = 0
        for i in range(1, n):
            prev, curr = sa[i - 1], sa[i]
            tmp[curr] = tmp[prev] + (1 if keys[curr] != keys[prev] else 0)

        ranks, tmp = tmp, ranks  # swap buffers instead of reallocating
        k <<= 1

        # All ranks distinct means the order is final; stop early.
        if ranks[sa[-1]] == n - 1:
            break

    # Position 0 always holds the sentinel suffix; drop it.
    return sa[1:]


def build_lcp_array(s: str, sa: list[int]) -> list[int]:
    """Return LCP lengths of neighbouring suffixes in suffix-array order.

    Entry ``i`` of the result is the length of the longest common prefix of
    the suffixes starting at ``sa[i]`` and ``sa[i + 1]``, so the result has
    one element fewer than ``sa`` (and is empty when ``sa`` has fewer than
    two entries).

    Uses Kasai's algorithm: suffixes are visited in text order and the match
    length from the previous suffix, minus one, is carried over as a starting
    point, which keeps the total character comparisons linear when ``sa`` is
    the suffix array of ``s``.

    Args:
        s: The original text.
        sa: Suffix array of ``s``.

    Returns:
        The list of adjacent-suffix LCP lengths.

    >>> build_lcp_array("banana", [5, 3, 1, 0, 4, 2])
    [1, 3, 0, 0, 2]
    """
    text_len = len(s)
    count = len(sa)

    # rank_of[p] is the position within ``sa`` of the suffix starting at p.
    rank_of = [0] * count
    for rank, start in enumerate(sa):
        rank_of[start] = rank

    heights = [0] * count
    matched = 0
    for start in range(text_len):
        rank = rank_of[start]
        if rank == 0:
            # The smallest suffix has no predecessor to compare against.
            matched = 0
            continue

        neighbour = sa[rank - 1]
        # Extend the match beyond the characters already known to agree.
        while (
            start + matched < text_len
            and neighbour + matched < text_len
            and s[start + matched] == s[neighbour + matched]
        ):
            matched += 1

        heights[rank] = matched
        # The next suffix in text order shares at least matched - 1 chars
        # with its own predecessor.
        matched = max(matched - 1, 0)

    # heights[0] belongs to the first suffix, which has no predecessor.
    return heights[1:]


if __name__ == "__main__":
    # Demo: show the suffix array and LCP array for a few classic strings.
    for sample in ("banana", "abracadabra", "mississippi"):
        suffix_order = build_suffix_array(sample)
        common_prefixes = build_lcp_array(sample, suffix_order)
        print(f"String: {sample}")
        print(f"Suffix Array: {suffix_order}")
        print(f"LCP Array : {common_prefixes}\n")

    # Sanity check against the known suffix array of "banana"
    # (indices of its suffixes in sorted order).
    banana_sa = [5, 3, 1, 0, 4, 2]
    assert build_suffix_array("banana") == banana_sa, "SA test failed"
    print("All tests passed!")
Loading