Right you are, sir.
I have no clue what I was thinking when I wrote that piece of code. Here is a much clearer and more correct version(having worked out more examples by hand I have a much better feel for how this algo should go now):
def prune(strings):
"""
Removes from the given strings all the parts common to all strings
"""
result = [""] * len(strings)
for e in strings:
break
for i in range(len(e)):
if len(set([a[i] for a in strings])) > 1:
for j, s in enumerate(strings):
result[j] += s[i]
return set(result)
def classify(strings, n):
"""
given an array of strings gives a map of nth letter => array of strings
"""
result = {}
for s in strings:
if s[n] not in result:
result[s[n]] = set()
result[s[n]].add(s)
return result
cache = {}
def max_comparisons(strings):
l = len(strings)
if l <= 3:
return l-1
key = tuple(sorted(strings))
if key in cache:
return cache[key]
strings = prune(strings)
for e in strings: # Hack to retrieve a single element from set
break
if len(e) == 1:
return len(strings)-1
m = None
for i in range(len(e)):
groups = classify(strings, i).values()
for g in groups:
left_side = max_comparisons(g)
right_side = max_comparisons(strings-g)
tmp = max(left_side, right_side) + 1
if m is None or m > tmp:
m = tmp
cache[key] = m
return m
input = ["fVq9jcBEMNrVl0M2A8VmfNPLgd7kK5JD",
"I9FlZT90gu2n9ywNFBljC7jce2rR0SFJ",
"lGOmu7loH80DbOyaYqzQBGwTPDwHlr8u",
"rFqF5g9u5TXAAXM2sdxHpfZ4VzFBOgNN",
"lGOmu7loH80DbOyfb6NuuWcEP81D7J5r",
"tEANjAjjNwvGfG3uhjjAtFfg3OmYXVK2",
"lGOmu7loH80DOyncyYpajmdnNIGH6HSu",
"zQUwjMzloRmzlR5jYuvaaa9sqo9AH09B",
"JsNn1Hu7J2FjMxoi8t49AogNogfPKlk6",
"lGOmu7loH80DbOyFbtVa1K1m6UxQFvX5"]
print max_comparisons(input) # prints 4
input = ["aaa", "aab", "aba", "abb", "bcc", "ccd", "ddc", "edd"]
print max_comparisons(input) # prints 3
PS: The cache is necessary otherwise the smaller input takes ~3s and the larger one doesn't complete in less than 2min(I cut it off at that point). With the cache the small input finishes instantly(in less than 30ms, which is mostly taken up by the interpreter startup time) and the large one finishes in ~900ms.
The if len(strings) <= 3 is just a simplification(brings down the execution time of the large one from 1.1s to 900ms) and it is a safe assumption since:
1 string: no comparisons needed
2 strings: always 1 comparison needed(since they are unique)
3 strings: always 2 comparisons needed(a comparison will split the set into a part with 1 string and a part with 2)
If we have 4 or more strings it becomes impossible to simplify(4 strings may require 3 or 2 comparisons)