Optimizing with trie?

Guy_Rauscher · July 27, 2020, 2:29pm

I wonder why the time complexity has to be O(N∗M∗Len) (where ‘N’ is the number of characters in the given string, ‘M’ is the total number of words, and ‘Len’ is the length of a word).
It seems to me that the term ‘M’ can be removed by using a trie to check if any of the M words appear in a given location in the string using only Len steps (as opposed to M*Len the way the solution is implemented).
Thanks
Guy

Guy_Rauscher · May 27, 2020, 11:01am

Correction: a trie is not really any better complexity-wise than checking if a word is in a dictionary.

But the time complexity can still be improved to O(N*Len). The proposed solution does redundant work by looking for a complete concatenation at each step instead of using a sliding window to re-use some of the previous word matches.

Here is my proposed solution:

def find_word_concatenation(str, words):
  wlen = len(words[0])

  # calculate word frequencies
  wfreq = {}
  for w in words:
    wfreq.setdefault(w,0)
    wfreq[w] += 1

  result_indices = []

  for phase in range(wlen):
    matched_words = []
    match_start = phase

    for i in range(phase, len(str), wlen):
      word = str[i:i+wlen]

      if word in wfreq:
        # word match found!
        matched_words.append(word)
        wfreq[word] -= 1

        if wfreq[word] == 0 and len(matched_words) == len(words):
          # complete match found!
          result_indices.append(match_start)

        while wfreq[word] < 0 or len(matched_words) == len(words):
          # oops, too many occurences (or match found), shrink window
          word2 = matched_words.pop(0)
          wfreq[word2] += 1
          match_start += wlen

      else:
        # no match, clear everything
        for word2 in matched_words:
          wfreq[word2] += 1
        matched_words = []
        match_start = i + wlen

  return result_indices

matched_words is not necessary but it makes the code a little easier to read.

Please let me know if I’m missing anything

George_L_Taveras · August 25, 2020, 1:40am

Hey Guy,

I’m glad you mentioned this because I had the same idea. I believe you actually CAN improve the runtime with a Trie. I did so myself, bringing the suggested runtime from O(N * W * Len) to O(N + W * Len) (Notice the +) . For the solution to actually have better performance characteristics, you have to use a counting strategy similar to the one used in the Problem Challenge 2.

See my Javascript implementation (There are a couple moving pieces but let me know if you’d like me to explain further):

/**
 * W = number of words
 * w = word size
 * Space: O(W * w)
 * Time: O(N + (W * w))
 * 
 */

const createWordTrie = (words) => {
  const trie = {value: null, children: new Map() };
  let current = trie;
  for (const word of words) {
    for (const char of word) {
      const nextNode = current.children.get(char) || {value: null, children: new Map() };
      current.children.set(char, nextNode);
      current = nextNode;
    }
    current.value = current.value ? current.value + 1 : 1;
    current = trie;
  }
  return trie;
}
const find_word_concatenation = function(str, words) {
  const indices = [];
  const singleWordSize = words[0].length;
  const numberOfWords = words.length;
  const wordsSize = singleWordSize * words.length;
  const wordsTrie = createWordTrie(words);

  let windowStart = 0;
  let matches = 0;

  let frontTrieHead = wordsTrie;
  let backTrieHead = wordsTrie;

  for (let windowEnd = 0; windowEnd < str.length; windowEnd ++) {
    const char = str[windowEnd];

    if (frontTrieHead.children.has(char)) {
      frontTrieHead = frontTrieHead.children.get(char);
      if (frontTrieHead.value !== null) {
        frontTrieHead.value--;

        if (frontTrieHead.value === 0) {
          matches++;
        }

        frontTrieHead = wordsTrie;
      }
    } else {
       frontTrieHead = wordsTrie;
    }

    if (windowEnd >= wordsSize) {
      const startChar = str[windowStart];
      if (backTrieHead.children.has(startChar)) {
        backTrieHead = backTrieHead.children.get(startChar);

        if (backTrieHead.value !== null) {
          if (backTrieHead.value === 0) {
            matches--;
          }

          backTrieHead.value++;
          backTrieHead = wordsTrie
        }
      } else {
        backTrieHead = wordsTrie;
      }

      windowStart++;
    }

    if (matches === numberOfWords && backTrieHead === wordsTrie && frontTrieHead === wordsTrie) {
      indices.push(windowStart);
    }
  }

  return indices;
};