Problem
A DNA sequence is composed of a series of nucleotides abbreviated as ‘A’, ‘C’, ‘G’, and ‘T’.
- For example, “ACGAATTCCG” is a DNA sequence.
When studying DNA, it is useful to identify repeated sequences within the DNA.
Given a string s that represents a DNA sequence, return all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule. You may return the answer in any order.
Algorithm
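Insert every 10-letter substring of s into a trie whose terminal node stores how many times that substring has been inserted. The substrings whose count is greater than one are the answer.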
Code
class _Trie:
    """Prefix tree that counts how many times each word has been inserted.

    Every node is a plain dict mapping a single character to its child node.
    A node that terminates at least one inserted word additionally stores
    that word's insertion count under the ``'leaf'`` key.
    """

    # Key holding a terminal node's insertion count. It is longer than one
    # character, so it cannot collide with the single-character edge keys
    # produced by iterating over a string.
    _COUNT_KEY = 'leaf'

    def __init__(self):
        self.root = {}

    def insert(self, word):
        """Insert *word* and return how many times it has now been inserted."""
        node = self.root
        for ch in word:
            node = node.setdefault(ch, {})
        count = node.get(self._COUNT_KEY, 0) + 1
        node[self._COUNT_KEY] = count
        return count

    def search(self, word):
        """Return how many times *word* was inserted (0 if it never was)."""
        node = self.root
        for ch in word:
            if ch not in node:
                return 0
            node = node[ch]
        return node.get(self._COUNT_KEY, 0)


class Solution:
    """Find repeated fixed-length substrings of a DNA string using a trie."""

    # Length of the substrings that are checked for repetition.
    SEQUENCE_LENGTH = 10

    def findRepeatedDnaSequences(self, s: str) -> List[str]:
        """Return every 10-letter substring of *s* that occurs more than once.

        Args:
            s: The DNA sequence to scan.

        Returns:
            Each repeated substring exactly once. The order is unspecified
            by the problem; this implementation lists substrings in the
            order in which their second occurrence is encountered. The list
            is empty when *s* is shorter than the window length.
        """
        width = self.SEQUENCE_LENGTH
        trie = _Trie()
        repeated = []
        # range() is empty when len(s) < width, so short inputs yield [].
        for start in range(len(s) - width + 1):
            window = s[start:start + width]
            # Report a window only at the moment its count reaches 2, so
            # third and later occurrences do not produce duplicates.
            if trie.insert(window) == 2:
                repeated.append(window)
        return repeated