题目:
All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", Return: ["AAAAACCCCC", "CCCCCAAAAA"].
题解:
用Python dict直接将每一个长度为10的子串作为key保存出现次数。但Java若以String作为key则无法通过memory limit,需要将'A', 'C', 'G', 'T'分别映射为2 bit的编码0x0, 0x1, 0x2, 0x3;10个字符共20 bit,因此用int的低20 bit作为key(每读入一个字符,key左移2位并入新编码,再与0xFFFFF按位与)。
C++版:
class Solution {
public:
    /// @brief Finds every 10-letter substring of `s` that occurs more than once.
    ///
    /// Each letter is packed into 2 bits, so a 10-letter window fits in the low
    /// 20 bits of an int. The window key is updated incrementally as the scan
    /// advances, and a hash map counts how often each key has been seen.
    ///
    /// @param s DNA string; expected to contain only 'A', 'C', 'G' and 'T'.
    ///          Any other character is encoded as 0, i.e. the same as 'A'.
    /// @return Each repeated 10-letter sequence exactly once, in the order in
    ///         which its second occurrence ends. Empty if `s` has 10 or fewer
    ///         characters.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        constexpr std::size_t kWindow = 10;
        constexpr int kKeyMask = 0xFFFFF;  // 10 letters * 2 bits = 20 bits

        std::vector<std::string> result;
        if (s.length() <= kWindow)
            return result;  // at most one window, so nothing can repeat

        std::unordered_map<int, int> count;
        int key = 0;
        for (std::size_t i = 0; i < s.length(); ++i) {
            // Shift the newest letter in; the mask drops the letter that just
            // left the 10-letter window.
            key = ((key << 2) | encode(s[i])) & kKeyMask;
            if (i + 1 < kWindow)
                continue;  // the first full window ends at index 9

            // One lookup per window: operator[] inserts 0 for a new key.
            int& seen = count[key];
            // The counter saturates at 2 so each sequence is reported only
            // once, at the moment of its second occurrence.
            if (seen < 2 && ++seen == 2)
                result.push_back(s.substr(i + 1 - kWindow, kWindow));
        }
        return result;
    }

private:
    /// Maps a nucleotide letter to its 2-bit code; unknown letters map to 0.
    static int encode(char base) {
        switch (base) {
            case 'C': return 0x1;
            case 'G': return 0x2;
            case 'T': return 0x3;
            default:  return 0x0;  // 'A' and anything unrecognised
        }
    }
};
Java版:
public class Solution {
public List<String> findRepeatedDnaSequences(String s) {
List<String> result = new ArrayList<>();
if(s.length() <= 10)
return result;
Map<String, Integer> count = new HashMap<>();
for(int i = 0; i <= s.length() - 10; i++) {
String current = s.substring(i, i+9);
if(!count.containsKey(current)) {
count.put(current, 1);
} else if(count.get(current) == 1) {
result.add(current);
count.put(current, 2);
}
}
return result;
}
}
Python版:
class Solution:
    # @param {string} s
    # @return {string[]}
    def findRepeatedDnaSequences(self, s):
        """Return every 10-letter substring of ``s`` that occurs more than once.

        Each repeated sequence appears exactly once in the result, in the
        order in which its second occurrence is encountered, so the output
        does not depend on dict iteration order. Strings of 10 or fewer
        characters yield an empty list.
        """
        window = 10
        result = []
        if len(s) <= window:
            # At most one window exists, so nothing can repeat.
            return result
        seen = {}
        for start in range(len(s) - window + 1):
            sub = s[start:start + window]
            times = seen.get(sub, 0)
            if times == 1:
                # Second sighting: report the sequence once.
                result.append(sub)
            if times < 2:
                # Saturate at 2; later sightings need no further bookkeeping.
                seen[sub] = times + 1
        return result