All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
Example:
Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT" Output: ["AAAAACCCCC", "CCCCCAAAAA"]
所有的DNA都是由一系列核苷酸组成的,简写为 A, C, G, and T,比如:"ACGAATTCCG"。当研究DNA时,识别DNA里的子序列是很有帮助的。写一个函数找出10个字母长度的出现过多次的子序列。
解法1:hash table + hash set
解法2: hash set
解法3:hash table + bit manipulte
Java:
public List<String> findRepeatedDnaSequences(String s) {
Set<String> result = new HashSet();
if(s ==null || s.length() <2)
return new ArrayList();
Set<String> temp = new HashSet();
for(int i=0; i<s.length()-9; i++){
String x = s.substring(i,i+10);
if(temp.contains(x)){
result.add(x);
} else
temp.add(x);
}
return new ArrayList(result);
}
Java:
public List<String> findRepeatedDnaSequences(String s) {
Set seen = new HashSet(), repeated = new HashSet();
for (int i = 0; i + 9 < s.length(); i++) {
String ten = s.substring(i, i + 10);
if (!seen.add(ten))
repeated.add(ten);
}
return new ArrayList(repeated);
}
Java: hashmap + bits manipulation
public List<String> findRepeatedDnaSequences(String s) {
Set<Integer> words = new HashSet<>();
Set<Integer> doubleWords = new HashSet<>();
List<String> rv = new ArrayList<>();
char[] map = new char[26];
//map['A' - 'A'] = 0;
map['C' - 'A'] = 1;
map['G' - 'A'] = 2;
map['T' - 'A'] = 3;
for(int i = 0; i < s.length() - 9; i++) {
int v = 0;
for(int j = i; j < i + 10; j++) {
v <<= 2;
v |= map[s.charAt(j) - 'A'];
}
if(!words.add(v) && doubleWords.add(v)) {
rv.add(s.substring(i, i + 10));
}
}
return rv;
}
Python:
class Solution(object):
def findRepeatedDnaSequences(self, s):
"""
:type s: str
:rtype: List[str]
"""
dict, rolling_hash, res = {}, 0, []
for i in xrange(len(s)):
rolling_hash = ((rolling_hash << 3) & 0x3fffffff) | (ord(s[i]) & 7)
if rolling_hash not in dict:
dict[rolling_hash] = True
elif dict[rolling_hash]:
res.append(s[i - 9: i + 1])
dict[rolling_hash] = False
return res
Python:
def findRepeatedDnaSequences2(self, s):
"""
:type s: str
:rtype: List[str]
"""
l, r = [], []
if len(s) < 10: return []
for i in range(len(s) - 9):
l.extend([s[i:i + 10]])
return [k for k, v in collections.Counter(l).items() if v > 1]
C++:
class Solution {
public:
vector<string> findRepeatedDnaSequences(string s) {
unordered_set<int> seen;
unordered_set<int> dup;
vector<string> result;
vector<char> m(26);
m['A' - 'A'] = 0;
m['C' - 'A'] = 1;
m['G' - 'A'] = 2;
m['T' - 'A'] = 3;
for (int i = 0; i + 10 <= s.size(); ++i) {
string substr = s.substr(i, 10);
int v = 0;
for (int j = i; j < i + 10; ++j) { //20 bits < 32 bit int
v <<= 2;
v |= m[s[j] - 'A'];
}
if (seen.count(v) == 0) { //not seen
seen.insert(v);
} else if (dup.count(v) == 0) { //seen but not dup
dup.insert(v);
result.push_back(substr);
} //dup
}
return result;
}
};