All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", return ["AAAAACCCCC", "CCCCCAAAAA"].
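Using a map of substrings exceeded the memory limit, so a bitmap-style table is used instead: there are only four letters, so two bits are enough to encode each one, and ten letters take 20 bits. An array with 2^20 entries is therefore enough to solve the problem.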
/// Finds every 10-letter DNA substring that occurs more than once.
///
/// Each nucleotide is packed into 2 bits, so a 10-letter window becomes a
/// 20-bit integer that indexes directly into a 2^20-entry occurrence table.
/// Distinct windows always map to distinct indices, so no collision handling
/// is needed.
class Solution {
public:
    /// Maps a nucleotide letter to its 2-bit code.
    ///
    /// @param a  Character to encode; expected to be 'A', 'C', 'G' or 'T'.
    /// @return   0, 1, 2 or 3 for 'A', 'C', 'G', 'T' respectively, or -1 for
    ///           any other character (every path returns a value).
    int chartoint(char a)
    {
        switch (a) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default:  return -1;
        }
    }

    /// Returns all 10-letter substrings of `s` that appear at least twice.
    ///
    /// Windows that contain a character other than A/C/G/T are skipped: such
    /// a character resets the rolling window, so only runs of valid
    /// nucleotides are ever counted.
    ///
    /// @param s  The DNA string to scan.
    /// @return   Each repeated 10-letter sequence exactly once, in ascending
    ///           lexicographic order. Empty when `s` is shorter than 10.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        typedef std::string::size_type size_type;

        const size_type kWindow = 10;
        // Low 20 bits set: 2 bits per nucleotide * 10 nucleotides.
        const unsigned int kMask = (1u << 20) - 1u;

        if (s.size() < kWindow)
            return std::vector<std::string>();

        // Per-window state: 0 = never seen, 1 = seen once, 2 = already
        // reported. Heap-allocated so the 2^20-entry table does not live on
        // the stack, and saturating so the counter can never overflow.
        std::vector<unsigned char> seen(kMask + 1u, 0);

        // Ordered set keeps the result sorted.
        std::set<std::string> repeated;

        unsigned int code = 0;  // rolling 2-bit-per-letter encoding
        size_type run = 0;      // consecutive valid nucleotides ending at i

        for (size_type i = 0; i < s.size(); ++i) {
            const int nucleotide = chartoint(s[i]);
            if (nucleotide < 0) {
                // Not a nucleotide: no window may span this position.
                run = 0;
                code = 0;
                continue;
            }

            // Shift the new letter in and drop the letter that fell out of
            // the 10-letter window by masking to 20 bits.
            code = ((code << 2) | static_cast<unsigned int>(nucleotide)) & kMask;

            if (++run < kWindow)
                continue;  // window not full yet

            unsigned char& state = seen[code];
            if (state == 0) {
                state = 1;
            } else if (state == 1) {
                // Second occurrence: report once, then saturate.
                state = 2;
                repeated.insert(s.substr(i + 1 - kWindow, kWindow));
            }
        }

        return std::vector<std::string>(repeated.begin(), repeated.end());
    }
};