All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", return ["AAAAACCCCC", "CCCCCAAAAA"].
// Finds every 10-letter DNA substring that occurs more than once in a string.
//
// Each nucleotide is packed into 2 bits, so a 10-letter window fits in a
// 20-bit key. A table with one small counter per possible key (2^20 entries)
// records how often each window has been seen while a rolling key slides over
// the input one character at a time.
class Solution {
public:
    // Maps a nucleotide letter to its 2-bit code: A=0, C=1, G=2, T=3.
    // Returns -1 for any other character (the original fell off the end of
    // the function in that case, which is undefined behaviour).
    int getVal(char ch) {
        switch (ch) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            default:  return -1;
        }
    }

    // Returns all 10-letter substrings of `s` that appear at least twice.
    //
    // Each repeated substring is reported exactly once, in the order in which
    // its second occurrence ends in `s`. Inputs shorter than 10 characters
    // yield an empty result. A character other than A/C/G/T interrupts the
    // scan: no window containing such a character is counted or reported.
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        const std::size_t kWindow = 10;                        // letters per sequence
        const unsigned int kMask = (1u << (2 * kWindow)) - 1u; // keeps the low 20 bits

        std::vector<std::string> result;
        if (s.size() < kWindow)
            return result;

        // One counter per possible 10-letter window. Heap-allocated at one
        // byte per entry (1 MiB) rather than a 2^20-element int array on the
        // stack, which can overflow the stack. Counters saturate at 2.
        std::vector<unsigned char> seen(static_cast<std::size_t>(1) << (2 * kWindow), 0);

        unsigned int key = 0;   // rolling 2-bit-per-letter encoding of the window
        std::size_t run = 0;    // consecutive valid nucleotides ending at index i
        for (std::size_t i = 0; i < s.size(); ++i) {
            const int code = getVal(s[i]);
            if (code < 0) {
                // Invalid letter: every window touching it is discarded.
                run = 0;
                key = 0;
                continue;
            }

            // Append the new letter and drop the one that left the window.
            // The explicit mask does not depend on the width of unsigned int.
            key = ((key << 2) | static_cast<unsigned int>(code)) & kMask;

            if (++run < kWindow)
                continue;       // fewer than 10 valid letters so far

            unsigned char& count = seen[key];
            if (count == 0) {
                count = 1;      // first sighting
            } else if (count == 1) {
                count = 2;      // second sighting: report once, then ignore
                result.push_back(s.substr(i + 1 - kWindow, kWindow));
            }
        }
        return result;
    }
};