All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", return: ["AAAAACCCCC", "CCCCCAAAAA"].
Solutions:
因为只出现4个字符,可以巧用2位二进制位来代表它们。10个字符可以用20位来代表。用哈希映射。
/**
 * Finds every 10-letter DNA substring that occurs more than once.
 *
 * Each nucleotide (A, C, G, T) is packed into 2 bits, so a 10-letter window
 * fits into a 20-bit integer key that is updated in O(1) as the window slides
 * one character to the right.
 */
class Solution {
public:
    /**
     * Maps a nucleotide character to its 2-bit code.
     *
     * @param c  character to encode
     * @return   0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T';
     *           kInvalidCode (all bits set) for any other character.
     */
    unsigned int getInt(char c) {
        switch (c) {
            case 'A': return 0;
            case 'C': return 1;
            case 'G': return 2;
            case 'T': return 3;
            // Without a default the function would fall off its end for
            // unexpected input, which is undefined behavior.
            default:  return kInvalidCode;
        }
    }

    /**
     * Returns all 10-letter substrings of `s` that occur more than once.
     *
     * Each repeated substring is reported exactly once, in the order in which
     * its second occurrence is encountered while scanning left to right.
     * Characters other than A/C/G/T are not nucleotides: any window that
     * contains one is skipped.
     *
     * @param s  the DNA string to scan
     * @return   the repeated 10-letter sequences (empty if there are none)
     */
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        std::vector<std::string> result;
        // At least 11 characters are needed for two distinct windows.
        if (s.size() <= kSeqLen) {
            return result;
        }

        // Sparse occurrence counts per 20-bit key. A dense 2^20-entry int
        // table was noted by the original author to exceed the space limit.
        std::map<unsigned int, int> occurrences;

        unsigned int key = 0;                  // packed code of the current window
        std::string::size_type validRun = 0;   // consecutive valid nucleotides seen

        for (std::string::size_type i = 0; i < s.size(); ++i) {
            const unsigned int code = getInt(s[i]);
            if (code == kInvalidCode) {
                // A non-nucleotide breaks the window; start over after it.
                validRun = 0;
                key = 0;
                continue;
            }

            // Shift first, then append the new 2-bit code; the mask drops the
            // nucleotide that just left the window, independent of the width
            // of unsigned int.
            key = ((key << 2) | code) & kKeyMask;

            if (++validRun < kSeqLen) {
                continue;  // window not yet full
            }

            // Report a sequence exactly when its second occurrence is seen,
            // so no separate duplicate check on `result` is needed.
            if (++occurrences[key] == 2) {
                result.push_back(s.substr(i + 1 - kSeqLen, kSeqLen));
            }
        }
        return result;
    }

private:
    // Sentinel returned by getInt for characters that are not A/C/G/T.
    static const unsigned int kInvalidCode = ~0u;
    // Length of the sequences being searched for.
    static const std::string::size_type kSeqLen = 10;
    // Keeps the low 20 bits: 10 nucleotides * 2 bits each.
    static const unsigned int kKeyMask = (1u << 20) - 1;
};