All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT", Return: ["AAAAACCCCC", "CCCCCAAAAA"].-----------------------------------------------------
Solution #1, unordered_map
class Solution { public: vector<string> findRepeatedDnaSequences(string s) { vector<string> rt; hash<string> hash_fn; if (s.length() < 11) return rt; unordered_map<int,int> map; for (int i = 0; i < s.length() - 9; i++) { string sub = s.substr(i,10); int hashN = hash_fn(sub); if (map.find(hashN) == map.end()) { map[hashN] = 1; }else if (map[hashN] == 1){ rt.push_back(sub); map[hashN]++; } } return rt; } };Solution #2, convert ACGT to 4-bit representation, so the space cost of one key is reduced from 10 * 8 bits to 2 bits
public: vector<string> findRepeatedDnaSequences(string s) { vector<string> rt; if (s.length() < 11) return rt; unordered_map<char,int> map; map['A'] = 0; map['C'] = 1; map['G'] = 2; map['T'] = 3; unordered_map<int,int> count; unsigned hash = 0; for (int i = 0; i < s.length(); i++) { hash <<= 2; hash += map[s[i]]; if (i > 8) { hash &= (1 << 20) - 1; if (count.find(hash) == count.end()) { count[hash] = 1; }else if (count[hash] == 1) { count[hash]++; rt.push_back(s.substr(i - 9,10)); } } } return rt; } };another solution https://leetcode.com/discuss/24478/i-did-it-in-10-lines-of-c
No comments:
Post a Comment