Wednesday, June 10, 2015

Day 103, ##, Repeated DNA Sequences

All DNA is composed of a series of nucleotides abbreviated as A, C, G, and T, for example: "ACGAATTCCG". When studying DNA, it is sometimes useful to identify repeated sequences within the DNA.
Write a function to find all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule.
For example,
Given s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT",

Return:
["AAAAACCCCC", "CCCCCAAAAA"].
-----------------------------------------------------
Solution #1, unordered_map would get Memory Limit Exceeded.

class Solution {
public:
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> rt;
        hash<string> hash_fn;
        if (s.length() < 11) return rt;
        
        unordered_map<int,int> map;
        for (int i = 0; i < s.length() - 9; i++) {
            string sub = s.substr(i,10);
            int hashN = hash_fn(sub);
            
            if (map.find(hashN) == map.end()) {
                map[hashN] = 1;
            }else if (map[hashN] == 1){
                rt.push_back(sub);
                map[hashN]++;
            }
        }

        return rt;
    }
};
Solution #2, convert ACGT to 4-bit representation, so the space cost of one key is reduced from 10 * 8 bits to 2 bits
public:
    vector<string> findRepeatedDnaSequences(string s) {
        vector<string> rt;
        if (s.length() < 11) return rt;
        
        unordered_map<char,int> map;
        map['A'] = 0;
        map['C'] = 1;
        map['G'] = 2;
        map['T'] = 3;
        
        unordered_map<int,int> count;
        unsigned hash = 0;
        for (int i = 0; i < s.length(); i++) {
            hash <<= 2;
            hash += map[s[i]];
            if (i > 8) {
                hash &= (1 << 20) - 1;
                if (count.find(hash) == count.end()) {
                    count[hash] = 1;
                }else if (count[hash] == 1) {
                    count[hash]++;
                    rt.push_back(s.substr(i - 9,10));
                }
            }
        }
        
        return rt;
    }
};
another solution https://leetcode.com/discuss/24478/i-did-it-in-10-lines-of-c

No comments:

Post a Comment