Finish Week 3 Tasks using Literal Programming

2019-12-05 02:19:19 +01:00 · 2019-12-05 02:19:19 +01:00 · fe23029144
commit fe23029144
parent f75408cb6b
1 changed files with 373 additions and 2 deletions
--- a/Notebook.org
+++ b/Notebook.org
@ -121,6 +121,377 @@
 **** Computational approaches to find regulatory motifs
 ***** Exercise: Find the most common nucleotides in each position
 We are going to create a *t x k* Motif Matrix, where *t* is the /k-mer/ string. In each position, we'll insert the most frequent nucleotide, in upper case,
 and the nucleotide in lower case (if there's no popular one).
 Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters.
 We'll use a *4 x k* Count Matrix, one row for each base. We'll first generate
 the Matrix:
 #+BEGIN_SRC python
 def Count(Motifs):
    k = len(Motifs[0])
    count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count
 #+END_SRC
 Now that we have a Count Matrix, we will generate a Profile Matrix, which has
 the frequency of the nucleotide instead of the count:
 #+BEGIN_SRC python
 def Profile(Motifs):
    t = len(Motifs)
    profile = Count(Motifs)
    for key, v in profile.items():
        v[:] = [x / t for x in v]
    return profile
 def Count(Motifs):
    k = len(Motifs[0])
    count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count
 #+END_SRC
 ***** Exercise: Form the most frequent sequence of nucleotides
 Finally, we can form a Consensus string, to get a candidate regulatory motif:
 #+BEGIN_SRC python
 def Consensus(Motifs):
    consensus = ""
    count = Count(Motifs)
    k = len(Motifs[0])
    for j in range(k):
        m = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > m:
                m = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
    return consensus
 def Count(Motifs):
    k = len(Motifs[0])
    count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count
 #+END_SRC
 After obtaining the Consensus string, all we need to do is obtains the total
 score of our selected /k-mers/:
 #+BEGIN_SRC python
 def Score(Motifs):
    score = 0
    for i in range(len(Motifs)):
        score += HammingDistance(Motifs[i], Consensus(Motifs))
    return score
 def Consensus(Motifs):
    consensus = ""
    count = Count(Motifs)
    k = len(Motifs[0])
    for j in range(k):
        m = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > m:
                m = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
    return consensus
 def Count(Motifs):
    k = len(Motifs[0])
    count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count
 def HammingDistance(p, q):
    count = 0
    for i in range(0, len(p)):
        if p[i] != q[i]:
            count += 1
    return count
 #+END_SRC
 ***** Exercise: Find a set of /k-mers/ that minimize the score
 Applying a brute force approach for this task is not viable, we'll use a Greedy Algorithm. For that, we'll first determine the probability
 of a sequence, we'll use:
 #+BEGIN_SRC python
 def Pr(Text, Profile):
    probability = 1
    k = len(Text)
    for i in range(k):
        probability *= Profile[Text[i]][i]
    return probability
 #+END_SRC
 We'll use this function to find the most probable /k-mer/ in a sequence:
 #+BEGIN_SRC python
 def ProfileMostProbableKmer(text, k, profile):
    kmer = ""
    keys = ["A", "C", "G", "T"]
    d = dict(zip(keys,profile))
    prob = -1
    for i in range(len(text)-k+1):
        if (Pr((text[i:i+k]), profile) > prob):
            prob = Pr(text[i:i+k], profile)
            kmer = text[i:i+k]
    return kmer
 def Pr(Text, Profile):
    probability = 1
    k = len(Text)
    for i in range(k):
        probability *= Profile[Text[i]][i]
    return probability
 #+END_SRC
 Now we're finally ready to assemble all the pieces and implement a Greedy Motif
 Search Algorithm:
 #+BEGIN_SRC python
 def GreedyMotifSearch(Dna, k, t):
    BestMotifs = []
    for i in range(0, t):
        BestMotifs.append(Dna[i][0:k])
    n = len(Dna[0])
    for i in range(n-k+1):
        Motifs = []
        Motifs.append(Dna[0][i:i+k])
        for j in range(1, t):
            P = Profile(Motifs[0:j])
            Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
        if Score(Motifs) < Score(BestMotifs):
            BestMotifs = Motifs
    return BestMotifs
 def Score(Motifs):
    score = 0
    for i in range(len(Motifs)):
        score += HammingDistance(Motifs[i], Consensus(Motifs))
    return score
 def Consensus(Motifs):
    consensus = ""
    count = Count(Motifs)
    k = len(Motifs[0])
    for j in range(k):
        m = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > m:
                m = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
    return consensus
 def Count(Motifs):
    k = len(Motifs[0])
    count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0] * k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count
 def HammingDistance(p, q):
    count = 0
    for i in range(0, len(p)):
        if p[i] != q[i]:
            count += 1
    return count
 def Profile(Motifs):
    t = len(Motifs)
    profile = Count(Motifs)
    for key, v in profile.items():
        v[:] = [x / t for x in v]
    return profile
 def ProfileMostProbableKmer(text, k, profile):
    kmer = ""
    keys = ["A", "C", "G", "T"]
    d = dict(zip(keys,profile))
    prob = -1
    for i in range(len(text)-k+1):
        if (Pr((text[i:i+k]), profile) > prob):
            prob = Pr(text[i:i+k], profile)
            kmer = text[i:i+k]
    return kmer
 def Pr(Text, Profile):
    probability = 1
    k = len(Text)
    for i in range(k):
        probability *= Profile[Text[i]][i]
    return probability
 #+END_SRC
 ***** Motifs in tuberculosis
 Tuberculosis is an infectious disease, cause by a bacteria called /Mycobacterium
 tuberculosis/. The bacteria can stay latent in the host for decades, in hypoxic
 environments.
 Our Greedy Algorithm can help us identify a motif that might be involved in the process.
 The transcription factor behind this behaviour is *DosR*, we'll identify the
 binding sites:
 #+BEGIN_SRC python  :results output
 def GreedyMotifSearch(Dna, k, t):
    BestMotifs = []
    for i in range(0, t):
        BestMotifs.append(Dna[i][0:k])
    n = len(Dna[0])
    for i in range(n - k + 1):
        Motifs = []
        Motifs.append(Dna[0][i : i + k])
        for j in range(1, t):
            P = Profile(Motifs[0:j])
            Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
        if Score(Motifs) < Score(BestMotifs):
            BestMotifs = Motifs
    return BestMotifs
 def Score(Motifs):
    score = 0
    for i in range(len(Motifs)):
        score += HammingDistance(Motifs[i], Consensus(Motifs))
    return score
 def Consensus(Motifs):
    consensus = ""
    count = Count(Motifs)
    k = len(Motifs[0])
    for j in range(k):
        m = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > m:
                m = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
    return consensus
 def Count(Motifs):
    k = len(Motifs[0])
    count = {"A": [0] * k, "C": [0] * k, "G": [0] * k, "T": [0] * k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count
 def HammingDistance(p, q):
    count = 0
    for i in range(0, len(p)):
        if p[i] != q[i]:
            count += 1
    return count
 def Profile(Motifs):
    t = len(Motifs)
    profile = Count(Motifs)
    for key, v in profile.items():
        v[:] = [x / t for x in v]
    return profile
 def ProfileMostProbableKmer(text, k, profile):
    kmer = ""
    keys = ["A", "C", "G", "T"]
    d = dict(zip(keys, profile))
    prob = -1
    for i in range(len(text) - k + 1):
        if Pr((text[i : i + k]), profile) > prob:
            prob = Pr(text[i : i + k], profile)
            kmer = text[i : i + k]
    return kmer
 def Pr(Text, Profile):
    probability = 1
    k = len(Text)
    for i in range(k):
        probability *= Profile[Text[i]][i]
    return probability
 Dna = [
    "GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
    "CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
    "ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
    "GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
    "GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
    "CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
    "GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
    "GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
    "GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
    "TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC",
 ]
 t = len(Dna)
 k = 15
 Motifs = GreedyMotifSearch(Dna, k, t)
 print(Motifs)
 print(Score(Motifs))
 #+END_SRC
 #+RESULTS:
 : ['GTTAGGGCCGGAAGT', 'CCGATCGGCATCACT', 'ACCGTCGATGTGCCC', 'GGGTCAGGTATATTT', 'GTGACCGACGTCCCC', 'CTGTTCGCCGGCAGC', 'CTGTTCGATATCACC', 'GTACATGTCCAGAGC', 'GCGATAGGTGAGATT', 'CTCATCGCTGTCATC']
 : 64
 Our algorithm is pretty fast, but it's not optimal, and that's just a
 characteristic of Greedy Algorithms, they trade optimality for speed.
 ** Vocabulary
   - k-mer: subsquences of length /k/ in a biological sequence
   - Frequency map: sequence --> frequency of the sequence