From fe23029144916938dff25154b50b5ea1d4e1cf00 Mon Sep 17 00:00:00 2001
From: coolneng
Date: Thu, 5 Dec 2019 02:19:19 +0100
Subject: [PATCH] Finish Week 3 Tasks using Literate Programming

---
 Notebook.org | 375 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 373 insertions(+), 2 deletions(-)

diff --git a/Notebook.org b/Notebook.org
index 136ae09..e02ff11 100644
--- a/Notebook.org
+++ b/Notebook.org
@@ -116,11 +116,382 @@

** Week 3
*** The circadian clock

Variation in gene expression permits the cell to keep track of time.

**** Computational approaches to find regulatory motifs
***** Exercise: Find the most common nucleotides in each position

We are going to create a *t x k* Motif Matrix, where *t* is the number of /k-mer/ strings and *k* is their length.
In each column we'll write the most frequent nucleotide in upper case, and use lower case when no single nucleotide dominates.
Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters.
To build it, we'll use a *4 x k* Count Matrix, one row for each base. We'll first generate
the Matrix:

#+BEGIN_SRC python
def Count(Motifs):
    """Return the 4 x k count matrix of a list of motifs."""
    k = len(Motifs[0])
    count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0]*k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count
#+END_SRC

Now that we have a Count Matrix, we will generate a Profile Matrix, which holds
the frequency of each nucleotide instead of its count:

#+BEGIN_SRC python
def Profile(Motifs):
    """Return the profile matrix: each count divided by the number of motifs."""
    t = len(Motifs)
    profile = Count(Motifs)
    for key, v in profile.items():
        v[:] = [x / t for x in v]
    return profile
#+END_SRC

***** Exercise: Form the most frequent sequence of nucleotides

Finally, we can form a Consensus string, to get a candidate regulatory motif:

#+BEGIN_SRC python
def Consensus(Motifs):
    """Return the string made of the most frequent nucleotide in each column."""
    consensus = ""
    count = Count(Motifs)
    k = len(Motifs[0])
    for j in range(k):
        m = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > m:
                m = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
    return consensus
#+END_SRC

After obtaining the Consensus string, all we need to do is compute the total
score of our selected /k-mers/, i.e. the number of positions at which each
motif differs from the consensus:

#+BEGIN_SRC python
def Score(Motifs):
    """Sum the Hamming distances between every motif and the consensus string."""
    score = 0
    for i in range(len(Motifs)):
        score += HammingDistance(Motifs[i], Consensus(Motifs))
    return score


def HammingDistance(p, q):
    """Count the positions at which the strings p and q differ."""
    count = 0
    for i in range(len(p)):
        if p[i] != q[i]:
            count += 1
    return count
#+END_SRC
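
As a quick sanity check, here is what these functions return on a tiny, made-up
motif matrix (three aligned 4-mers, chosen only for illustration). The snippet
assumes the definitions above have been evaluated in the same Python session:

#+BEGIN_SRC python
# Toy data: three aligned 4-mers (illustrative, not course data).
Motifs = ["ACGT", "ACGT", "ACCT"]

print(Count(Motifs))      # {'A': [3, 0, 0, 0], 'C': [0, 3, 1, 0], 'G': [0, 0, 2, 0], 'T': [0, 0, 0, 3]}
print(Consensus(Motifs))  # ACGT
print(Score(Motifs))      # 1 -> only "ACCT" differs from the consensus, at a single position
#+END_SRC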

***** Exercise: Find a set of /k-mers/ that minimize the score

A brute force approach to this task is not viable, so we'll use a Greedy
Algorithm. For that, we'll first determine the probability that a Profile
Matrix generates a given sequence:

#+BEGIN_SRC python
def Pr(Text, Profile):
    """Probability of Text under Profile: the product of one column frequency per position."""
    probability = 1
    k = len(Text)
    for i in range(k):
        probability *= Profile[Text[i]][i]
    return probability
#+END_SRC
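
For instance, take the profile of the toy motifs from the sanity check above;
its values are written out explicitly here, so the snippet only needs the Pr
function defined in the previous block:

#+BEGIN_SRC python
# Profile of the toy motifs ["ACGT", "ACGT", "ACCT"] used earlier.
profile = {
    "A": [1.0, 0.0, 0.0, 0.0],
    "C": [0.0, 1.0, 1 / 3, 0.0],
    "G": [0.0, 0.0, 2 / 3, 0.0],
    "T": [0.0, 0.0, 0.0, 1.0],
}

print(Pr("ACGT", profile))  # 0.666... = 1.0 * 1.0 * (2/3) * 1.0
print(Pr("ACCT", profile))  # 0.333... = 1.0 * 1.0 * (1/3) * 1.0
print(Pr("TCGT", profile))  # 0.0 -> a single zero-frequency column zeroes the whole product
#+END_SRC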

We'll now use Pr to find the most probable /k-mer/ in a longer sequence:

#+BEGIN_SRC python
def ProfileMostProbableKmer(text, k, profile):
    """Return the k-mer of text with the highest probability under profile."""
    kmer = ""
    prob = -1
    for i in range(len(text) - k + 1):
        p = Pr(text[i:i+k], profile)
        if p > prob:
            prob = p
            kmer = text[i:i+k]
    return kmer
#+END_SRC
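
Continuing with the toy profile from the previous example (again, illustrative
data only), the scan picks the only window with a non-zero probability:

#+BEGIN_SRC python
# "ACGT" is the only 4-mer window of this text with non-zero probability under the toy profile.
print(ProfileMostProbableKmer("GGACGTAC", 4, profile))  # ACGT (probability 2/3)
#+END_SRC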
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA", + "GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG", + "GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG", + "TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC", +] + +t = len(Dna) +k = 15 + +Motifs = GreedyMotifSearch(Dna, k, t) + +print(Motifs) +print(Score(Motifs)) +#+END_SRC + +#+RESULTS: +: ['GTTAGGGCCGGAAGT', 'CCGATCGGCATCACT', 'ACCGTCGATGTGCCC', 'GGGTCAGGTATATTT', 'GTGACCGACGTCCCC', 'CTGTTCGCCGGCAGC', 'CTGTTCGATATCACC', 'GTACATGTCCAGAGC', 'GCGATAGGTGAGATT', 'CTCATCGCTGTCATC'] +: 64 + +Our algorithm is pretty fast, but it's not optimal, and that's just a +characteristic of Greedy Algorithms, they trade optimality for speed. + + ** Vocabulary - k-mer: subsquences of length /k/ in a biological sequence - Frequency map: sequence --> frequency of the sequence