From fe23029144916938dff25154b50b5ea1d4e1cf00 Mon Sep 17 00:00:00 2001
From: coolneng
Date: Thu, 5 Dec 2019 02:19:19 +0100
Subject: [PATCH] Finish Week 3 Tasks using Literate Programming

---
 Notebook.org | 375 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 373 insertions(+), 2 deletions(-)

diff --git a/Notebook.org b/Notebook.org
index 136ae09..e02ff11 100644
--- a/Notebook.org
+++ b/Notebook.org
@@ -116,11 +116,382 @@

** Week 3
*** The circadian clock

Variation in gene expression permits the cell to keep track of time.

**** Computational approaches to find regulatory motifs
***** Exercise: Find the most common nucleotides in each position

We are going to create a *t x k* Motif Matrix, where *t* is the number of /k-mer/ strings and *k* is their length.
In each column we'll write the most frequent nucleotide in upper case, and use lower case when no single nucleotide dominates.
Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters.
To build it, we'll use a *4 x k* Count Matrix, one row for each base. We'll first generate
the Matrix:

#+BEGIN_SRC python
def Count(Motifs):
    """Return the 4 x k count matrix of a list of motifs."""
    k = len(Motifs[0])
    count = {'A': [0]*k, 'C': [0]*k, 'G': [0]*k, 'T': [0]*k}
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count
#+END_SRC

Now that we have a Count Matrix, we will generate a Profile Matrix, which holds
the frequency of each nucleotide instead of its count:

#+BEGIN_SRC python
def Profile(Motifs):
    """Return the profile matrix: each count divided by the number of motifs."""
    t = len(Motifs)
    profile = Count(Motifs)
    for key, v in profile.items():
        v[:] = [x / t for x in v]
    return profile
#+END_SRC

***** Exercise: Form the most frequent sequence of nucleotides

Finally, we can form a Consensus string, to get a candidate regulatory motif:

#+BEGIN_SRC python
def Consensus(Motifs):
    """Return the string made of the most frequent nucleotide in each column."""
    consensus = ""
    count = Count(Motifs)
    k = len(Motifs[0])
    for j in range(k):
        m = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > m:
                m = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
    return consensus
#+END_SRC

After obtaining the Consensus string, all we need to do is compute the total
score of our selected /k-mers/, i.e. the number of positions at which each
motif differs from the consensus:

#+BEGIN_SRC python
def Score(Motifs):
    """Sum the Hamming distances between every motif and the consensus string."""
    score = 0
    for i in range(len(Motifs)):
        score += HammingDistance(Motifs[i], Consensus(Motifs))
    return score


def HammingDistance(p, q):
    """Count the positions at which the strings p and q differ."""
    count = 0
    for i in range(len(p)):
        if p[i] != q[i]:
            count += 1
    return count
#+END_SRC
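
As a quick sanity check, here is what these functions return on a tiny, made-up
motif matrix (three aligned 4-mers, chosen only for illustration). The snippet
assumes the definitions above have been evaluated in the same Python session:

#+BEGIN_SRC python
# Toy data: three aligned 4-mers (illustrative, not course data).
Motifs = ["ACGT", "ACGT", "ACCT"]

print(Count(Motifs))      # {'A': [3, 0, 0, 0], 'C': [0, 3, 1, 0], 'G': [0, 0, 2, 0], 'T': [0, 0, 0, 3]}
print(Consensus(Motifs))  # ACGT
print(Score(Motifs))      # 1 -> only "ACCT" differs from the consensus, at a single position
#+END_SRC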

***** Exercise: Find a set of /k-mers/ that minimize the score

A brute force approach to this task is not viable, so we'll use a Greedy
Algorithm. For that, we'll first determine the probability that a Profile
Matrix generates a given sequence:

#+BEGIN_SRC python
def Pr(Text, Profile):
    """Probability of Text under Profile: the product of one column frequency per position."""
    probability = 1
    k = len(Text)
    for i in range(k):
        probability *= Profile[Text[i]][i]
    return probability
#+END_SRC
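
For instance, take the profile of the toy motifs from the sanity check above;
its values are written out explicitly here, so the snippet only needs the Pr
function defined in the previous block:

#+BEGIN_SRC python
# Profile of the toy motifs ["ACGT", "ACGT", "ACCT"] used earlier.
profile = {
    "A": [1.0, 0.0, 0.0, 0.0],
    "C": [0.0, 1.0, 1 / 3, 0.0],
    "G": [0.0, 0.0, 2 / 3, 0.0],
    "T": [0.0, 0.0, 0.0, 1.0],
}

print(Pr("ACGT", profile))  # 0.666... = 1.0 * 1.0 * (2/3) * 1.0
print(Pr("ACCT", profile))  # 0.333... = 1.0 * 1.0 * (1/3) * 1.0
print(Pr("TCGT", profile))  # 0.0 -> a single zero-frequency column zeroes the whole product
#+END_SRC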

We'll now use Pr to find the most probable /k-mer/ in a longer sequence:

#+BEGIN_SRC python
def ProfileMostProbableKmer(text, k, profile):
    """Return the k-mer of text with the highest probability under profile."""
    kmer = ""
    prob = -1
    for i in range(len(text) - k + 1):
        p = Pr(text[i:i+k], profile)
        if p > prob:
            prob = p
            kmer = text[i:i+k]
    return kmer
#+END_SRC
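
Continuing with the toy profile from the previous example (again, illustrative
data only), the scan picks the only window with a non-zero probability:

#+BEGIN_SRC python
# "ACGT" is the only 4-mer window of this text with non-zero probability under the toy profile.
print(ProfileMostProbableKmer("GGACGTAC", 4, profile))  # ACGT (probability 2/3)
#+END_SRC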
"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA", + "GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG", + "GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG", + "TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC", +] + +t = len(Dna) +k = 15 + +Motifs = GreedyMotifSearch(Dna, k, t) + +print(Motifs) +print(Score(Motifs)) +#+END_SRC + +#+RESULTS: +: ['GTTAGGGCCGGAAGT', 'CCGATCGGCATCACT', 'ACCGTCGATGTGCCC', 'GGGTCAGGTATATTT', 'GTGACCGACGTCCCC', 'CTGTTCGCCGGCAGC', 'CTGTTCGATATCACC', 'GTACATGTCCAGAGC', 'GCGATAGGTGAGATT', 'CTCATCGCTGTCATC'] +: 64 + +Our algorithm is pretty fast, but it's not optimal, and that's just a +characteristic of Greedy Algorithms, they trade optimality for speed. + + ** Vocabulary - k-mer: subsquences of length /k/ in a biological sequence - Frequency map: sequence --> frequency of the sequence