diff --git a/Code/ApproximatePatternCount.py b/Code/ApproximatePatternCount.py deleted file mode 100644 index f5945b1..0000000 --- a/Code/ApproximatePatternCount.py +++ /dev/null @@ -1,14 +0,0 @@ -def ApproximatePatternCount(Pattern, Text, d): - count = 0 - for i in range(len(Text)-len(Pattern)+1): - if Text[i:i+len(Pattern)] == Pattern or HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d: - count += 1 - return count - - -def HammingDistance(p, q): - count = 0 - for i in range(0, len(p)): - if p[i] != q[i]: - count += 1 - return count diff --git a/Code/ApproximatePatternMatching.py b/Code/ApproximatePatternMatching.py deleted file mode 100644 index 2edb1c0..0000000 --- a/Code/ApproximatePatternMatching.py +++ /dev/null @@ -1,16 +0,0 @@ -def ApproximatePatternMatching(Text, Pattern, d): - positions = [] - for i in range(len(Text)-len(Pattern)+1): - if Text[i:i+len(Pattern)] == Pattern: - positions.append(i) - elif HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d: - positions.append(i) - return positions - - -def HammingDistance(p, q): - count = 0 - for i in range(0, len(p)): - if p[i] != q[i]: - count += 1 - return count diff --git a/Code/FasterSymbolArray.py b/Code/FasterSymbolArray.py deleted file mode 100644 index ef1068c..0000000 --- a/Code/FasterSymbolArray.py +++ /dev/null @@ -1,20 +0,0 @@ -def FasterSymbolArray(Genome, symbol): - array = {} - n = len(Genome) - ExtendedGenome = Genome + Genome[0:n//2] - array[0] = PatternCount(symbol, Genome[0:n//2]) - for i in range(1, n): - array[i] = array[i-1] - if ExtendedGenome[i-1] == symbol: - array[i] = array[i]-1 - if ExtendedGenome[i+(n//2)-1] == symbol: - array[i] = array[i]+1 - return array - - -def PatternCount(Text, Pattern): - count = 0 - for i in range(len(Text)-len(Pattern)+1): - if Text[i:i+len(Pattern)] == Pattern: - count = count+1 - return count diff --git a/Code/FrequentWords.py b/Code/FrequentWords.py deleted file mode 100644 index 06ef4b2..0000000 --- a/Code/FrequentWords.py +++ /dev/null @@ -1,20 +0,0 @@ -def FrequentWords(Text, k): - words = [] - freq = FrequencyMap(Text, k) - m = max(freq.values()) - for key in freq: - if freq[key] == m: - words.append(key) - return words - - -def FrequencyMap(Text, k): - freq = {} - n = len(Text) - for i in range(n - k + 1): - Pattern = Text[i:i + k] - freq[Pattern] = 0 - for i in range(n - k + 1): - Pattern = Text[i:i + k] - freq[Pattern] += 1 - return freq diff --git a/Code/HammingDistance.py b/Code/HammingDistance.py deleted file mode 100644 index 9a7cfe5..0000000 --- a/Code/HammingDistance.py +++ /dev/null @@ -1,6 +0,0 @@ -def HammingDistance(p, q): - count = 0 - for i in range(0, len(p)): - if p[i] != q[i]: - count += 1 - return count diff --git a/Code/MinimumSkew.py b/Code/MinimumSkew.py deleted file mode 100644 index 04ac36b..0000000 --- a/Code/MinimumSkew.py +++ /dev/null @@ -1,18 +0,0 @@ -def MinimumSkew(Genome): - positions = [] - skew = SkewArray(Genome) - minimum = min(skew) - return [i for i in range(0, len(Genome)) if skew[i] == minimum] - - -def SkewArray(Genome): - Skew = [] - Skew.append(0) - for i in range(0, len(Genome)): - if Genome[i] == "G": - Skew.append(Skew[i] + 1) - elif Genome[i] == "C": - Skew.append(Skew[i] - 1) - else: - Skew.append(Skew[i]) - return Skew diff --git a/Code/PatternCount.py b/Code/PatternCount.py deleted file mode 100644 index 93ac561..0000000 --- a/Code/PatternCount.py +++ /dev/null @@ -1,6 +0,0 @@ -def PatternCount(Text, Pattern): - count = 0 - for i in range(len(Text)-len(Pattern)+1): - if Text[i:i+len(Pattern)] == Pattern: - count = count+1 - return count diff --git a/Code/PatternMatching.py b/Code/PatternMatching.py deleted file mode 100644 index c4e8225..0000000 --- a/Code/PatternMatching.py +++ /dev/null @@ -1,6 +0,0 @@ -def PatternMatching(Pattern, Genome): - positions = [] - for i in range(len(Genome)-len(Pattern)+1): - if Genome[i:i+len(Pattern)] == Pattern: - positions.append(i) - return positions diff --git a/Code/ReverseComplement.py b/Code/ReverseComplement.py deleted file mode 100644 index de6b4ef..0000000 --- a/Code/ReverseComplement.py +++ /dev/null @@ -1,17 +0,0 @@ -def ReverseComplement(Pattern): - Pattern = Reverse(Pattern) - Pattern = Complement(Pattern) - return Pattern - - -def Reverse(Pattern): - reversed = Pattern[::-1] - return reversed - - -def Complement(Pattern): - compl = "" - complement_letters = {"A": "T", "T": "A", "C": "G", "G": "C"} - for char in Pattern: - compl += complement_letters[char] - return compl diff --git a/Code/SkewArray.py b/Code/SkewArray.py deleted file mode 100644 index 2fee7cc..0000000 --- a/Code/SkewArray.py +++ /dev/null @@ -1,11 +0,0 @@ -def SkewArray(Genome): - Skew = [] - Skew.append(0) - for i in range(0, len(Genome)): - if Genome[i] == "G": - Skew.append(Skew[i] + 1) - elif Genome[i] == "C": - Skew.append(Skew[i] - 1) - else: - Skew.append(Skew[i]) - return Skew diff --git a/Code/SymbolArray.py b/Code/SymbolArray.py deleted file mode 100644 index bcb6387..0000000 --- a/Code/SymbolArray.py +++ /dev/null @@ -1,15 +0,0 @@ -def SymbolArray(Genome, symbol): - array = {} - n = len(Genome) - ExtendedGenome = Genome + Genome[0:n//2] - for i in range(n): - array[i] = PatternCount(ExtendedGenome[i:i+(n//2)], symbol) - return array - - -def PatternCount(Text, Pattern): - count = 0 - for i in range(len(Text)-len(Pattern)+1): - if Text[i:i+len(Pattern)] == Pattern: - count = count+1 - return count diff --git a/Notebook.org b/Notebook.org index e02ff11..95eeb26 100644 --- a/Notebook.org +++ b/Notebook.org @@ -5,113 +5,309 @@ *** DNA replication **** Origin of replication (ori) - - Locating an ori is key for gene therapy (e.g. viral vectors), to introduce a theraupetic gene. + +Locating an ori is key for gene therapy (e.g. viral vectors), to introduce a theraupetic gene. **** Computational approaches to find ori in Vibrio Cholerae ***** Exercise: find Pattern - We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use the function [[./Code/PatternCount.py][PatternCount]] to find out how many times - does a sequence appear in the genome. - - For the second part, we're going to calculate the frequency map of the sequences of length /k/, for that purpose we'll use [[./Code/FrequentWords.py][FrequentWords]] - +We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use this function to find out how many times +does a sequence appear in the genome: + +#+BEGIN_SRC python +def PatternCount(Text, Pattern): + count = 0 + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + count = count+1 + return count +#+END_SRC + +For the second part, we're going to calculate the frequency map of the sequences + of length /k/, for that purpose we'll use: + +#+BEGIN_SRC python +def FrequentWords(Text, k): + words = [] + freq = FrequencyMap(Text, k) + m = max(freq.values()) + for key in freq: + if freq[key] == m: + words.append(key) + return words + + +def FrequencyMap(Text, k): + freq = {} + n = len(Text) + for i in range(n - k + 1): + Pattern = Text[i:i + k] + freq[Pattern] = 0 + for i in range(n - k + 1): + Pattern = Text[i:i + k] + freq[Pattern] += 1 + return freq +#+END_SRC + ***** Exercise: Find the reverse complement of a sequence - We're going to generate the reverse complement of a sequence, which is the complement of a sequence, read in the same direction (5' -> 3'). - In this case, we're going to use [[./Code/ReverseComplement.py][ReverseComplement]] - After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent /k-mers/ are reverse complements of other frequent ones. +We're going to generate the reverse complement of a sequence, which is the complement of a sequence, read in the same direction (5' -> 3'). +In this case, we're going to use: + +#+BEGIN_SRC python +def ReverseComplement(Pattern): + Pattern = Reverse(Pattern) + Pattern = Complement(Pattern) + return Pattern + + +def Reverse(Pattern): + reversed = Pattern[::-1] + return reversed + + +def Complement(Pattern): + compl = "" + complement_letters = {"A": "T", "T": "A", "C": "G", "G": "C"} + for char in Pattern: + compl += complement_letters[char] + return compl +#+END_SRC + +After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent /k-mers/ are reverse complements of other frequent ones. ***** Exercise: Find a subsequence within a sequence - We're going to find the ocurrences of a subsquence inside a sequence, and save the index of the first letter in the sequence. - This time, we'll use [[./Code/PatternMatching.py][PatternMatching]] - After using our function on the /Vibrio Cholerae's/ genome, we find out that the /9-mers/ with the highest frequency appear in cluster. - This is strong statistical evidence that our subsequences are /DnaA boxes/. +We're going to find the ocurrences of a subsquence inside a sequence, and save the index of the first letter in the sequence. +This time, we'll use: + +#+BEGIN_SRC python +def PatternMatching(Pattern, Genome): + positions = [] + for i in range(len(Genome)-len(Pattern)+1): + if Genome[i:i+len(Pattern)] == Pattern: + positions.append(i) + return positions +#+END_SRC + +After using our function on the /Vibrio Cholerae's/ genome, we find out that the /9-mers/ with the highest frequency appear in cluster. +This is strong statistical evidence that our subsequences are /DnaA boxes/. **** Computational approaches to find ori in any bacteria - Now that we're pretty confident about the /DnaA boxes/ sequences that we found, we are going to check if they are a common pattern in the rest of bacterias. - We're going to find the ocurrences of the sequences in /Thermotoga petrophila/ using [[./Code/PatternCount.py][PatternCount]] - - After the execution, we observe that there are *no* ocurrences of the sequences found in /Vibrio Cholerae/. - We can conclude that different bacterias have different /DnaA boxes/. - - We have to try another computational approach then, find clusters of /k-mers/ repeated in a small interval. +Now that we're pretty confident about the /DnaA boxes/ sequences that we found, we are going to check if they are a common pattern in the rest of bacterias. +We're going to find the ocurrences of the sequences in /Thermotoga petrophila/ +with: + +#+BEGIN_SRC python +def PatternCount(Text, Pattern): + count = 0 + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + count = count+1 + return count +#+END_SRC + +After the execution, we observe that there are *no* ocurrences of the sequences found in /Vibrio Cholerae/. +We can conclude that different bacterias have different /DnaA boxes/. + +We have to try another computational approach then, find clusters of /k-mers/ repeated in a small interval. - ** Week 2 *** DNA replication (II) **** Replication process - The /DNA polymerases/ start replicating while the parent strands are unraveling. - On the lagging strand, the DNA polymerase waits until the replication fork opens around 2000 nucleotides, and because of that it forms Okazaki fragments. - We need 1 primer for the leading strand and 1 primer per Okazaki fragment for the lagging strand. - While the Okazaki fragments are being synthetized, a /DNA ligase/ starts joining the fragments together. +The /DNA polymerases/ start replicating while the parent strands are unraveling. +On the lagging strand, the DNA polymerase waits until the replication fork opens around 2000 nucleotides, and because of that it forms Okazaki fragments. +We need 1 primer for the leading strand and 1 primer per Okazaki fragment for the lagging strand. +While the Okazaki fragments are being synthetized, a /DNA ligase/ starts joining the fragments together. **** Computational approach to find ori using deamination - As the lagging strand is always waiting for the helicase to go forward, the lagging strand is mostly in single-stranded configuration, which is more prone to mutations. - One frequent form of mutation is *deamination*, a process that causes cytosine to convert into thymine. This means that cytosine is more frequent in half of the genome. +As the lagging strand is always waiting for the helicase to go forward, the lagging strand is mostly in single-stranded configuration, which is more prone to mutations. +One frequent form of mutation is *deamination*, a process that causes cytosine to convert into thymine. This means that cytosine is more frequent in half of the genome. ***** Exercise: count the ocurrences of cytosine - We're going to count the ocurrences of the bases in a genome and include them in a symbol array, for that purpose we'll use [[./Code/SymbolArray.py][SymbolArray]] - After executing the program, we realize that the algorithm is too inefficient. +We're going to count the ocurrences of the bases in a genome and include them in +a symbol array, for that purpose we'll use: + +#+BEGIN_SRC python +def SymbolArray(Genome, symbol): + array = {} + n = len(Genome) + ExtendedGenome = Genome + Genome[0:n//2] + for i in range(n): + array[i] = PatternCount(ExtendedGenome[i:i+(n//2)], symbol) + return array + + +def PatternCount(Text, Pattern): + count = 0 + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + count = count+1 + return count +#+END_SRC + +After executing the program, we realize that the algorithm is too inefficient. ***** Exercise: find a better algorithm for the previous exercise - - This time, we are going to evaluate an element /i+1/, using the element /i/. We'll use [[./Code/FasterSymbolArray.py][FasterSymbolArray]] to achieve this - After executing the program we see that it's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(n²)/. - In /Escherichia Coli/ we plotted the result of our program: - #+CAPTION: Symbol array for Cytosine in E. Coli Genome] - [[./Assets/e-coli.png]] - - From that graph, we conclude that ori is located around position 4000000, because that's where the Cytosine concentration is the lowest, - which indicates that the region stays single-stranded for the longest time. +This time, we are going to evaluate an element /i+1/, using the element /i/. +We'll use the following algorithm: + +#+BEGIN_SRC python +def FasterSymbolArray(Genome, symbol): + array = {} + n = len(Genome) + ExtendedGenome = Genome + Genome[0:n//2] + array[0] = PatternCount(symbol, Genome[0:n//2]) + for i in range(1, n): + array[i] = array[i-1] + if ExtendedGenome[i-1] == symbol: + array[i] = array[i]-1 + if ExtendedGenome[i+(n//2)-1] == symbol: + array[i] = array[i]+1 + return array + + +def PatternCount(Text, Pattern): + count = 0 + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + count = count+1 + return count +#+END_SRC + +After executing the program we see that it's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(n²)/. +In /Escherichia Coli/ we plotted the result of our program: + +#+CAPTION: Symbol array for Cytosine in E. Coli Genome] +[[./Assets/e-coli.png]] + +From that graph, we conclude that ori is located around position 4000000, because that's where the Cytosine concentration is the lowest, +which indicates that the region stays single-stranded for the longest time. **** The Skew Diagram - Usually scientists measure the difference between /G - C/, which is *higher on the lagging strand* and *lower on the leading strand*. +Usually scientists measure the difference between /G - C/, which is *higher on the lagging strand* and *lower on the leading strand*. ***** Exercise: Synthetize a Skew Array - We're going to make a Skew Diagram, for that we'll first need a Skew Array. For that purpose we wrote [[./Code/SkewArray.py][SkewArray]] - We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/: +We're going to make a Skew Diagram, for that we'll first need a Skew Array. For +that purpose we wrote: - #+CAPTION: Symbol array for Cytosine in E. Coli Genome] - [[./Assets/skew_diagram.png]] +#+BEGIN_SRC python +def SkewArray(Genome): + Skew = [] + Skew.append(0) + for i in range(0, len(Genome)): + if Genome[i] == "G": + Skew.append(Skew[i] + 1) + elif Genome[i] == "C": + Skew.append(Skew[i] - 1) + else: + Skew.append(Skew[i]) + return Skew +#+END_SRC - Ori should be located where the skew is at its minimum value. +We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/: + +#+CAPTION: Symbol array for Cytosine in E. Coli Genome] +[[./Assets/skew_diagram.png]] + +Ori should be located where the skew is at its minimum value. ***** Exercise: Efficient algorithm for locating ori - Now that we know more about ori's skew value, we're going to construct a better algorithm to find it. We'll do that in [[./Code/MinimumSkew.py][MinimumSkew]] +Now that we know more about ori's skew value, we're going to construct a better +algorithm to find it: + +#+BEGIN_SRC python +def MinimumSkew(Genome): + positions = [] + skew = SkewArray(Genome) + minimum = min(skew) + return [i for i in range(0, len(Genome)) if skew[i] == minimum] + + +def SkewArray(Genome): + Skew = [] + Skew.append(0) + for i in range(0, len(Genome)): + if Genome[i] == "G": + Skew.append(Skew[i] + 1) + elif Genome[i] == "C": + Skew.append(Skew[i] - 1) + else: + Skew.append(Skew[i]) + return Skew +#+END_SRC + **** Finding /DnaA boxes/ - When we look for /DnaA boxes/ in the minimal skew region, we can't find highly repeated /9-mers/ in /Escherichia Coli/. - But we find approximate sequences that are similar to our /9-mers/ and only differ in 1 nucleotide. - +When we look for /DnaA boxes/ in the minimal skew region, we can't find highly repeated /9-mers/ in /Escherichia Coli/. +But we find approximate sequences that are similar to our /9-mers/ and only differ in 1 nucleotide. + ***** Exercise: Calculate Hamming distance - The Hamming distance is the number of mismatches between 2 strings, we'll solve this problem in [[./Code/HammingDistance][HammingDistance]] +The Hamming distance is the number of mismatches between 2 strings, we'll solve this problem in [[./Code/HammingDistance][HammingDistance]] ***** Exercise: Find approximate patterns - Now that we have our Hamming distance, we have to find the approximate sequences. We'll do this in [[./Code/ApproximatePatternMatching.py][ApproximatePatternMatching.py]] +Now that we have our Hamming distance, we have to find the approximate +sequences: + +#+BEGIN_SRC python +def ApproximatePatternMatching(Text, Pattern, d): + positions = [] + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + positions.append(i) + elif HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d: + positions.append(i) + return positions + + +def HammingDistance(p, q): + count = 0 + for i in range(0, len(p)): + if p[i] != q[i]: + count += 1 + return count +#+END_SRC + ***** Exercise: Count the approximate patterns - The final part is counting the approximate sequences, for that we'll use [[./Code/ApproximatePatternCount.py][ApproximatePatternCount.py]] - - After trying out our ApproximatePatternCount in the hypothesized ori region, we find a frequent /k-mer/ with its reverse complement in /Escherichia Coli/. - We've finally found a computational method to find ori that seems correct. +The final part is counting the approximate sequences: + +#+BEGIN_SRC python +def ApproximatePatternCount(Pattern, Text, d): + count = 0 + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern or HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d: + count += 1 + return count + + +def HammingDistance(p, q): + count = 0 + for i in range(0, len(p)): + if p[i] != q[i]: + count += 1 + return count +#+END_SRC + + +After trying out our ApproximatePatternCount in the hypothesized ori region, we find a frequent /k-mer/ with its reverse complement in /Escherichia Coli/. +We've finally found a computational method to find ori that seems correct. ** Week 3 @@ -123,7 +319,6 @@ Variation in gene expression permits the cell to keep track of time. ***** Exercise: Find the most common nucleotides in each position - We are going to create a *t x k* Motif Matrix, where *t* is the /k-mer/ string. In each position, we'll insert the most frequent nucleotide, in upper case, and the nucleotide in lower case (if there's no popular one). Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters. @@ -493,5 +688,5 @@ characteristic of Greedy Algorithms, they trade optimality for speed. ** Vocabulary - - k-mer: subsquences of length /k/ in a biological sequence - - Frequency map: sequence --> frequency of the sequence +- k-mer: subsquences of length /k/ in a biological sequence +- Frequency map: sequence --> frequency of the sequence