Convert all the content to Literal Programming

2019-12-09 13:35:28 +01:00
parent fe23029144
commit 6b04676fbb
12 changed files with 253 additions and 207 deletions
@@ -1,14 +0,0 @@
-def ApproximatePatternCount(Pattern, Text, d):
-    count = 0
-    for i in range(len(Text)-len(Pattern)+1):
-        if Text[i:i+len(Pattern)] == Pattern or HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d:
-            count += 1
-    return count
-
-
-def HammingDistance(p, q):
-    count = 0
-    for i in range(0, len(p)):
-        if p[i] != q[i]:
-            count += 1
-    return count
@@ -1,16 +0,0 @@
-def ApproximatePatternMatching(Text, Pattern, d):
-    positions = []
-    for i in range(len(Text)-len(Pattern)+1):
-        if Text[i:i+len(Pattern)] == Pattern:
-            positions.append(i)
-        elif HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d:
-            positions.append(i)
-    return positions
-
-
-def HammingDistance(p, q):
-    count = 0
-    for i in range(0, len(p)):
-        if p[i] != q[i]:
-            count += 1
-    return count
@@ -1,20 +0,0 @@
-def FasterSymbolArray(Genome, symbol):
-    array = {}
-    n = len(Genome)
-    ExtendedGenome = Genome + Genome[0:n//2]
-    array[0] = PatternCount(symbol, Genome[0:n//2])
-    for i in range(1, n):
-        array[i] = array[i-1]
-        if ExtendedGenome[i-1] == symbol:
-            array[i] = array[i]-1
-        if ExtendedGenome[i+(n//2)-1] == symbol:
-            array[i] = array[i]+1
-    return array
-
-
-def PatternCount(Text, Pattern):
-    count = 0
-    for i in range(len(Text)-len(Pattern)+1):
-        if Text[i:i+len(Pattern)] == Pattern:
-            count = count+1
-    return count
@@ -1,20 +0,0 @@
-def FrequentWords(Text, k):
-    words = []
-    freq = FrequencyMap(Text, k)
-    m = max(freq.values())
-    for key in freq:
-        if freq[key] == m:
-            words.append(key)
-    return words
-
-
-def FrequencyMap(Text, k):
-    freq = {}
-    n = len(Text)
-    for i in range(n - k + 1):
-        Pattern = Text[i:i + k]
-        freq[Pattern] = 0
-    for i in range(n - k + 1):
-        Pattern = Text[i:i + k]
-        freq[Pattern] += 1
-    return freq
@@ -1,6 +0,0 @@
-def HammingDistance(p, q):
-    count = 0
-    for i in range(0, len(p)):
-        if p[i] != q[i]:
-            count += 1
-    return count
@@ -1,18 +0,0 @@
-def MinimumSkew(Genome):
-    positions = []
-    skew = SkewArray(Genome)
-    minimum = min(skew)
-    return [i for i in range(0, len(Genome)) if skew[i] == minimum]
-
-
-def SkewArray(Genome):
-    Skew = []
-    Skew.append(0)
-    for i in range(0, len(Genome)):
-        if Genome[i] == "G":
-            Skew.append(Skew[i] + 1)
-        elif Genome[i] == "C":
-            Skew.append(Skew[i] - 1)
-        else:
-            Skew.append(Skew[i])
-    return Skew
@@ -1,6 +0,0 @@
-def PatternCount(Text, Pattern):
-    count = 0
-    for i in range(len(Text)-len(Pattern)+1):
-        if Text[i:i+len(Pattern)] == Pattern:
-            count = count+1
-    return count
@@ -1,6 +0,0 @@
-def PatternMatching(Pattern, Genome):
-    positions = []
-    for i in range(len(Genome)-len(Pattern)+1):
-        if Genome[i:i+len(Pattern)] == Pattern:
-            positions.append(i)
-    return positions
@@ -1,17 +0,0 @@
-def ReverseComplement(Pattern):
-    Pattern = Reverse(Pattern)
-    Pattern = Complement(Pattern)
-    return Pattern
-
-
-def Reverse(Pattern):
-    reversed = Pattern[::-1]
-    return reversed
-
-
-def Complement(Pattern):
-    compl = ""
-    complement_letters = {"A": "T", "T": "A", "C": "G", "G": "C"}
-    for char in Pattern:
-        compl += complement_letters[char]
-    return compl
@@ -1,11 +0,0 @@
-def SkewArray(Genome):
-    Skew = []
-    Skew.append(0)
-    for i in range(0, len(Genome)):
-        if Genome[i] == "G":
-            Skew.append(Skew[i] + 1)
-        elif Genome[i] == "C":
-            Skew.append(Skew[i] - 1)
-        else:
-            Skew.append(Skew[i])
-    return Skew
@@ -1,15 +0,0 @@
-def SymbolArray(Genome, symbol):
-    array = {}
-    n = len(Genome)
-    ExtendedGenome = Genome + Genome[0:n//2]
-    for i in range(n):
-        array[i] = PatternCount(ExtendedGenome[i:i+(n//2)], symbol)
-    return array
-
-
-def PatternCount(Text, Pattern):
-    count = 0
-    for i in range(len(Text)-len(Pattern)+1):
-        if Text[i:i+len(Pattern)] == Pattern:
-            count = count+1
-    return count
@@ -6,41 +6,114 @@

 **** Origin of replication (ori)

-     Locating an ori is key for gene therapy (e.g. viral vectors), to introduce a theraupetic gene.
+Locating an ori is key for gene therapy (e.g. viral vectors), to introduce a theraupetic gene.
     
 **** Computational approaches to find ori in Vibrio Cholerae
     
 ***** Exercise: find Pattern
     
-     We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use the function [[./Code/PatternCount.py][PatternCount]] to find out how many times
-     does a sequence appear in the genome.
+We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use this function to find out how many times
+does a sequence appear in the genome:

-     For the second part, we're going to calculate the frequency map of the sequences of length /k/, for that purpose we'll use [[./Code/FrequentWords.py][FrequentWords]] 
+#+BEGIN_SRC python
+def PatternCount(Text, Pattern):
+    count = 0
+    for i in range(len(Text)-len(Pattern)+1):
+        if Text[i:i+len(Pattern)] == Pattern:
+            count = count+1
+    return count
+#+END_SRC
+
+For the second part, we're going to calculate the frequency map of the sequences
+ of length /k/, for that purpose we'll use:
+
+#+BEGIN_SRC python
+def FrequentWords(Text, k):
+    words = []
+    freq = FrequencyMap(Text, k)
+    m = max(freq.values())
+    for key in freq:
+        if freq[key] == m:
+            words.append(key)
+    return words
+
+
+def FrequencyMap(Text, k):
+    freq = {}
+    n = len(Text)
+    for i in range(n - k + 1):
+        Pattern = Text[i:i + k]
+        freq[Pattern] = 0
+    for i in range(n - k + 1):
+        Pattern = Text[i:i + k]
+        freq[Pattern] += 1
+    return freq
+#+END_SRC

 ***** Exercise: Find the reverse complement of a sequence
     
-      We're going to generate the reverse complement of a sequence, which is the complement of a sequence, read in the same direction (5' -> 3').
-      In this case, we're going to use [[./Code/ReverseComplement.py][ReverseComplement]] 
-      After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent /k-mers/ are reverse complements of other frequent ones.
+We're going to generate the reverse complement of a sequence, which is the complement of a sequence, read in the same direction (5' -> 3').
+In this case, we're going to use:
+
+#+BEGIN_SRC python
+def ReverseComplement(Pattern):
+    Pattern = Reverse(Pattern)
+    Pattern = Complement(Pattern)
+    return Pattern
+
+
+def Reverse(Pattern):
+    reversed = Pattern[::-1]
+    return reversed
+
+
+def Complement(Pattern):
+    compl = ""
+    complement_letters = {"A": "T", "T": "A", "C": "G", "G": "C"}
+    for char in Pattern:
+        compl += complement_letters[char]
+    return compl
+#+END_SRC
+
+After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent /k-mers/ are reverse complements of other frequent ones.
     
 ***** Exercise: Find a subsequence within a sequence
      
-      We're going to find the ocurrences of a subsquence inside a sequence, and save the index of the first letter in the sequence.
-      This time, we'll use [[./Code/PatternMatching.py][PatternMatching]] 
-      After using our function on the /Vibrio Cholerae's/ genome, we find out that the /9-mers/ with the highest frequency appear in cluster.
-      This is strong statistical evidence that our subsequences are /DnaA boxes/.
+We're going to find the ocurrences of a subsquence inside a sequence, and save the index of the first letter in the sequence.
+This time, we'll use:
+
+#+BEGIN_SRC python
+def PatternMatching(Pattern, Genome):
+    positions = []
+    for i in range(len(Genome)-len(Pattern)+1):
+        if Genome[i:i+len(Pattern)] == Pattern:
+            positions.append(i)
+    return positions
+#+END_SRC
+
+After using our function on the /Vibrio Cholerae's/ genome, we find out that the /9-mers/ with the highest frequency appear in cluster.
+This is strong statistical evidence that our subsequences are /DnaA boxes/.

      
 **** Computational approaches to find ori in any bacteria
     
-     Now that we're pretty confident about the /DnaA boxes/ sequences that we found, we are going to check if they are a common pattern in the rest of bacterias.
-     We're going to find the ocurrences of the sequences in /Thermotoga petrophila/ using [[./Code/PatternCount.py][PatternCount]]
+Now that we're pretty confident about the /DnaA boxes/ sequences that we found, we are going to check if they are a common pattern in the rest of bacterias.
+We're going to find the ocurrences of the sequences in /Thermotoga petrophila/
+with:

-     After the execution, we observe that there are *no* ocurrences of the sequences found in /Vibrio Cholerae/.
-     We can conclude that different bacterias have different /DnaA boxes/.
+#+BEGIN_SRC python
+def PatternCount(Text, Pattern):
+    count = 0
+    for i in range(len(Text)-len(Pattern)+1):
+        if Text[i:i+len(Pattern)] == Pattern:
+            count = count+1
+    return count
+#+END_SRC

-     We have to try another computational approach then, find clusters of /k-mers/ repeated in a small interval.
+After the execution, we observe that there are *no* ocurrences of the sequences found in /Vibrio Cholerae/.
+We can conclude that different bacterias have different /DnaA boxes/.

+We have to try another computational approach then, find clusters of /k-mers/ repeated in a small interval.

 ** Week 2
   
@@ -48,70 +121,193 @@

 **** Replication process

-     The /DNA polymerases/ start replicating while the parent strands are unraveling. 
-     On the lagging strand, the DNA polymerase waits until the replication fork opens around 2000 nucleotides, and because of that it forms Okazaki fragments.
-     We need 1 primer for the leading strand and 1 primer per Okazaki fragment for the lagging strand.
-     While the Okazaki fragments are being synthetized, a /DNA ligase/ starts joining the fragments together. 
+The /DNA polymerases/ start replicating while the parent strands are unraveling.
+On the lagging strand, the DNA polymerase waits until the replication fork opens around 2000 nucleotides, and because of that it forms Okazaki fragments.
+We need 1 primer for the leading strand and 1 primer per Okazaki fragment for the lagging strand.
+While the Okazaki fragments are being synthetized, a /DNA ligase/ starts joining the fragments together.

 **** Computational approach to find ori using deamination
     
-     As the lagging strand is always waiting for the helicase to go forward, the lagging strand is mostly in single-stranded configuration, which is more prone to mutations.
-     One frequent form of mutation is *deamination*, a process that causes cytosine to convert into thymine. This means that cytosine is more frequent in half of the genome.
+As the lagging strand is always waiting for the helicase to go forward, the lagging strand is mostly in single-stranded configuration, which is more prone to mutations.
+One frequent form of mutation is *deamination*, a process that causes cytosine to convert into thymine. This means that cytosine is more frequent in half of the genome.

 ***** Exercise: count the ocurrences of cytosine
      
-      We're going to count the ocurrences of the bases in a genome and include them in a symbol array, for that purpose we'll use [[./Code/SymbolArray.py][SymbolArray]]
-      After executing the program, we realize that the algorithm is too inefficient.
+We're going to count the ocurrences of the bases in a genome and include them in
+a symbol array, for that purpose we'll use:
+
+#+BEGIN_SRC python
+def SymbolArray(Genome, symbol):
+    array = {}
+    n = len(Genome)
+    ExtendedGenome = Genome + Genome[0:n//2]
+    for i in range(n):
+        array[i] = PatternCount(ExtendedGenome[i:i+(n//2)], symbol)
+    return array
+
+
+def PatternCount(Text, Pattern):
+    count = 0
+    for i in range(len(Text)-len(Pattern)+1):
+        if Text[i:i+len(Pattern)] == Pattern:
+            count = count+1
+    return count
+#+END_SRC
+
+After executing the program, we realize that the algorithm is too inefficient.

 ***** Exercise: find a better algorithm for the previous exercise

-      This time, we are going to evaluate an element /i+1/, using the element /i/. We'll use [[./Code/FasterSymbolArray.py][FasterSymbolArray]] to achieve this
-      After executing the program we see that it's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(n²)/.
-      In /Escherichia Coli/ we plotted the result of our program:
+This time, we are going to evaluate an element /i+1/, using the element /i/.
+We'll use the following algorithm:

-      #+CAPTION: Symbol array for Cytosine in E. Coli Genome]
-      [[./Assets/e-coli.png]]
+#+BEGIN_SRC python
+def FasterSymbolArray(Genome, symbol):
+    array = {}
+    n = len(Genome)
+    ExtendedGenome = Genome + Genome[0:n//2]
+    array[0] = PatternCount(symbol, Genome[0:n//2])
+    for i in range(1, n):
+        array[i] = array[i-1]
+        if ExtendedGenome[i-1] == symbol:
+            array[i] = array[i]-1
+        if ExtendedGenome[i+(n//2)-1] == symbol:
+            array[i] = array[i]+1
+    return array

-      From that graph, we conclude that ori is located around position 4000000, because that's where the Cytosine concentration is the lowest,
-      which indicates that the region stays single-stranded for the longest time.
+
+def PatternCount(Text, Pattern):
+    count = 0
+    for i in range(len(Text)-len(Pattern)+1):
+        if Text[i:i+len(Pattern)] == Pattern:
+            count = count+1
+    return count
+#+END_SRC
+
+After executing the program we see that it's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(n²)/.
+In /Escherichia Coli/ we plotted the result of our program:
+
+#+CAPTION: Symbol array for Cytosine in E. Coli Genome]
+[[./Assets/e-coli.png]]
+
+From that graph, we conclude that ori is located around position 4000000, because that's where the Cytosine concentration is the lowest,
+which indicates that the region stays single-stranded for the longest time.
      
 **** The Skew Diagram

-     Usually scientists measure the difference between /G - C/, which is *higher on the lagging strand* and *lower on the leading strand*.
+Usually scientists measure the difference between /G - C/, which is *higher on the lagging strand* and *lower on the leading strand*.
     
 ***** Exercise: Synthetize a Skew Array
      
-      We're going to make a Skew Diagram, for that we'll first need a Skew Array. For that purpose we wrote [[./Code/SkewArray.py][SkewArray]] 
-      We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/:
+We're going to make a Skew Diagram, for that we'll first need a Skew Array. For
+that purpose we wrote:

-      #+CAPTION: Symbol array for Cytosine in E. Coli Genome]
-      [[./Assets/skew_diagram.png]]
+#+BEGIN_SRC python
+def SkewArray(Genome):
+    Skew = []
+    Skew.append(0)
+    for i in range(0, len(Genome)):
+        if Genome[i] == "G":
+            Skew.append(Skew[i] + 1)
+        elif Genome[i] == "C":
+            Skew.append(Skew[i] - 1)
+        else:
+            Skew.append(Skew[i])
+    return Skew
+#+END_SRC

-     Ori should be located where the skew is at its minimum value.
+We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/:
+
+#+CAPTION: Symbol array for Cytosine in E. Coli Genome]
+[[./Assets/skew_diagram.png]]
+
+Ori should be located where the skew is at its minimum value.

 ***** Exercise: Efficient algorithm for locating ori
      
-      Now that we know more about ori's skew value, we're going to construct a better algorithm to find it. We'll do that in [[./Code/MinimumSkew.py][MinimumSkew]]
+Now that we know more about ori's skew value, we're going to construct a better
+algorithm to find it:
+
+#+BEGIN_SRC python
+def MinimumSkew(Genome):
+    positions = []
+    skew = SkewArray(Genome)
+    minimum = min(skew)
+    return [i for i in range(0, len(Genome)) if skew[i] == minimum]
+
+
+def SkewArray(Genome):
+    Skew = []
+    Skew.append(0)
+    for i in range(0, len(Genome)):
+        if Genome[i] == "G":
+            Skew.append(Skew[i] + 1)
+        elif Genome[i] == "C":
+            Skew.append(Skew[i] - 1)
+        else:
+            Skew.append(Skew[i])
+    return Skew
+#+END_SRC
+

 **** Finding /DnaA boxes/
     
-     When we look for /DnaA boxes/ in the minimal skew region, we can't find highly repeated /9-mers/ in /Escherichia Coli/.
-     But we find approximate sequences that are similar to our /9-mers/ and only differ in 1 nucleotide.
+When we look for /DnaA boxes/ in the minimal skew region, we can't find highly repeated /9-mers/ in /Escherichia Coli/.
+But we find approximate sequences that are similar to our /9-mers/ and only differ in 1 nucleotide.

 ***** Exercise: Calculate Hamming distance
      
-      The Hamming distance is the number of mismatches between 2 strings, we'll solve this problem in [[./Code/HammingDistance][HammingDistance]]
+The Hamming distance is the number of mismatches between 2 strings, we'll solve this problem in [[./Code/HammingDistance][HammingDistance]]

 ***** Exercise: Find approximate patterns
      
-      Now that we have our Hamming distance, we have to find the approximate sequences. We'll do this in [[./Code/ApproximatePatternMatching.py][ApproximatePatternMatching.py]]
+Now that we have our Hamming distance, we have to find the approximate
+sequences:
+
+#+BEGIN_SRC python
+def ApproximatePatternMatching(Text, Pattern, d):
+    positions = []
+    for i in range(len(Text)-len(Pattern)+1):
+        if Text[i:i+len(Pattern)] == Pattern:
+            positions.append(i)
+        elif HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d:
+            positions.append(i)
+    return positions
+
+
+def HammingDistance(p, q):
+    count = 0
+    for i in range(0, len(p)):
+        if p[i] != q[i]:
+            count += 1
+    return count
+#+END_SRC
+

 ***** Exercise: Count the approximate patterns

-      The final part is counting the approximate sequences, for that we'll use [[./Code/ApproximatePatternCount.py][ApproximatePatternCount.py]]
+The final part is counting the approximate sequences:

-     After trying out our ApproximatePatternCount in the hypothesized ori region, we find a frequent /k-mer/ with its reverse complement in /Escherichia Coli/.
-     We've finally found a computational method to find ori that seems correct.
+#+BEGIN_SRC python
+def ApproximatePatternCount(Pattern, Text, d):
+    count = 0
+    for i in range(len(Text)-len(Pattern)+1):
+        if Text[i:i+len(Pattern)] == Pattern or HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d:
+            count += 1
+    return count
+
+
+def HammingDistance(p, q):
+    count = 0
+    for i in range(0, len(p)):
+        if p[i] != q[i]:
+            count += 1
+    return count
+#+END_SRC
+
+
+After trying out our ApproximatePatternCount in the hypothesized ori region, we find a frequent /k-mer/ with its reverse complement in /Escherichia Coli/.
+We've finally found a computational method to find ori that seems correct.

 ** Week 3

@@ -123,7 +319,6 @@ Variation in gene expression permits the cell to keep track of time.

 ***** Exercise: Find the most common nucleotides in each position

-
 We are going to create a *t x k* Motif Matrix, where *t* is the /k-mer/ string. In each position, we'll insert the most frequent nucleotide, in upper case,
 and the nucleotide in lower case (if there's no popular one).
 Our goal is to select the *most* conserved Matrix, i.e. the Matrix with the most upper case letters.
@@ -493,5 +688,5 @@ characteristic of Greedy Algorithms, they trade optimality for speed.


 ** Vocabulary
-   - k-mer: subsquences of length /k/ in a biological sequence
-   - Frequency map: sequence --> frequency of the sequence
+- k-mer: subsquences of length /k/ in a biological sequence
+- Frequency map: sequence --> frequency of the sequence