diff --git a/Assets/e-coli.png b/Assets/e-coli.png new file mode 100644 index 0000000..80fdf29 Binary files /dev/null and b/Assets/e-coli.png differ diff --git a/Assets/e-coli_symbol_array_c.png b/Assets/e-coli_symbol_array_c.png new file mode 100644 index 0000000..291a0a8 Binary files /dev/null and b/Assets/e-coli_symbol_array_c.png differ diff --git a/Assets/skew_diagram.png b/Assets/skew_diagram.png new file mode 100644 index 0000000..c49b8c7 Binary files /dev/null and b/Assets/skew_diagram.png differ diff --git a/Code/ApproximatePatternCount.py b/Code/ApproximatePatternCount.py new file mode 100644 index 0000000..49feca0 --- /dev/null +++ b/Code/ApproximatePatternCount.py @@ -0,0 +1,16 @@ +def ApproximatePatternCount(Pattern, Text, d): + count = 0 + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + count += 1 + elif HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d: + count += 1 + return count + + +def HammingDistance(p, q): + count = 0 + for i in range(0, len(p)): + if p[i] != q[i]: + count += 1 + return count diff --git a/Code/ApproximatePatternMatching.py b/Code/ApproximatePatternMatching.py new file mode 100644 index 0000000..2edb1c0 --- /dev/null +++ b/Code/ApproximatePatternMatching.py @@ -0,0 +1,16 @@ +def ApproximatePatternMatching(Text, Pattern, d): + positions = [] + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + positions.append(i) + elif HammingDistance(Text[i:i+len(Pattern)], Pattern) <= d: + positions.append(i) + return positions + + +def HammingDistance(p, q): + count = 0 + for i in range(0, len(p)): + if p[i] != q[i]: + count += 1 + return count diff --git a/Code/FasterSymbolArray.py b/Code/FasterSymbolArray.py new file mode 100644 index 0000000..ef1068c --- /dev/null +++ b/Code/FasterSymbolArray.py @@ -0,0 +1,21 @@ +def FasterSymbolArray(Genome, symbol): + array = {} + n = len(Genome) + ExtendedGenome = Genome + Genome[0:n//2] + # PatternCount takes (Text, Pattern): count symbol in the first n//2 bases + array[0] = PatternCount(Genome[0:n//2], 
symbol) + for i in range(1, n): + array[i] = array[i-1] + if ExtendedGenome[i-1] == symbol: + array[i] = array[i]-1 + if ExtendedGenome[i+(n//2)-1] == symbol: + array[i] = array[i]+1 + return array + + +def PatternCount(Text, Pattern): + count = 0 + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + count = count+1 + return count diff --git a/Code/HammingDistance.py b/Code/HammingDistance.py new file mode 100644 index 0000000..9a7cfe5 --- /dev/null +++ b/Code/HammingDistance.py @@ -0,0 +1,6 @@ +def HammingDistance(p, q): + count = 0 + for i in range(0, len(p)): + if p[i] != q[i]: + count += 1 + return count diff --git a/Code/MinimumSkew.py b/Code/MinimumSkew.py new file mode 100644 index 0000000..924f7eb --- /dev/null +++ b/Code/MinimumSkew.py @@ -0,0 +1,22 @@ +def MinimumSkew(Genome): + positions = [] + skew = SkewArray(Genome) + minimum = min(skew) + # skew holds one more entry than Genome, so scan every index of skew + for i in range(0, len(skew)): + if skew[i] == minimum: + positions.append(i) + return positions + + +def SkewArray(Genome): + Skew = [] + Skew.append(0) + for i in range(0, len(Genome)): + if Genome[i] == "G": + Skew.append(Skew[i] + 1) + elif Genome[i] == "C": + Skew.append(Skew[i] - 1) + else: + Skew.append(Skew[i]) + return Skew diff --git a/Code/Replication.py b/Code/PatternCount.py similarity index 100% rename from Code/Replication.py rename to Code/PatternCount.py diff --git a/Code/SkewArray.py b/Code/SkewArray.py new file mode 100644 index 0000000..2fee7cc --- /dev/null +++ b/Code/SkewArray.py @@ -0,0 +1,11 @@ +def SkewArray(Genome): + Skew = [] + Skew.append(0) + for i in range(0, len(Genome)): + if Genome[i] == "G": + Skew.append(Skew[i] + 1) + elif Genome[i] == "C": + Skew.append(Skew[i] - 1) + else: + Skew.append(Skew[i]) + return Skew diff --git a/Code/SymbolArray.py b/Code/SymbolArray.py new file mode 100644 index 0000000..bcb6387 --- /dev/null +++ b/Code/SymbolArray.py @@ -0,0 +1,15 @@ +def SymbolArray(Genome, symbol): + array = {} + n = len(Genome) + 
ExtendedGenome = Genome + Genome[0:n//2] + for i in range(n): + array[i] = PatternCount(ExtendedGenome[i:i+(n//2)], symbol) + return array + + +def PatternCount(Text, Pattern): + count = 0 + for i in range(len(Text)-len(Pattern)+1): + if Text[i:i+len(Pattern)] == Pattern: + count = count+1 + return count diff --git a/Notebook.org b/Notebook.org index 01619f2..136ae09 100644 --- a/Notebook.org +++ b/Notebook.org @@ -12,7 +12,7 @@ ***** Exercise: find Pattern - We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use the function [[./Code/Replication.py][Replication]] to find out how many times + We'll look for the *DnaA box* sequence, using a sliding window, in that case we will use the function [[./Code/PatternCount.py][PatternCount]] to find out how many times does a sequence appear in the genome. For the second part, we're going to calculate the frequency map of the sequences of length /k/, for that purpose we'll use [[./Code/FrequentWords.py][FrequentWords]] @@ -21,7 +21,7 @@ We're going to generate the reverse complement of a sequence, which is the complement of a sequence, read in the same direction (5' -> 3'). In this case, we're going to use [[./Code/ReverseComplement.py][ReverseComplement]] - After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent k-mers are reverse complements of other frequent ones. + After using our function on the /Vibrio Cholerae's/ genome, we realize that some of the frequent /k-mers/ are reverse complements of other frequent ones. ***** Exercise: Find a subsequence within a sequence @@ -34,13 +34,93 @@ **** Computational approaches to find ori in any bacteria Now that we're pretty confident about the /DnaA boxes/ sequences that we found, we are going to check if they are a common pattern in the rest of bacterias. 
- We're going to find the ocurrences of the sequences in /Thermotoga petrophila/ using [[./Code/Replication.py][Replication]] + We're going to find the occurrences of the sequences in /Thermotoga petrophila/ using [[./Code/PatternCount.py][PatternCount]] After the execution, we observe that there are *no* ocurrences of the sequences found in /Vibrio Cholerae/. We can conclude that different bacterias have different /DnaA boxes/. We have to try another computational approach then, find clusters of /k-mers/ repeated in a small interval. -*** Vocabulary - - k-mer: subsquences of length /k/ in a biological sequence - - Frequency map: sequence --> frequency of the sequence + +** Week 2 + +*** DNA replication (II) + +**** Replication process + + The /DNA polymerases/ start replicating while the parent strands are unraveling. + On the lagging strand, the DNA polymerase waits until the replication fork opens around 2000 nucleotides, and because of that it forms Okazaki fragments. + We need 1 primer for the leading strand and 1 primer per Okazaki fragment for the lagging strand. + While the Okazaki fragments are being synthesized, a /DNA ligase/ starts joining the fragments together. + +**** Computational approach to find ori using deamination + + As the lagging strand is always waiting for the helicase to go forward, the lagging strand is mostly in single-stranded configuration, which is more prone to mutations. + One frequent form of mutation is *deamination*, a process that causes cytosine to convert into thymine. This means that cytosine is more frequent in one half of the genome than in the other. + +***** Exercise: count the occurrences of cytosine + + We're going to count the occurrences of the bases in a genome and include them in a symbol array; for that purpose we'll use [[./Code/SymbolArray.py][SymbolArray]] + After executing the program, we realize that the algorithm is too inefficient. 
+ +***** Exercise: find a better algorithm for the previous exercise + + This time, we are going to evaluate an element /i+1/, using the element /i/. We'll use [[./Code/FasterSymbolArray.py][FasterSymbolArray]] to achieve this. + After executing the program we see that it's a viable algorithm, with a complexity of /O(n)/ instead of the previous /O(n²)/. + In /Escherichia Coli/ we plotted the result of our program: + + #+CAPTION: Symbol array for Cytosine in E. Coli Genome + [[./Assets/e-coli.png]] + + From that graph, we conclude that ori is located around position 4000000, because that's where the Cytosine concentration is the lowest, + which indicates that the region stays single-stranded for the longest time. + +**** The Skew Diagram + + Usually scientists measure the difference /G - C/, which is *higher on the lagging strand* and *lower on the leading strand*. + +***** Exercise: Synthesize a Skew Array + + We're going to make a Skew Diagram, for that we'll first need a Skew Array. For that purpose we wrote [[./Code/SkewArray.py][SkewArray]] + We can see the utility of a Skew Diagram looking at the one from /Escherichia Coli/: + + #+CAPTION: Skew diagram of the E. Coli Genome + [[./Assets/skew_diagram.png]] + + Ori should be located where the skew is at its minimum value. + +***** Exercise: Efficient algorithm for locating ori + + Now that we know more about ori's skew value, we're going to construct a better algorithm to find it. We'll do that in [[./Code/MinimumSkew.py][MinimumSkew]] + +**** Finding /DnaA boxes/ + + When we look for /DnaA boxes/ in the minimal skew region, we can't find highly repeated /9-mers/ in /Escherichia Coli/. + But we find approximate sequences that are similar to our /9-mers/ and only differ in 1 nucleotide. 
+ +***** Exercise: Calculate Hamming distance + + The Hamming distance is the number of mismatches between 2 strings; we'll solve this problem in [[./Code/HammingDistance.py][HammingDistance]] + +***** Exercise: Find approximate patterns + + Now that we have our Hamming distance, we have to find the approximate sequences. We'll do this in [[./Code/ApproximatePatternMatching.py][ApproximatePatternMatching]] + +***** Exercise: Count the approximate patterns + + The final part is counting the approximate sequences; for that we'll use [[./Code/ApproximatePatternCount.py][ApproximatePatternCount]] + + After trying out our ApproximatePatternCount in the hypothesized ori region, we find a frequent /k-mer/ with its reverse complement in /Escherichia Coli/. + We've finally found a computational method to find ori that seems correct. + +** Week 3 + +*** The circadian clock + + Variation in gene expression permits the cell to keep track of time. + +**** Computational approaches to find regulatory motifs + +** Vocabulary + - k-mer: subsequence of length /k/ in a biological sequence + - Frequency map: sequence --> frequency of the sequence