Compare commits
3 Commits
e826d6f92b
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
93a403182b
|
|||
|
7ed975c7ce
|
|||
|
fa23b1a950
|
29
parser.py
29
parser.py
@@ -6,12 +6,18 @@ from re import sub
|
|||||||
from pandas import DataFrame, read_html, Series
|
from pandas import DataFrame, read_html, Series
|
||||||
|
|
||||||
|
|
||||||
def find_html_files(path) -> List:
|
def remove_parenthesis(identifier):
|
||||||
file_list = glob(path + "/*fastqc.html")
|
"""
|
||||||
return file_list
|
Remove parentheses from the sequence identifier
|
||||||
|
"""
|
||||||
|
sequence_str = "".join(map(str, identifier))
|
||||||
|
return sub(r"[()]", "", sequence_str)
|
||||||
|
|
||||||
|
|
||||||
def extract_adapters(files) -> Tuple[Series, List]:
|
def extract_adapters(files) -> Tuple[Series, List]:
|
||||||
|
"""
|
||||||
|
Extract the adapter sequences and statistics from the files
|
||||||
|
"""
|
||||||
all_adapters = DataFrame()
|
all_adapters = DataFrame()
|
||||||
for entry in files:
|
for entry in files:
|
||||||
tables = read_html(entry)
|
tables = read_html(entry)
|
||||||
@@ -27,6 +33,9 @@ def extract_adapters(files) -> Tuple[Series, List]:
|
|||||||
|
|
||||||
|
|
||||||
def preprocess_dataframe(adapters) -> Series:
|
def preprocess_dataframe(adapters) -> Series:
|
||||||
|
"""
|
||||||
|
Remove empty sequences and duplicates
|
||||||
|
"""
|
||||||
na_free_adapters = adapters.dropna(axis=1)
|
na_free_adapters = adapters.dropna(axis=1)
|
||||||
stacked_adapters = na_free_adapters.stack()
|
stacked_adapters = na_free_adapters.stack()
|
||||||
duplicate_free_adapters = stacked_adapters.drop_duplicates()
|
duplicate_free_adapters = stacked_adapters.drop_duplicates()
|
||||||
@@ -34,15 +43,20 @@ def preprocess_dataframe(adapters) -> Series:
|
|||||||
|
|
||||||
|
|
||||||
def save_to_file(filename, adapters) -> None:
|
def save_to_file(filename, adapters) -> None:
|
||||||
|
"""
|
||||||
|
Save the adapter sequences as a FASTA file
|
||||||
|
"""
|
||||||
with open(filename, "w") as f:
|
with open(filename, "w") as f:
|
||||||
for index, value in adapters.iteritems():
|
for index, value in adapters.iteritems():
|
||||||
sequence_str = "".join(map(str, index))
|
sequence_id = remove_parenthesis(index)
|
||||||
sequence_id = sub(r"[()]", "", sequence_str)
|
|
||||||
fasta_entry = f">{sequence_id}\n{value}\n"
|
fasta_entry = f">{sequence_id}\n{value}\n"
|
||||||
f.write(fasta_entry)
|
f.write(fasta_entry)
|
||||||
|
|
||||||
|
|
||||||
def parse_arguments():
|
def parse_arguments():
|
||||||
|
"""
|
||||||
|
Parse the command-line arguments
|
||||||
|
"""
|
||||||
parser = ArgumentParser()
|
parser = ArgumentParser()
|
||||||
parser.add_argument("input", help="directory containing the fastqc reports")
|
parser.add_argument("input", help="directory containing the fastqc reports")
|
||||||
parser.add_argument("output", help="file where to export the sequences")
|
parser.add_argument("output", help="file where to export the sequences")
|
||||||
@@ -50,8 +64,11 @@ def parse_arguments():
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
"""
|
||||||
|
Extract the adapters from FASTQC reports to a FASTA file and show sequence length statistics
|
||||||
|
"""
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
file_list = find_html_files(args.input)
|
file_list = glob(args.input + "/*fastqc.html")
|
||||||
adapters, stats = extract_adapters(file_list)
|
adapters, stats = extract_adapters(file_list)
|
||||||
save_to_file(args.output, adapters)
|
save_to_file(args.output, adapters)
|
||||||
print(
|
print(
|
||||||
|
|||||||
Reference in New Issue
Block a user