Compare commits

...

3 Commits

Author SHA1 Message Date
93a403182b Document the functions 2021-12-27 17:33:05 +01:00
7ed975c7ce Refactor regex parenthesis removal into a function 2021-12-22 18:36:32 +01:00
fa23b1a950 Remove find_html_files function 2021-12-22 18:36:13 +01:00

View File

@@ -6,12 +6,18 @@ from re import sub
from pandas import DataFrame, read_html, Series
def find_html_files(path) -> List:
    """
    List the files in ``path`` whose names end with "fastqc.html"

    The pattern is built by string concatenation, so ``path`` must be a str.
    Returns whatever ``glob`` yields for that pattern (an empty list when
    nothing matches).
    """
    pattern = path + "/*fastqc.html"
    return glob(pattern)
def remove_parenthesis(identifier):
    """
    Strip every "(" and ")" from the sequence identifier

    The items of ``identifier`` are converted to strings and concatenated
    without a separator before the parentheses are dropped.
    """
    joined = "".join(str(part) for part in identifier)
    return joined.replace("(", "").replace(")", "")
def extract_adapters(files) -> Tuple[Series, List]:
"""
Extract the adapter sequences and statistics from the files
"""
all_adapters = DataFrame()
for entry in files:
tables = read_html(entry)
@@ -27,6 +33,9 @@ def extract_adapters(files) -> Tuple[Series, List]:
def preprocess_dataframe(adapters) -> Series:
"""
Remove empty sequences and duplicates
"""
na_free_adapters = adapters.dropna(axis=1)
stacked_adapters = na_free_adapters.stack()
duplicate_free_adapters = stacked_adapters.drop_duplicates()
@@ -34,15 +43,20 @@ def preprocess_dataframe(adapters) -> Series:
def save_to_file(filename, adapters) -> None:
    """
    Save the adapter sequences as a FASTA file

    Every (index, value) pair of ``adapters`` becomes a two-line record: a
    ">" header built from the index with its parentheses removed, followed
    by the value on the next line.

    :param filename: path of the output file; it is opened in "w" mode, so
        any existing content is overwritten
    :param adapters: pandas Series (or any object exposing ``items()``)
        mapping sequence identifiers to sequences
    """
    with open(filename, "w") as f:
        # Series.iteritems() was removed in pandas 2.0; items() yields the
        # same (index, value) pairs and exists in older versions as well.
        for index, value in adapters.items():
            # The header is derived once through the shared helper instead
            # of being computed inline and then overwritten.
            sequence_id = remove_parenthesis(index)
            f.write(f">{sequence_id}\n{value}\n")
def parse_arguments():
"""
Parse the command-line arguments
"""
parser = ArgumentParser()
parser.add_argument("input", help="directory containing the fastqc reports")
parser.add_argument("output", help="file where to export the sequences")
@@ -50,8 +64,11 @@ def parse_arguments():
def main():
"""
Extract the adapters from FASTQC reports to a FASTA file and show sequence length statistics
"""
args = parse_arguments()
file_list = find_html_files(args.input)
file_list = glob(args.input + "/*fastqc.html")
adapters, stats = extract_adapters(file_list)
save_to_file(args.output, adapters)
print(