From a987e662b58b6db70c81d06c3a75eee34683fbba Mon Sep 17 00:00:00 2001
From: coolneng <akasroua@gmail.com>
Date: Sat, 13 Nov 2021 14:21:24 +0100
Subject: [PATCH] Implement HTML parsing and output to a file

---
 shell.nix     |  2 +-
 src/parser.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 src/parser.py

diff --git a/shell.nix b/shell.nix
index a3f2639..ab1fe93 100644
--- a/shell.nix
+++ b/shell.nix
@@ -2,4 +2,4 @@
 
 with pkgs;
 
-mkShell { buildInputs = [ python39 python39Packages.beautifulsoup4 ]; }
+mkShell { buildInputs = [ python39 python39Packages.pandas ]; }
diff --git a/src/parser.py b/src/parser.py
new file mode 100644
index 0000000..719e41e
--- /dev/null
+++ b/src/parser.py
@@ -0,0 +1,32 @@
+from glob import glob
+from typing import List
+
+from pandas import DataFrame, read_html
+
+
+def find_html_files(path) -> List:
+    # Non-recursive: only names ending in "fastqc.html" directly in `path`.
+    return glob(path + "/*fastqc.html")
+
+
+def extract_adapters(files) -> DataFrame:
+    # Build the frame in one step, one row per report, rather than growing
+    # it with DataFrame.append (repeated copying; removed in pandas 2.0).
+    # Each row is the Sequence column of the report's second HTML table.
+    rows = [read_html(entry)[1].Sequence for entry in files]
+    all_adapters = DataFrame(rows)
+    return all_adapters.dropna()
+
+
+def save_to_file(filename, adapters) -> None:  # newline-separated values
+    adapters.to_csv(filename, sep="\n", header=False, index=False)
+
+
+def main():
+    # Read the reports found under "data" and write their adapters out.
+    adapters = extract_adapters(find_html_files("data"))
+    save_to_file("placeholder.txt", adapters)
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    main()