"""Convert a FASTQ file into train/test TFRecord files.

Each FASTQ record becomes one serialized ``tf.train.Example`` holding the raw
sequence plus one float per nucleotide taken from the motif weight matrix
built for that record.
"""

from Bio.motifs import create
from Bio.SeqIO import parse
from numpy.random import random
from tensorflow.io import TFRecordWriter
from tensorflow.train import BytesList, Example, Feature, Features, FloatList


def generate_example(sequence, weight_matrix):
    """Serialize one sequence and its per-nucleotide matrix values.

    Args:
        sequence: The sequence as a ``str``; it is stored encoded as bytes
            under the ``"sequence"`` key.
        weight_matrix: A mapping indexed first by nucleotide letter
            (``"A"``, ``"C"``, ``"G"``, ``"T"``) and then by position.
            Only position ``0`` of each row is stored.

    Returns:
        The ``tf.train.Example`` serialized to a byte string.
    """
    # NOTE(review): the keys say "counts" but the caller in this file passes
    # ``motif.pwm`` -- confirm which quantity downstream readers expect.
    schema = {
        "sequence": Feature(bytes_list=BytesList(value=[sequence.encode()])),
    }
    for nucleotide in "ACGT":
        schema[nucleotide + "_counts"] = Feature(
            float_list=FloatList(value=[weight_matrix[nucleotide][0]])
        )
    return Example(features=Features(feature=schema)).SerializeToString()


def parse_data(filepath):
    """Read a FASTQ file and return one serialized example per record.

    Args:
        filepath: Path of the FASTQ file to read.

    Returns:
        A list of serialized ``tf.train.Example`` byte strings, in file order.
    """
    # NOTE(review): ``create`` is handed a single sequence rather than a list
    # of instances -- confirm this yields the intended matrix.
    with open(filepath) as handle:
        return [
            generate_example(
                sequence=str(record.seq),
                weight_matrix=create(record.seq).pwm,
            )
            for record in parse(handle, "fastq")
        ]


def create_dataset(
    filepath,
    train_test_split=0.7,
    train_path="data/train_data.tfrecords",
    test_path="data/test_data.tfrecords",
):
    """Randomly split the records of a FASTQ file into two TFRecord files.

    Every record is assigned independently: it goes to the training file when
    a fresh ``numpy.random.random()`` draw is below ``train_test_split``, and
    to the test file otherwise. The split is therefore approximate, and it is
    only reproducible if the caller seeds NumPy's global generator first.

    Args:
        filepath: Path of the FASTQ file to read.
        train_test_split: Probability, in ``[0, 1]``, that a record is
            written to the training file.
        train_path: Destination of the training TFRecord file.
        test_path: Destination of the test TFRecord file.

    Raises:
        ValueError: If ``train_test_split`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_test_split <= 1.0:
        raise ValueError(
            "train_test_split must be between 0 and 1, got %r" % (train_test_split,)
        )

    # The input is parsed completely before either output file is opened, so
    # a malformed FASTQ file does not leave partial TFRecord files behind.
    data = parse_data(filepath)

    with TFRecordWriter(train_path) as train_writer, TFRecordWriter(
        test_path
    ) as test_writer:
        for element in data:
            writer = train_writer if random() < train_test_split else test_writer
            writer.write(element)


# NOTE(review): this runs on import as well as on direct execution; it is left
# unguarded to preserve the module's existing behaviour. Consider moving it
# under ``if __name__ == "__main__":`` once no importer relies on it.
create_dataset("data/curesim-HVR.fastq")