Update documentation about data splits

This commit is contained in:
2021-06-06 00:13:37 +02:00
parent 8870da8543
commit 168a68b50d

View File

@@ -54,12 +54,13 @@ def read_fastq(data_file, label_file) -> List[bytes]:
return examples
def create_dataset(data_file, label_file) -> None:
def create_dataset(
data_file, label_file, train_eval_test_split=[0.8, 0.1, 0.1]
) -> None:
"""
Create a training and test dataset with a 70/30 split respectively
Create a training, evaluation and test dataset with an 80/10/10 split respectively
"""
data = read_fastq(data_file, label_file)
train_eval_test_split = [0.8, 0.1, 0.1]
with TFRecordWriter(TRAIN_DATASET) as training, TFRecordWriter(
TEST_DATASET
) as test, TFRecordWriter(EVAL_DATASET) as evaluation:
@@ -101,6 +102,9 @@ def read_dataset(filepath) -> TFRecordDataset:
def dataset_creation(
data_file, label_file
) -> Tuple[TFRecordDataset, TFRecordDataset, TFRecordDataset]:
"""
Generate the TFRecord files and split them into training, evaluation and test data
"""
create_dataset(data_file, label_file)
train_data = read_dataset(TRAIN_DATASET)
eval_data = read_dataset(EVAL_DATASET)