-
Notifications
You must be signed in to change notification settings - Fork 41
/
Copy pathtextsplit.py
88 lines (71 loc) · 2.43 KB
/
textsplit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Splits folders with songs into training and validation subsets.
"""
import argparse
import json
from pathlib import Path
import random
import pandas as pd
def main():
    """Split every artist folder into training and validation subsets.

    Each sub-directory of ``args.input`` is treated as one artist. Its
    ``.txt`` files are shuffled and divided according to ``args.train_size``,
    then copied to ``<output>/train/<artist>`` and ``<output>/valid/<artist>``.
    The lines of the folder's ``songs.csv`` (``<id>,<song>`` per line) are
    gathered into ``<output>/songs.json``; every record carries a ``valid``
    flag that is true when the id is not the stem of a training file.

    Raises:
        FileNotFoundError: if an artist folder has no ``songs.csv``.
        ValueError: if a ``songs.csv`` line does not start with an integer id.
    """
    args = parse_args()
    args.output.mkdir(parents=True, exist_ok=True)

    meta = []
    # iterdir() yields entries in arbitrary order; sorting makes the split
    # reproducible for a given random state.
    for subfolder in sorted(args.input.iterdir()):
        if not subfolder.is_dir():
            # Stray files next to the artist folders are not categories.
            continue

        print(f'Splitting folder for artist \'{subfolder.stem}\'')
        files = sorted(
            file for file in subfolder.iterdir() if file.suffix == '.txt')
        n_train = int(len(files) * args.train_size)
        random.shuffle(files)
        train_files, valid_files = files[:n_train], files[n_train:]
        print(f'Training: {len(train_files)}, validation: {len(valid_files)}')
        train_ids = {file.stem for file in train_files}
        split = [
            ('train', train_files),
            ('valid', valid_files)]

        with (subfolder/'songs.csv').open() as csv_file:
            for line in csv_file:
                if not line.strip():
                    continue  # tolerate blank lines in the listing
                index, _, song = line.partition(',')
                # int() ignores surrounding whitespace, so the membership
                # test below must ignore it as well.
                index = index.strip()
                meta.append({
                    'id': int(index),
                    'artist': subfolder.stem,
                    'song': song.strip(),
                    'valid': index not in train_ids
                })

        for sub, subset in split:
            new_dir = args.output/sub/subfolder.stem
            new_dir.mkdir(parents=True, exist_ok=True)
            for old_file in subset:
                new_file = new_dir/old_file.name
                # read_text/write_text close their handles, unlike a bare
                # open() whose result is never closed.
                new_file.write_text(old_file.read_text())

    # The records are plain ints, strings and booleans, so they can be
    # serialised directly without a DataFrame round trip.
    with (args.output/'songs.json').open('w') as file:
        json.dump(meta, file, indent=2)

    print('Files copied into folder ', args.output)
def parse_args():
    """Parse command-line arguments and seed the random generator.

    Returns:
        argparse.Namespace: ``input`` and ``output`` as ``Path`` objects,
        ``train_size`` as a float and ``random_state`` as an int.

    Side effects:
        Seeds the ``random`` module with ``random_state``.

    Raises:
        SystemExit: raised by argparse on invalid arguments, including a
            ``--train-size`` outside the range [0, 1].
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i', '--input',
        required=True, type=Path,
        help='path to folders with labelled texts'
    )
    parser.add_argument(
        '-o', '--output',
        default=Path.home(), type=Path,
        help='path to save separated files'
    )
    parser.add_argument(
        '-ts', '--train-size',
        default=0.8, type=float,
        help='amount of texts (per category) to keep for training'
    )
    parser.add_argument(
        '-rs', '--random-state',
        default=1, type=int,
        help='random state to use when taking training subset'
    )
    args = parser.parse_args()

    # A fraction outside [0, 1] would silently produce a wrong slice
    # (e.g. a negative value drops files from the end), so reject it early.
    if not 0.0 <= args.train_size <= 1.0:
        parser.error(
            f'--train-size must be between 0 and 1, got {args.train_size}')

    random.seed(args.random_state)
    return args
# Run the split only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()