-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
65 lines (56 loc) · 1.86 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# preprocess.py
import argparse
from pathlib import Path
from src.preprocessing.pdf_processor import PDFProcessor
from src.utils.graceful_killer import GracefulKiller
from src.config import Config
def _build_parser():
    """Build the command-line parser for the preprocessing script.

    The defaults of the three directory options are read from ``Config``
    (``RAW_DATA_DIR``, ``PROCESSED_DATA_DIR`` and ``TEMP_DIR``) at the time
    this function is called.
    """
    parser = argparse.ArgumentParser(
        description='Preprocess PDF files into markdown for LSTM training.'
    )
    parser.add_argument(
        '--input-dir',
        type=str,
        default=str(Config.RAW_DATA_DIR),
        help='Directory containing PDF files',
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default=str(Config.PROCESSED_DATA_DIR),
        help='Directory to store processed markdown files',
    )
    parser.add_argument(
        '--temp-dir',
        type=str,
        default=str(Config.TEMP_DIR),
        help='Directory for temporary files',
    )
    return parser


def main(argv=None) -> int:
    """Run the PDF preprocessing command-line tool.

    Sets up the environment and directories through ``Config``, parses the
    command-line options, makes sure the input, output and temporary
    directories exist, and then runs ``PDFProcessor.process_files()``.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        An exit status suitable for ``sys.exit``: ``0`` when processing
        finished, ``1`` when processing raised an exception, and ``130``
        when it was interrupted with Ctrl-C.
    """
    # Config is initialised before the parser is built because the option
    # defaults are read from Config attributes.
    Config.setup_environment()
    Config.setup_directories()

    args = _build_parser().parse_args(argv)

    # Create directories if they don't exist
    for dir_path in (args.input_dir, args.output_dir, args.temp_dir):
        Path(dir_path).mkdir(parents=True, exist_ok=True)

    # Initialize the processor
    processor = PDFProcessor(
        input_dir=args.input_dir,
        output_dir=args.output_dir,
        temp_dir=args.temp_dir,
    )

    print("\nStarting PDF preprocessing...")
    print(f"Input directory: {args.input_dir}")
    print(f"Output directory: {args.output_dir}")
    print(f"Temporary directory: {args.temp_dir}")

    # Process the files
    try:
        processor.process_files()
    except KeyboardInterrupt:
        # NOTE(review): nothing is saved here; presumably PDFProcessor
        # persists its own progress when interrupted - confirm.
        print("\nProcessing interrupted by user. Saving progress...")
        return 130  # conventional status for termination by SIGINT
    except Exception as e:
        # Top-level boundary: report the failure and signal it through the
        # return value instead of letting the traceback escape.
        print(f"\nError during processing: {type(e).__name__}: {e}")
        return 1
    else:
        # Only announce completion when processing actually finished; the
        # message used to live in ``finally`` and was printed after
        # failures and interruptions as well.
        print("\nPreprocessing completed. Check the output directory for results.")
        return 0
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported. Whatever main() returns becomes the process exit
# status (a None return means status 0).
if __name__ == "__main__":
    raise SystemExit(main())