-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathantiquarian.py
63 lines (40 loc) · 1.47 KB
/
antiquarian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import sys
import time
from dotenv import load_dotenv
from utils.cost import estimate_costs, get_costs_gpt4o, get_n_tokens
from utils.utils import ask_permission
from utils.ai import query_gpt, get_prompt
# Path of the OCR transcription file that main() reads.
INPUT = "output/1717274042.txt"
# Path main() writes the cleaned text to; the file name embeds the Unix
# timestamp evaluated when this module is loaded.
OUTPUT = f"output/{int(time.time())}_ai.txt"
# Model identifier handed to the token-count, cost-estimate and query helpers.
AI_MODEL = "gpt-4o"
# Prompt template passed to get_prompt() together with the text to clean.
# NOTE(review): {snippet} is presumably the placeholder get_prompt() fills with
# that text -- confirm in utils.ai.
PROMPT = """This text was transcribed by OCR. Your job is to fix obvious transcription errors, such as problems with word separation, punctuation, encoding. Output only the corrected text. Only correct issues which would be generated by a bad transcription. Don't correct stylistic choices, local accents or the logical flow of sentences. Answer in Markdown.
###
Text:
{snippet}
###
Answer:
"""
def ai_cleanup(text: str) -> str:
    """Ask the AI model to fix OCR transcription errors in *text*.

    The prompt is built from ``PROMPT`` and *text* with ``get_prompt``. Its
    token count is handed to ``estimate_costs`` and the user is asked for
    permission before the model is queried.

    Args:
        text: The OCR-transcribed text to clean up.

    Returns:
        The message content of the first choice in the model response, or an
        empty string when that content is missing. If the query raises, the
        exception is printed and its ``repr`` is returned instead.

    Note:
        Terminates the process with exit status 0 when ``ask_permission()``
        returns a falsy value.
    """
    # Presumably makes API credentials from a .env file available to the
    # query helper -- confirm against utils.ai.
    load_dotenv()

    prompt = get_prompt(PROMPT, text)
    model = AI_MODEL

    n_tokens = get_n_tokens(model, prompt)
    estimate_costs(model, n_tokens)

    if not ask_permission():
        sys.exit(0)

    try:
        response = query_gpt(model, prompt)
    except Exception as exc:
        # Deliberate best-effort: report the failure and hand its description
        # back to the caller rather than propagating the exception.
        print(exc)
        return repr(exc)

    get_costs_gpt4o(response)

    content = response.choices[0].message.content  # type: ignore
    # The message content may be None (e.g. a response without text); honour
    # the declared ``str`` return type so callers can write the result as-is.
    return content or ""
def main():
    """Read the text at ``INPUT``, run ``ai_cleanup`` on it, and write the result to ``OUTPUT``."""
    with open(INPUT, "r", encoding="utf-8") as source_file:
        raw_text = source_file.read()

    cleaned_text = ai_cleanup(raw_text)

    with open(OUTPUT, "w", encoding="utf-8") as target_file:
        target_file.write(cleaned_text)

    # Tell the user where the cleaned text ended up.
    print("Antiquarian: cleanup complete. Output written to", OUTPUT)
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()