-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
136 lines (106 loc) · 4.11 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import streamlit as st
import boto3
import io
from PIL import Image
import docx
import time
from dotenv import load_dotenv
import os
# Load environment variables from the .env file into the process environment.
load_dotenv()

# AWS credentials and region are read from environment variables.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
region_name = os.environ.get('AWS_REGION')

# Page header: a styled HTML title followed by an example picture.
title = '<p style="font-size: 40px;font-weight: 550;"> Textract - Extract Text via Images & PDFs </p>'
st.markdown(title, unsafe_allow_html=True)
st.image("./assets/example.jpeg")

# A single session carries the credentials used by every AWS client below.
aws_management_console = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=region_name,
)

# Textract client.
# NOTE(review): the region is pinned to 'ap-south-1' here, so AWS_REGION is
# ignored for Textract calls -- confirm this is intentional.
client = aws_management_console.client('textract', region_name='ap-south-1')

# S3 client; it takes its region from the session above.
s3_client = aws_management_console.client(service_name='s3')
def extract_text_from_image(image):
    """Send image bytes to Textract and render the detected text.

    Args:
        image: Raw document bytes, passed to ``detect_document_text`` as
            ``Document["Bytes"]``.

    The text of every LINE block is concatenated (one block per line, each
    followed by a newline), shown under an "Extracted Text" subheader and
    handed to ``download_text`` so it can be downloaded.
    """
    detection = client.detect_document_text(Document={"Bytes": image})

    # Only LINE blocks contribute; each one adds its text plus a newline.
    line_texts = [
        block["Text"] + "\n"
        for block in detection["Blocks"]
        if block["BlockType"] == "LINE"
    ]
    text = "".join(line_texts)

    # Show the result and offer it as a download.
    st.subheader("Extracted Text")
    st.write(text)
    download_text(text)
def download_text(text):
    """Offer *text* to the user as a downloadable Word document.

    Args:
        text: String written as a single paragraph of the generated document.
    """
    document = docx.Document()
    document.add_paragraph(text)

    # Serialize into memory, then rewind so the buffer is read from the start.
    buffer = io.BytesIO()
    document.save(buffer)
    buffer.seek(0)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    st.download_button(
        label="Download Doc File",
        data=buffer,
        file_name="extracted_text.docx",
        mime=docx_mime,
    )
def _collect_line_texts(job_id, first_page):
    """Return the text of every LINE block of a finished job, across all result pages."""
    line_texts = []
    page = first_page
    while True:
        line_texts.extend(
            block['Text']
            for block in page.get('Blocks', [])
            if block['BlockType'] == "LINE"
        )
        # Results are paginated: keep fetching while a continuation token is returned.
        next_token = page.get('NextToken')
        if not next_token:
            return line_texts
        page = client.get_document_text_detection(JobId=job_id, NextToken=next_token)


def extract_text_from_pdf(pdf_bytes, pdf_filename):
    """Upload a PDF to S3, run a Textract text-detection job on it and render the text.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        pdf_filename: File name; the object is stored under ``uploads/<pdf_filename>``.

    The function polls the job every 5 seconds until it reaches a terminal
    status, then displays the text of all LINE blocks (joined with newlines)
    and offers it as a download via ``download_text``. Nothing is returned.
    """
    # Create a file-like object from the bytes
    pdf_fileobj = io.BytesIO(pdf_bytes)

    # Upload PDF file to S3 bucket
    bucket_name = 'pdf-stuff'  # Replace with your S3 bucket name
    s3_key = 'uploads/' + pdf_filename
    s3_client.upload_fileobj(pdf_fileobj, bucket_name, s3_key)

    # Start text detection job with S3 object
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': bucket_name,
                'Name': s3_key,
            }
        }
    )
    job_id = response['JobId']
    st.write("Text extraction job started. Job ID:", job_id)
    st.write("Waiting for the job to complete... (This might take a while)")

    # PARTIAL_SUCCESS is also a terminal state; without it the loop below
    # would never exit for a job that finishes with only some pages processed.
    terminal_statuses = ('SUCCEEDED', 'PARTIAL_SUCCESS', 'FAILED')
    while True:
        job_status = client.get_document_text_detection(JobId=job_id)
        if job_status['JobStatus'] in terminal_statuses:
            break
        time.sleep(5)  # Wait for 5 seconds before checking again

    if job_status['JobStatus'] == 'FAILED':
        st.error("Text extraction failed. Please try again later.")
        return

    if job_status['JobStatus'] == 'PARTIAL_SUCCESS':
        st.warning("Only part of the document could be processed; showing the text that was extracted.")

    # Gather LINE text from every result page, not just the first response.
    text_blocks = _collect_line_texts(job_id, job_status)
    if text_blocks:
        text = "\n".join(text_blocks)
        st.success("Text extracted from PDF:")
        st.write(text)
        download_text(text)
    else:
        st.warning("No text found in the PDF.")
def extract_text(file, file_type):
    """Dispatch the uploaded file to the extractor matching *file_type*.

    Called when the "Extract Text" button is clicked.

    Args:
        file: Uploaded file object; it is read with ``read()`` and, for PDFs,
            its ``name`` attribute is used as the file name.
        file_type: ``"Image"`` or ``"PDF"``; any other value does nothing.

    ``extract_text_from_image`` and ``extract_text_from_pdf`` render their own
    output and return nothing, so their result is not written again here
    (doing so only displayed a stray ``None``).
    """
    if file_type == "Image":
        extract_text_from_image(file.read())
        # Show the uploaded picture below the extracted text.
        img = Image.open(file)
        st.image(img, caption="Uploaded Image")
    elif file_type == "PDF":
        extract_text_from_pdf(file.read(), file.name)
# Let the user choose how the upload should be processed.
file_type = st.selectbox("Select file type", options=["Image", "PDF"])

# Accept the supported image formats as well as PDFs.
file = st.file_uploader("Upload file", type=["jpg", "jpeg", "png", "pdf"])

# Extraction runs only when the button is clicked; with no upload, prompt instead.
if st.button("Extract Text"):
    if file is None:
        st.write("Please upload a file.")
    else:
        extract_text(file, file_type)