websitegpt.py

import os
import time
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SitemapScraper:
    def __init__(self, output_folder='output'):
        self.output_folder = output_folder
        self.scraped_content = {}  # Maps output filename -> extracted page text
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        # Set up a headless Chrome instance for rendering JavaScript-heavy pages
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run without a visible window
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        # '--disable-images' is not a recognized Chrome switch; this Blink
        # setting is what actually prevents images from loading
        chrome_options.add_argument('--blink-settings=imagesEnabled=false')
        self.driver = webdriver.Chrome(options=chrome_options)
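    # Note: with Selenium 4.6+, Selenium Manager resolves a matching
    # chromedriver automatically; older setups need chromedriver on the PATH.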
    def fetch_sitemap(self, sitemap_url):
        """Fetch a sitemap and return the URLs listed in its <loc> tags."""
        try:
            response = requests.get(sitemap_url, timeout=30)
            response.raise_for_status()
            # The 'xml' parser requires the lxml package to be installed
            soup = BeautifulSoup(response.content, 'xml')
            urls = [loc.text for loc in soup.find_all('loc')]
            return urls
        except requests.RequestException as e:
            print(f"Error fetching sitemap: {e}")
            return []
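    # For reference, fetch_sitemap expects standard sitemap XML along these
    # lines (illustrative snippet; real sitemaps also declare an xmlns):
    #
    #   <urlset>
    #     <url><loc>https://example.com/</loc></url>
    #     <url><loc>https://example.com/about</loc></url>
    #   </urlset>
    #
    # A sitemap *index* wraps its <loc> tags in <sitemap> elements instead, so
    # pointing this at an index returns child-sitemap URLs rather than page URLs.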
    def sanitize_filename(self, url):
        """Convert a URL into a safe, flat .txt filename."""
        parsed = urlparse(url)
        path = parsed.path.strip('/')
        if not path:
            path = 'index'
        return f"{path.replace('/', '_')}.txt"
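    # Example mappings produced by sanitize_filename:
    #   https://example.com/           -> index.txt
    #   https://example.com/blog/post  -> blog_post.txt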
    def save_as_text(self, url):
        """Render a page with Selenium and store its visible text in memory."""
        try:
            filename = self.sanitize_filename(url)
            print(f"Fetching {url}...")
            self.driver.get(url)
            # Wait until the body element is present (adjust the timeout as needed)
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # Extract the rendered, visible text of the whole page
            text_content = self.driver.find_element(By.TAG_NAME, "body").text
            # Buffer the content in memory; the caller decides how to write it out
            self.scraped_content[filename] = text_content
            print(f"Processed {filename}")
            return True
        except Exception as e:
            print(f"Error processing {url}: {e}")
            return False
    def process_sitemap(self, sitemap_url):
        """Scrape every URL in the sitemap and return the collected text."""
        urls = self.fetch_sitemap(sitemap_url)
        if not urls:
            print("No URLs found in sitemap.")
            return {}
        print(f"Found {len(urls)} URLs in sitemap.")
        try:
            for url in urls:
                self.save_as_text(url)
                time.sleep(1)  # Brief pause between requests to be polite
        finally:
            self.driver.quit()  # Close the browser even if scraping fails
        return self.scraped_content
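

# Minimal usage sketch for the class above (illustrative; assumes Chrome is
# installed and the target site serves a sitemap at the given URL):
#
#   scraper = SitemapScraper()
#   pages = scraper.process_sitemap('https://example.com/sitemap.xml')
#   for filename, text in pages.items():
#       print(filename, len(text), 'characters')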


def get_user_preference():
    """Ask whether to write one merged file or one file per page."""
    while True:
        choice = input("""
Choose output format:
1. Individual files (one file per page)
2. Single merged file (all pages in one file with headers)
Enter 1 or 2: """).strip()
        if choice in ['1', '2']:
            return choice == '2'  # True for merged output, False for individual files
        print("Invalid choice. Please enter 1 or 2.")


def main():
    # Ask for the output format up front
    merge_files = get_user_preference()
    sitemap_url = input("Enter sitemap URL (e.g., https://example.com/sitemap.xml): ")
    scraper = SitemapScraper()
    scraped_content = scraper.process_sitemap(sitemap_url)
    if merge_files:
        # Single merged file: every page separated by a header banner
        with open('merged_output.txt', 'w', encoding='utf-8') as f:
            for filename, content in scraped_content.items():
                f.write(f"\n{'=' * 50}\n")
                f.write(f"{filename}\n")
                f.write(f"{'=' * 50}\n\n")
                f.write(content)
                f.write('\n\n')
    else:
        # Individual files, written into the scraper's output folder
        # rather than a hard-coded path
        for filename, content in scraped_content.items():
            out_path = os.path.join(scraper.output_folder, filename)
            with open(out_path, 'w', encoding='utf-8') as f:
                f.write(content)


if __name__ == "__main__":
    main()