webscrape.py
import json
import time

from selenium import webdriver

# Maps each visited URL to the list of text blocks scraped from that page
one_stop_dict = dict()
visited_urls = []
# Get user input
url = input("What is the website you would like to scrape? (note: enter without appending / at the end)\n")
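# Defensive normalization (an addition beyond the original prompt's note):
# drop any trailing slash so origin-prefix comparisons behave consistently
url = url.rstrip("/")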
# Strip quote characters so the URL can be passed safely into the injected JavaScript
url_java = url.replace("'", "").replace('"', "")
json_name = input("What would you like your filename to be? ")
json_filename = json_name + ".json"
# Echo the target back to the user and start the browser
print("Webcrawling URL " + url)
driver = webdriver.Chrome()
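# Optional (an assumption, not part of the original script): run Chrome
# headless so the crawl does not open a visible browser window.
# from selenium.webdriver.chrome.options import Options
# chrome_options = Options()
# chrome_options.add_argument("--headless=new")
# driver = webdriver.Chrome(options=chrome_options)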
# JavaScript to collect the visible text of common content elements on the page
javascript_code = """
var allElements = document.querySelectorAll('h1,h2,h3,h4,h5,h6,p,th,tbody');
var content = [];
for (var i = 0; i < allElements.length; i++) {
    content.push(allElements[i].textContent);
}
return content;
"""
# JavaScript to collect same-origin, non-PDF links from the current page.
# A raw string so the regex backslashes reach the browser unmodified.
javascript_code_url5 = r"""
// The origin address is passed in from Python as the first argument
var originAddress = arguments[0];
// Select all <a> elements on the page
var links = document.querySelectorAll('a');
// Collect valid URLs here
var validLinks = [];
// Regular expression for matching absolute URLs
var urlPattern = /^(https?|ftp):\/\/[^\s/$.?#].[^\s]*$/;
// Loop through each <a> element and extract the href attribute
for (var i = 0; i < links.length; i++) {
    var href = links[i].getAttribute('href');
    // Keep hrefs that are well-formed URLs under the origin address
    // and do not point at PDF files
    if (urlPattern.test(href) &&
        (href.startsWith(originAddress) || href == originAddress) &&
        !href.endsWith(".pdf")) {
        validLinks.push(href);
    }
}
return validLinks;
"""
# Start the crawl from the user-supplied URL (simple breadth-first traversal)
queue = [url]
while queue:
    current_url = queue.pop(0)
    if current_url not in visited_urls:
        print(f"Scraping {current_url}")
        driver.get(current_url)
        # Give the page a moment to render before extracting content
        time.sleep(2)
        visited_urls.append(current_url)
        # Record the page's text content under its URL
        one_stop_dict[current_url] = driver.execute_script(javascript_code)
        # Gather same-origin links and enqueue any not yet visited
        links_on_page = driver.execute_script(javascript_code_url5, url_java)
        queue.extend(link for link in links_on_page if link not in visited_urls)
# Close the WebDriver
driver.quit()
# Serialize the scraped data dictionary straight to the JSON file
with open(json_filename, 'w') as f:
    json.dump(one_stop_dict, f)
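# Quick sanity check (a small addition beyond the original script): reload
# the file just written and report how many pages were captured
with open(json_filename) as f:
    scraped = json.load(f)
print(f"Saved {len(scraped)} pages to {json_filename}")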