#!/usr/bin/env ruby
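# crawler.rb (forked from JeffreyATW/mbfc_crawler)
# Crawls the bias-category pages on mediabiasfactcheck.com and every source
# page they link to, then writes biases.json and sources.json into the output
# directory.
#
# Usage: ruby crawler.rb [output_dir]   # output_dir defaults to ./output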
require 'wombat'
require 'json'
require 'leveldb'
require './dry'
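# dry.rb (not shown here) provides the shared pieces used below:
# DRY.mech_cache(db) returns a block that wires Mechanize response caching to
# the LevelDB store, and DRY.source_dsl returns the Wombat field rules for a
# single source page (shared with the tests).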
base = 'https://mediabiasfactcheck.com'
biases = {}
# Output directory: first CLI argument, or ./output by default; created if missing.
directory = ARGV[0] || 'output'
Dir.mkdir(directory) unless File.directory?(directory)
Dir.chdir(directory)
db = LevelDB::DB.new('cache')
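# On-disk LevelDB store (inside the output directory) used by the dry.rb
# caching hooks, so repeated runs don't re-fetch every source page.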
# Open up Mechanize so the caching hooks in dry.rb have somewhere to stash the
# current cache key and the cached response, and so CachedPage (below) can
# read them back off the agent.
class Mechanize
  attr_accessor :cache_key
  attr_accessor :cached
end
# Shallow, ActiveSupport-style Hash#symbolize_keys,
# e.g. { 'id' => 1 }.symbolize_keys #=> { id: 1 }.
# Not referenced directly in this file; presumably used by dry.rb or the tests.
class Hash
  def symbolize_keys
    inject({}) do |memo, (k, v)|
      memo[k.to_sym] = v
      memo
    end
  end
end
# When the response is cached we use the cache-aware text/html parser (an
# instance of this class), which pulls the stored response out of the side
# channel on the agent and runs the regular Mechanize::Page parser on it.
class CachedPage < Mechanize::Page
  def initialize(uri = nil, response = nil, body = nil, code = nil, mech = nil)
    # Mechanize hands pluggable parsers a block that assigns the agent
    # (parser.mech = ...), so run it first to populate @mech.
    yield self if block_given?
    @mech ||= mech
    @uri = uri
    cached_resp = @mech.cached[:resp]
    super @uri, cached_resp, cached_resp.body, cached_resp.code, @mech
  end
end
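# CachedPage only takes effect once it is registered as the text/html parser
# for cache hits. That wiring lives in dry.rb (not shown here); a minimal
# sketch, assuming the cache hook swaps parsers on a hit and `agent` is the
# Mechanize instance:
#
#   agent.pluggable_parser['text/html'] = CachedPage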
%w(left leftcenter center right-center right pro-science conspiracy satire fake-news).each do |p|
begin
bias = Wombat.crawl do
base_url base
path "/#{p}/"
name({ css: '.page > h1.page-title' })
description({ css: '.entry p:first-of-type' }) do |d|
d.sub(/see also:/i, '').strip
end
url "#{base}/#{p}/"
      # Some bias categories list their sources in a table (#mbfc-table)
      # rather than in plain entry paragraphs, so match both layouts.
source_urls({ xpath: '//*/div[contains(@class, "entry")]/p/a/@href | //*/table[@id="mbfc-table"]//tr/td/a/@href' }, :list)
end
puts "Bias crawled: #{bias['name']}"
biases[p] = bias
  rescue StandardError => e
    puts "Could not crawl bias: #{p}"
    puts e.message
    puts e.backtrace
end
end
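# At this point `biases` maps each category slug to the fields crawled above,
# roughly (string keys, per the Wombat DSL):
#   "left" => { "name" => ..., "description" => ..., "url" => ...,
#               "source_urls" => [...] }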
sources = {}
source_ids = []
biases.each do |k, b|
b['source_urls'].each do |u|
source_uri = URI(u)
begin
source = Wombat.crawl do
        # Pull in the shared Mechanize caching setup from dry.rb
        # (there's probably a cleaner way to DRY this up).
        instance_eval(&DRY.mech_cache(db))
        base_url base
        path source_uri.path
        # These field rules are shared between the crawler and the tests.
        instance_eval(&DRY.source_dsl)
url "#{source_uri.scheme}://#{source_uri.host}#{source_uri.path}"
end
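      # `source` now holds the fields declared in DRY.source_dsl; judging by
      # the usage below, at least 'id', 'name', 'domain' and 'thepath'.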
source['bias'] = k
unless (source_ids.include?(source['id']) || source['domain'] == '')
domain = source['domain']
source.delete('domain')
source['path'] = source['thepath']
source.delete('thepath')
        (sources[domain] ||= []) << source
source_ids << source['id']
puts "Source crawled: #{source['name']}"
end
    rescue StandardError => e
      puts "Could not crawl source: #{source_uri}"
      puts e.message
      puts e.backtrace
end
end
b.delete('source_urls')
end
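# `sources` now groups the crawled sources by domain, roughly:
#   "example.com" => [{ "id" => ..., "name" => ..., "path" => ..., "bias" => ... }, ...]
# Both hashes are dumped as JSON below.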
if biases.any?
  File.write("biases.json", biases.to_json)
else
  puts "No biases to write."
end

if sources.any?
  File.write("sources.json", sources.to_json)
else
  puts "No sources to write."
end