-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexternal-posts.rb
105 lines (93 loc) · 3.26 KB
/
external-posts.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
require 'feedjira'
require 'httparty'
require 'jekyll'
require 'nokogiri'
require 'time'
module ExternalPosts
  # Jekyll generator that imports posts from external sources configured in
  # `_config.yml` under the `external_sources` key. Each source supplies
  # either an RSS/Atom feed (`rss_url`) or an explicit list of post URLs
  # (`posts`). Imported posts are appended to the site's `posts` collection
  # as redirect stubs carrying the fetched title, content, summary, and
  # publish date.
  class ExternalPostsGenerator < Jekyll::Generator
    safe true
    priority :high

    # Jekyll plugin entry point: walk each configured external source and
    # fetch its posts. Sources providing neither `rss_url` nor `posts` are
    # silently skipped.
    def generate(site)
      sources = site.config['external_sources']
      return if sources.nil?

      sources.each do |src|
        puts "Fetching external posts from #{src['name']}:"
        if src['rss_url']
          fetch_from_rss(site, src)
        elsif src['posts']
          fetch_from_urls(site, src)
        end
      end
    end

    # Download and parse the RSS/Atom feed for `src`, then turn every feed
    # entry into a site document.
    def fetch_from_rss(site, src)
      xml = HTTParty.get(src['rss_url']).body
      return if xml.nil?

      feed = Feedjira.parse(xml)
      process_entries(site, src, feed.entries)
    end

    # Convert each Feedjira feed entry into a Jekyll document.
    def process_entries(site, src, entries)
      entries.each do |e|
        puts "...fetching #{e.url}"
        create_document(site, src['name'], e.url, {
          title: e.title,
          content: e.content,
          summary: e.summary,
          published: e.published
        })
      end
    end

    # Build a Jekyll::Document for one external post and append it to the
    # `posts` collection. `content` is a hash with :title, :content,
    # :summary, and :published keys.
    def create_document(site, source_name, url, content)
      slug = derive_slug(source_name, url, content[:title])
      path = site.in_source_dir("_posts/#{slug}.md")
      doc = Jekyll::Document.new(
        path, { :site => site, :collection => site.collections['posts'] }
      )
      doc.data['external_source'] = source_name
      doc.data['title'] = content[:title]
      doc.data['feed_content'] = content[:content]
      doc.data['description'] = content[:summary]
      doc.data['date'] = content[:published]
      doc.data['redirect'] = url
      site.collections['posts'].docs << doc
    end

    # Fetch each explicitly listed post URL for `src` and create documents.
    # The publish date comes from the configured `published_date` field
    # rather than from the scraped page itself.
    def fetch_from_urls(site, src)
      src['posts'].each do |post|
        puts "...fetching #{post['url']}"
        content = fetch_content_from_url(post['url'])
        content[:published] = parse_published_date(post['published_date'])
        create_document(site, src['name'], post['url'], content)
      end
    end

    # Normalize a configured published date (String or Date) to a UTC Time.
    # Raises for any other type so misconfiguration fails loudly at build
    # time instead of producing posts with bogus dates.
    def parse_published_date(published_date)
      case published_date
      when String
        Time.parse(published_date).utc
      when Date
        published_date.to_time.utc
      else
        raise "Invalid date format for #{published_date}"
      end
    end

    # Scrape title, meta description, and body HTML from an arbitrary URL.
    # The published date is added later in fetch_from_urls.
    def fetch_content_from_url(url)
      html = HTTParty.get(url).body
      parsed_html = Nokogiri::HTML(html)
      # `&.text&.strip` (not `&.text.strip`): with only the first safe call,
      # a page without a <title> element would raise NoMethodError when
      # `.strip` is sent to nil.
      title = parsed_html.at('head title')&.text&.strip || ''
      description = parsed_html.at('head meta[name="description"]')&.attr('content') || ''
      body_content = parsed_html.at('body')&.inner_html || ''
      {
        title: title,
        content: body_content,
        summary: description
      }
    end

    private

    # Choose a filesystem-safe slug for the post. Prefers the slugified
    # title; falls back to "<source>-<last-url-segment>" when the title is
    # missing, whitespace-only, or contains no word characters (e.g. a
    # title written entirely in non-ASCII script).
    def derive_slug(source_name, url, title)
      fallback = "#{slugify(source_name)}-#{url.split('/').last}"
      return fallback if title.to_s.gsub(/[^\w]/, '').strip.empty?

      slug = slugify(title)
      slug.empty? ? fallback : slug
    end

    # Lowercase, spaces to hyphens, drop everything but word chars/hyphens.
    def slugify(text)
      text.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
    end
  end
end