Based on discussion #3393. --------- Signed-off-by: George Araújo <george.gcac@gmail.com>
125 lines
4.0 KiB
Ruby
125 lines
4.0 KiB
Ruby
require 'feedjira'
|
|
require 'httparty'
|
|
require 'jekyll'
|
|
require 'nokogiri'
|
|
require 'time'
|
|
|
|
module ExternalPosts
|
|
class ExternalPostsGenerator < Jekyll::Generator
|
|
safe true
|
|
priority :high
|
|
|
|
# Generator entry point: imports posts for every entry listed under the
# `external_sources` key of the site configuration.
#
# Each source is dispatched on its keys: a source with `rss_url` goes through
# fetch_from_rss; otherwise a source with `posts` goes through
# fetch_from_urls. A source with neither key is announced but not fetched.
#
# @param site the site object handed to the generator; only `config` is read here
# @return the configured sources, or nil when none are configured
def generate(site)
  sources = site.config['external_sources']
  # Nothing to import when the key is absent or set to a falsy value.
  return unless sources

  sources.each do |src|
    puts "Fetching external posts from #{src['name']}:"
    if src['rss_url']
      fetch_from_rss(site, src)
    elsif src['posts']
      fetch_from_urls(site, src)
    end
  end
end
|
|
|
|
# Downloads the feed configured under `rss_url` for a source and hands its
# entries to process_entries.
#
# Returns early when the HTTP response has no body. A feed that fails to
# parse is reported on stdout and skipped instead of aborting.
#
# @param site the site object, forwarded untouched to process_entries
# @param src [Hash] source configuration; `rss_url` is read here
def fetch_from_rss(site, src)
  payload = HTTParty.get(src['rss_url']).body
  return if payload.nil?

  parsed_feed =
    begin
      Feedjira.parse(payload)
    rescue StandardError => parse_error
      puts "Error parsing RSS feed from #{src['rss_url']} - #{parse_error.message}"
      return
    end

  process_entries(site, src, parsed_feed.entries)
end
|
|
|
|
# Creates one post document per feed entry via create_document.
#
# @param site the site object, forwarded to create_document
# @param src [Hash] source configuration; its `name` labels each document
# @param entries a collection of entries responding to url, title, content,
#   summary and published
def process_entries(site, src, entries)
  entries.each do |entry|
    puts "...fetching #{entry.url}"
    fields = {
      title: entry.title,
      content: entry.content,
      summary: entry.summary,
      published: entry.published
    }
    create_document(site, src['name'], entry.url, fields, src)
  end
end
|
|
|
|
# Builds an in-memory post document for an external article and appends it to
# the site's `posts` collection.
#
# @param site the site object; `in_source_dir` and `collections['posts']` are used
# @param source_name [String] name of the external source, stored as `external_source`
# @param url [String] link to the original article, stored as `redirect`
# @param content [Hash] post fields under :title, :content, :summary and :published
# @param src [Hash] source configuration; non-empty Array values under
#   `categories` and `tags` are copied onto the document
# @return the docs of the `posts` collection, with the new document appended
def create_document(site, source_name, url, content, src = {})
  slug = external_post_slug(source_name, url, content[:title])
  posts = site.collections['posts']
  path = site.in_source_dir("_posts/#{slug}.md")
  doc = Jekyll::Document.new(path, { site: site, collection: posts })

  doc.data['external_source'] = source_name
  doc.data['title'] = content[:title]
  doc.data['feed_content'] = content[:content]
  doc.data['description'] = content[:summary]
  doc.data['date'] = content[:published]
  doc.data['redirect'] = url

  # Apply default categories and tags from the source configuration.
  %w[categories tags].each do |key|
    defaults = src[key]
    doc.data[key] = defaults if defaults.is_a?(Array) && !defaults.empty?
  end

  doc.content = content[:content]
  posts.docs << doc
end

# Derives the file-name slug for an external post.
#
# The title is used when it contains at least one ASCII word character.
# Otherwise (missing, blank, punctuation-only or entirely non-ASCII title)
# the slug is the source name plus the last segment of the URL.
def external_post_slug(source_name, url, title)
  slugify = ->(text) { text.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '') }

  # to_s guards against entries that carry no title at all (nil).
  title = title.to_s
  # A title with a word character always slugifies to a non-empty string,
  # so no second emptiness check is needed after this point.
  return slugify.call(title) if title.match?(/\w/)

  "#{slugify.call(source_name)}-#{url.split('/').last}"
end
|
|
|
|
# Creates a post document for every entry listed under a source's `posts` key.
#
# The fields returned by fetch_content_from_url for the entry's `url` are
# completed with the date parsed from the entry's `published_date`.
#
# @param site the site object, forwarded to create_document
# @param src [Hash] source configuration; `posts` and `name` are read here
def fetch_from_urls(site, src)
  src['posts'].each do |entry|
    link = entry['url']
    puts "...fetching #{link}"
    fields = fetch_content_from_url(link)
    fields[:published] = parse_published_date(entry['published_date'])
    create_document(site, src['name'], link, fields, src)
  end
end
|
|
|
|
# Normalizes a configured publication date to a UTC Time.
#
# @param published_date [String, Time, Date] the configured value
# @return [Time] the same instant expressed in UTC
# @raise [RuntimeError] when the value is of any other type
def parse_published_date(published_date)
  case published_date
  when String
    Time.parse(published_date).utc
  when Time
    # Accept values that are already a Time (presumably what a YAML timestamp
    # with a time of day loads as - confirm against the config loader).
    # getutc returns a new object, so the caller's value is left untouched.
    published_date.getutc
  when Date
    published_date.to_time.utc
  else
    raise "Invalid date format for #{published_date}"
  end
end
|
|
|
|
# Downloads a page and extracts the fields needed to build a post from it.
#
# @param url [String] address of the page to fetch
# @return [Hash] with keys
#   :title   - stripped text of `head title`, or '' when the page has none
#   :content - text of every `p` element, concatenated without separator
#   :summary - `content` attribute of the first matching description meta
#              tag, or nil when none matches
#   The published date is not part of this hash; fetch_from_urls adds it.
def fetch_content_from_url(url)
  html = HTTParty.get(url).body
  parsed_html = Nokogiri::HTML(html)

  # `&.` only guards the call it is attached to, so each link of the chain
  # needs its own; otherwise a page without <title> calls strip on nil.
  title = parsed_html.at('head title')&.text&.strip || ''

  # Tried in order; the first tag providing a `content` attribute wins.
  description = nil
  [
    'head meta[name="description"]',
    'head meta[name="og:description"]',
    'head meta[property="og:description"]'
  ].each do |selector|
    description = parsed_html.at(selector)&.attr('content')
    break if description
  end

  body_content = parsed_html.search('p').map(&:text).join

  {
    title: title,
    content: body_content,
    summary: description
  }
end
|
|
|
|
end
|
|
end
|