rss plugin now with rss text

thx @Brawl345
This commit is contained in:
Akamaru 2015-09-08 22:22:00 +02:00
parent 7a39db43b2
commit 31f50f2081
2 changed files with 425 additions and 13 deletions

369
libs/feedparser.lua Normal file
View File

@ -0,0 +1,369 @@
local LOM = assert(require("lxp.lom"), "LuaExpat doesn't seem to be installed. feedparser kind of needs it to work...")
local XMLElement = require "feedparser.XMLElement"
local dateparser = require "feedparser.dateparser"
local URL = require "feedparser.url"
local tinsert, tremove, tconcat = table.insert, table.remove, table.concat
local pairs, ipairs = pairs, ipairs
--- feedparser: an RSS and Atom feed parser in the spirit of the Universal
-- Feed Parser for Python, with a considerably smaller feature set.
-- See http://feedparser.org for details about the Universal Feed Parser.
local feedparser = {}
feedparser._DESCRIPTION = "RSS and Atom feed parser"
feedparser._VERSION = "feedparser 0.71"
-- Shared empty XMLElement. It is used as a stand-in for an optional child
-- element that is missing, in expressions of the form
-- `(el:getChild(name) or blanky):getText()`, so the method call never
-- happens on nil.
local blanky = XMLElement.new() --useful in a whole bunch of places
--- Resolve `url` against `base_url`.
-- Delegates to URL.absolute, which takes the base first; this wrapper takes
-- the value being resolved first, which is the order the call sites use.
-- @param url the URL to resolve
-- @param base_url the base URL to resolve against
-- @return whatever URL.absolute(base_url, url) returns
local function resolve(url, base_url)
return URL.absolute(base_url, url)
end
--- Compute the base URI that applies inside an element.
-- When the element carries an `xml:base` attribute, that value is resolved
-- against the inherited base; otherwise the inherited base is passed through.
-- @param el element to inspect (must provide getAttr)
-- @param base_uri base URI inherited from the enclosing element
-- @return the base URI to use for this element's contents
local function rebase(el, base_uri)
  local override = el:getAttr('xml:base')
  if override then
    return resolve(override, base_uri)
  end
  return base_uri
end
--- Convert a list of entry elements (Atom <entry> / RSS <item>) into entry tables.
-- @param entries_el array of entry elements
-- @param format_str 'atom' or 'rss'; decides which tag names are recognised
-- @param base base URI used to resolve relative URLs; `xml:base` attributes on
--        the entry and on its children are applied on top of it
-- @return array of entry tables. Every entry has `enclosures`, `links` and
--         `contributors` tables (`contributors` is never filled here). The other
--         fields (title, link, summary, content, published, updated, created,
--         id, author, author_detail and the matching *_parsed dates) are only
--         set when a corresponding element is present.
local function parse_entries(entries_el, format_str, base)
  local entries = {}
  for _, entry_el in ipairs(entries_el) do
    local entry = {enclosures={}, links={}, contributors={}}
    local entry_base = rebase(entry_el, base)
    for _, el in ipairs(entry_el:getChildren('*')) do
      local tag = el:getTag()
      local el_base = rebase(el, entry_base)
      --title
      if tag == 'title' or tag == 'dc:title' or tag == 'rdf:title' then --'dc:title' doesn't occur in atom feeds, but whatever.
        entry.title = el:getText()
      --link(s)
      elseif format_str == 'rss' and tag == 'link' then
        entry.link = resolve(el:getText(), el_base)
        tinsert(entry.links, {href=entry.link})
      elseif (format_str == 'atom' and tag == 'link') or
             (format_str == 'rss' and tag == 'atom:link') then
        local link = {}
        for _, attr in ipairs{'rel', 'type', 'href', 'title'} do
          local value = el:getAttr(attr)
          if attr == 'href' then
            -- href is a URI: resolve it, falling back to the raw attribute
            -- when resolution yields nothing
            value = resolve(value, el_base) or value
          end
          link[attr] = value
        end
        tinsert(entry.links, link)
        if link.rel == 'enclosure' then
          tinsert(entry.enclosures, {
            href = link.href,
            length = el:getAttr('length'),
            type = el:getAttr('type')
          })
        end
      --rss enclosures
      elseif format_str == 'rss' and tag == 'enclosure' then
        tinsert(entry.enclosures, {
          url = el:getAttr('url'),
          length = el:getAttr('length'),
          type = el:getAttr('type')
        })
      --summary
      elseif (format_str == 'atom' and tag == 'summary') or
             (format_str == 'rss' and (tag == 'description' or tag == 'dc:description' or tag == 'rdf:description')) then
        entry.summary = el:getText()
        --TODO: summary_detail
      --content
      elseif (format_str == 'atom' and tag == 'content') or
             (format_str == 'rss' and (tag == 'body' or tag == 'xhtml:body' or tag == 'fullitem' or tag == 'content:encoded')) then
        entry.content = el:getText()
        --TODO: content_detail
      --published
      elseif (format_str == 'atom' and (tag == 'published' or tag == 'issued')) or
             (format_str == 'rss' and (tag == 'dcterms:issued' or tag == 'atom:published' or tag == 'atom:issued')) then
        entry.published = el:getText()
        entry.published_parsed = dateparser.parse(entry.published)
      --updated
      elseif (format_str == 'atom' and (tag == 'updated' or tag == 'modified')) or
             (format_str == 'rss' and (tag == 'dc:date' or tag == 'pubDate' or tag == 'dcterms:modified')) then
        entry.updated = el:getText()
        entry.updated_parsed = dateparser.parse(entry.updated)
      --created
      elseif tag == 'created' or tag == 'atom:created' or tag == 'dcterms:created' then
        entry.created = el:getText()
        entry.created_parsed = dateparser.parse(entry.created)
      --id
      elseif (format_str == 'atom' and tag == 'id') or
             (format_str == 'rss' and tag == 'guid') then
        entry.id = resolve(el:getText(), el_base) -- this is a uri, right?...
      --author
      elseif format_str == 'rss' and (tag == 'author' or tag == 'dc:creator') then --author tag should give the author's email. should I respect this?
        entry.author = (el:getChild('name') or el):getText()
        entry.author_detail = {
          name = entry.author
        }
      elseif format_str == 'atom' and tag == 'author' then
        entry.author = (el:getChild('name') or el):getText()
        entry.author_detail = {
          name = entry.author,
          email = (el:getChild('email') or blanky):getText()
        }
        local author_url = (el:getChild('url') or blanky):getText()
        if author_url and author_url ~= "" then
          entry.author_detail.href = resolve(author_url, rebase(el:getChild('url'), el_base))
        end
      elseif tag == 'category' or tag == 'dc:subject' then
        --todo
      elseif tag == 'source' then
        --todo
      end
    end
    --wrap up rss guid: fall back to the item's rdf:about attribute
    if format_str == 'rss' and (not entry.id) and entry_el:getAttr('rdf:about') then
      entry.id = resolve(entry_el:getAttr('rdf:about'), entry_base) --uri
    end
    --wrap up entry.link: the first link (in document order) that is an
    --"alternate" or carries no rel wins. ipairs guarantees that order; pairs
    --does not.
    for _, link in ipairs(entry.links) do
      if link.rel == "alternate" or (not link.rel) or link.rel == "" then
        entry.link = link.href --already resolved.
        break
      end
    end
    if not entry.link and format_str == 'rss' then
      entry.link = entry.id
    end
    tinsert(entries, entry)
  end
  return entries
end
--- Build a person table from an Atom person construct (author/contributor).
-- @param person_el element whose `name`, `email` and `url` children are read
-- @param base_uri base URI used to resolve the person's url
-- @return table with `name` and `email` (taken from the shared blank element
--         when the child is missing) and, when a `url` child exists, `href`
local function atom_person_construct(person_el, base_uri)
  local person = {}
  person.name = (person_el:getChild('name') or blanky):getText()
  person.email = (person_el:getChild('email') or blanky):getText()

  local url_el = person_el:getChild('url')
  if url_el then
    -- the url element may carry its own xml:base
    local url_base = rebase(url_el, base_uri)
    person.href = resolve(url_el:getText(), url_base)
  end
  return person
end
--- Parse an Atom document.
-- @param root root <feed> element
-- @param base_uri base URI used to resolve relative URLs; `xml:base`
--        attributes found in the document are applied on top of it
-- @return table with `feed` (feed-level metadata), `format` ('atom'),
--         `version` ('atom10', 'atom03' or 'atom') and `entries`
local function parse_atom(root, base_uri)
  local res = {}
  local feed = {
    links = {},
    contributors = {},
    language = root:getAttr('lang') or root:getAttr('xml:lang')
  }
  local root_base = rebase(root, base_uri)
  res.feed = feed
  res.format = 'atom'
  -- version detection: explicit version attribute, or the Atom 1.0 namespace
  local version = (root:getAttr('version') or ''):lower()
  if version == "1.0" or root:getAttr('xmlns') == 'http://www.w3.org/2005/Atom' then
    res.version = 'atom10'
  elseif version == "0.3" then
    res.version = 'atom03'
  else
    res.version = 'atom'
  end
  for _, el in ipairs(root:getChildren('*')) do
    local tag = el:getTag()
    local el_base = rebase(el, root_base)
    --title
    if tag == 'title' or tag == 'dc:title' or tag == 'atom10:title' or tag == 'atom03:title' then
      feed.title = el:getText() --sanitize!
      --todo: feed.title_detail
    --link stuff
    elseif tag == 'link' then
      local link = {}
      for _, attr in ipairs{'rel', 'type', 'href', 'title'} do
        local value = el:getAttr(attr)
        if attr == 'href' then
          -- href is a URI: resolve it, falling back to the raw attribute
          -- when resolution yields nothing
          value = resolve(value, el_base) or value
        end
        link[attr] = value
      end
      tinsert(feed.links, link)
    --subtitle
    elseif tag == 'subtitle' then
      feed.subtitle = el:getText() --sanitize!
    elseif not feed.subtitle and (tag == 'tagline' or tag == 'atom03:tagline' or tag == 'dc:description') then
      feed.subtitle = el:getText() --sanitize!
    --rights
    elseif tag == 'copyright' or tag == 'rights' then
      feed.rights = el:getText() --sanitize!
    --generator
    elseif tag == 'generator' then
      feed.generator = el:getText() --sanitize!
    elseif tag == 'admin:generatorAgent' then
      feed.generator = feed.generator or el:getAttr('rdf:resource')
    --info
    elseif tag == 'info' then --whatever, nobody cared, anyway.
      feed.info = el:getText()
    --id
    elseif tag == 'id' then
      feed.id = resolve(el:getText(), el_base) --this is a url, right?.,,
    --updated
    elseif tag == 'updated' or tag == 'dc:date' or tag == 'modified' or tag == 'rss:pubDate' then
      feed.updated = el:getText()
      feed.updated_parsed = dateparser.parse(feed.updated)
    --author
    elseif tag == 'author' or tag == 'atom:author' then
      feed.author_detail = atom_person_construct(el, el_base)
      feed.author = feed.author_detail.name
    --contributors
    elseif tag == 'contributor' or tag == 'atom:contributor' then
      tinsert(feed.contributors, atom_person_construct(el, el_base))
    --icon
    elseif tag == 'icon' then
      feed.icon = resolve(el:getText(), el_base)
    --logo
    elseif tag == 'logo' then
      feed.logo = resolve(el:getText(), el_base)
    --language: an attribute on the root element takes precedence
    elseif tag == 'language' or tag == 'dc:language' then
      feed.language = feed.language or el:getText()
    --licence
    end
  end
  --feed.link (already resolved): the first link in document order that is an
  --"alternate" or has no rel wins. ipairs guarantees that order; pairs does not.
  for _, link in ipairs(feed.links) do
    if link.rel == 'alternate' or not link.rel or link.rel == '' then
      feed.link = link.href
      break
    end
  end
  res.entries = parse_entries(root:getChildren('entry'), 'atom', root_base)
  return res
end
--- Parse an RSS (or RDF/RSS 1.0) document.
-- @param root root element (<rss> or <rdf:RDF>)
-- @param base_uri base URI used to resolve relative URLs; `xml:base`
--        attributes found in the document are applied on top of it
-- @return table with `feed` (channel-level metadata), `format` ('rss'),
--         `version` ('rss10' for an rdf:RDF root, otherwise 'rss20') and
--         `entries`; or nil plus an error message when no channel element
--         is found
local function parse_rss(root, base_uri)
  local channel = root:getChild({'channel', 'rdf:channel'})
  -- bail out before touching the channel: rebase() would index nil otherwise
  if not channel then return nil, "can't parse that." end
  local channel_base = rebase(channel, base_uri)
  local feed = {links = {}, contributors = {}}
  local res = {
    feed = feed,
    format = 'rss',
    entries = {}
  }
  --this isn't quite right at all.
  if root:getTag():lower() == 'rdf:rdf' then
    res.version = 'rss10'
  else
    res.version = 'rss20'
  end
  for _, el in ipairs(channel:getChildren('*')) do
    local el_base = rebase(el, channel_base)
    local tag = el:getTag()
    if tag == 'link' then
      feed.link = resolve(el:getText(), el_base)
      tinsert(feed.links, {href=feed.link})
    --title
    elseif tag == 'title' or tag == 'dc:title' then
      feed.title = el:getText() --sanitize!
    --subtitle
    elseif tag == 'description' or tag == 'dc:description' or tag == 'itunes:subtitle' then
      feed.subtitle = el:getText() --sanitize!
    --rights
    elseif tag == 'copyright' or tag == 'dc:rights' then
      feed.rights = el:getText() --sanitize!
    --generator
    elseif tag == 'generator' then
      feed.generator = el:getText()
    elseif tag == 'admin:generatorAgent' then
      feed.generator = feed.generator or el:getAttr('rdf:resource')
    --info (nobody cares...)
    elseif tag == 'feedburner:browserFriendly' then
      feed.info = el:getText()
    --updated
    elseif tag == 'pubDate' or tag == 'dc:date' or tag == 'dcterms:modified' then
      feed.updated = el:getText()
      feed.updated_parsed = dateparser.parse(feed.updated)
    --author
    elseif tag == 'managingEditor' or tag == 'dc:creator' or tag == 'itunes:author' or tag == 'dc:author' then
      feed.author = tconcat(el:getChildren('text()'))
      -- `author_detail` matches the key used by the atom code paths;
      -- `author_details` is the key this function has always set and is kept
      -- as an alias so existing callers keep working.
      feed.author_detail = {name=feed.author}
      feed.author_details = feed.author_detail
    elseif tag == 'atom:author' then
      feed.author_detail = atom_person_construct(el, el_base)
      feed.author_details = feed.author_detail
      feed.author = feed.author_detail.name
    --contributors
    elseif tag == 'dc:contributor' then
      tinsert(feed.contributors, {name=el:getText()})
    elseif tag == 'atom:contributor' then
      tinsert(feed.contributors, atom_person_construct(el, el_base))
    --image
    elseif tag == 'image' or tag == 'rdf:image' then
      feed.image = {
        -- every child is optional; fall back to the shared blank element
        title = (el:getChild('title') or blanky):getText(),
        link = (el:getChild('link') or blanky):getText(),
        width = (el:getChild('width') or blanky):getText(),
        height = (el:getChild('height') or blanky):getText()
      }
      local url_el = el:getChild('url')
      if url_el then
        feed.image.href = resolve(url_el:getText(), rebase(url_el, el_base))
      end
    --language
    elseif tag == 'language' or tag == 'dc:language' then
      feed.language = el:getText()
    --licence
    --publisher
    --tags
    end
  end
  res.entries = parse_entries(channel:getChildren('item'), 'rss', channel_base)
  return res
end
--- Parse feed XML.
-- The root tag (compared case-insensitively) selects the parser: `rss` and
-- `rdf:rdf` are handled as RSS, `feed` as Atom.
-- @param xml_string feed xml, as a string
-- @param base_url (optional) source url of the feed. useful when resolving relative links found in feed contents
-- @return table with parsed feed info, or nil, error_message on error.
-- the format of the returned table is much like that on http://feedparser.org, with the major difference that
-- dates are parsed into unixtime. Most other fields are very much the same.
function feedparser.parse(xml_string, base_url)
  local lom, err = LOM.parse(xml_string)
  if not lom then
    -- parenthesised on purpose: `..` binds tighter than `or`, so without the
    -- parentheses a nil err would raise instead of yielding "nothing"
    return nil, "couldn't parse xml. lxp says: " .. (err or "nothing")
  end
  local rootElement = XMLElement.new(lom)
  local root_tag = rootElement:getTag():lower()
  if root_tag == 'rdf:rdf' or root_tag == 'rss' then
    return parse_rss(rootElement, base_url)
  elseif root_tag == 'feed' then
    return parse_atom(rootElement, base_url)
  else
    return nil, "unknown feed format"
  end
end
-- Backwards compatibility: on Lua versions older than 5.3 the module is also
-- published as the global `feedparser`. The check compares the last three
-- characters of _VERSION (e.g. "5.1") as strings.
local lua_version = _VERSION:sub(-3)
if lua_version < "5.3" then
  _G.feedparser = feedparser
end

return feedparser

View File

@ -1,3 +1,41 @@
-- Load the bundled feed parser. loadfile returns nil plus a message when the
-- file is missing or fails to compile; assert surfaces that message instead
-- of the opaque "attempt to call a nil value" error.
feedparser = assert(loadfile("./libs/feedparser.lua"))()
-- Named entities understood by unescape_for_rss. Numeric references
-- (&#NNN; / &#xHH;) are decoded generically and need no entry here.
local RSS_NAMED_ENTITIES = {
  amp = "&", lt = "<", gt = ">",
  ndash = "–", raquo = "»",
  auml = "ä", Auml = "Ä",
  ouml = "ö", Ouml = "Ö",
  uuml = "ü", Uuml = "Ü",
}

--- Encode a Unicode code point as a UTF-8 byte string.
-- Written with plain arithmetic so it does not need the `utf8` library or
-- bitwise operators.
-- @param cp code point as a number
-- @return UTF-8 string, or nil when cp is not an encodable code point
local function codepoint_to_utf8(cp)
  if not cp or cp < 0 or cp > 0x10FFFF or (cp >= 0xD800 and cp <= 0xDFFF) then
    return nil
  end
  local floor, char = math.floor, string.char
  if cp < 0x80 then
    return char(cp)
  elseif cp < 0x800 then
    return char(0xC0 + floor(cp / 0x40),
                0x80 + cp % 0x40)
  elseif cp < 0x10000 then
    return char(0xE0 + floor(cp / 0x1000),
                0x80 + floor(cp / 0x40) % 0x40,
                0x80 + cp % 0x40)
  end
  return char(0xF0 + floor(cp / 0x40000),
              0x80 + floor(cp / 0x1000) % 0x40,
              0x80 + floor(cp / 0x40) % 0x40,
              0x80 + cp % 0x40)
end

--- Decode HTML/XML character references in feed text.
-- Handles the named entities listed in RSS_NAMED_ENTITIES as well as decimal
-- (&#228;) and hexadecimal (&#xE4;) numeric references, which are emitted as
-- UTF-8. Everything is decoded in a single pass, so text that was escaped
-- twice ("&amp;lt;") is unescaped exactly once ("&lt;"). Unknown or invalid
-- references are left untouched.
-- @param str text to decode
-- @return the decoded text
local function unescape_for_rss(str)
  local function decode(ref)
    local named = RSS_NAMED_ENTITIES[ref]
    if named then return named end

    -- the digit-count limits keep absurdly long references from overflowing
    -- the number conversion
    local hex = ref:match("^#[xX](%x+)$")
    if hex then
      if #hex > 8 then return nil end
      return codepoint_to_utf8(tonumber(hex, 16))
    end
    local dec = ref:match("^#(%d+)$")
    if dec then
      if #dec > 10 then return nil end
      return codepoint_to_utf8(tonumber(dec, 10))
    end
    return nil -- not a reference we know: gsub keeps the original text
  end

  -- assign first so only the string (not gsub's match count) is returned
  str = string.gsub(str, "&(#?%w+);", decode)
  return str
end
local function get_base_redis(id, option, extra)
local ex = ''
if option ~= nil then
@ -27,7 +65,7 @@ local function get_rss(url, prot)
res, code = https.request(url)
end
if code ~= 200 then
return nil, "Error while doing the petition to " .. url
return nil, "Fehler beim Erreichen von " .. url
end
local parsed = feedparser.parse(res)
if parsed == nil then
@ -92,14 +130,14 @@ end
local function unsubscribe(id, n)
if #n > 3 then
return "I don't think that you have that many subscriptions."
return "Du kannst nicht mehr als drei Feeds abonnieren!"
end
n = tonumber(n)
local uhash = get_base_redis(id)
local subs = redis:smembers(uhash)
if n < 1 or n > #subs then
return "Subscription id out of range!"
return "Abonnement-ID zu hoch!"
end
local sub = subs[n]
local lhash = get_base_redis(sub, "subs")
@ -123,6 +161,7 @@ local function cron()
local keys = redis:keys(get_base_redis("*", "subs"))
for k,v in pairs(keys) do
local base = string.match(v, "rss:(.+):subs") -- Get the URL base
--print('RSS: '..base)
local prot = redis:get(get_base_redis(base, "protocol"))
local last = redis:get(get_base_redis(base, "last_entry"))
local url = prot .. "://" .. base
@ -132,12 +171,16 @@ local function cron()
end
local newentr = get_new_entries(last, parsed.entries)
local subscribers = {}
local text = '' -- Send only one message with all updates
local text = '' -- Send one message per feed with the latest entries
for k2, v2 in pairs(newentr) do
local title = v2.title or 'No title'
local link = v2.link or v2.id or 'No Link'
--text = string.gsub(text, "\n", "")
text = text .. '[RSS] '.. title .. '\n(' .. link .. ')\n\n'
local title = v2.title or 'Kein Titel'
local link = v2.link or v2.id or 'Kein Link'
if v2.content then
content = string.sub(unescape_for_rss(v2.content:gsub("%b<>", "")), 1, 250) .. '...'
else
content = string.sub(unescape_for_rss(v2.summary:gsub("%b<>", "")), 1, 250) .. '...'
end
text = text .. '[RSS] '.. title .. '\n'..content..'\n\n(' .. link .. ')\n\n'
end
if text ~= '' then
local newlast = newentr[1].id
@ -173,12 +216,12 @@ end
return {
description = "Manage User/Chat RSS subscriptions. If you are in a chat group, the RSS subscriptions will be of that chat. If you are in an one-to-one talk with the bot, the RSS subscriptions will be yours.",
description = "RSS-Feed Reader",
usage = {
"/rss: Get your rss (or chat rss) subscriptions",
"/rss add (url): Subscribe to that url",
"/rss remove (id): Unsubscribe of that id",
"/rss sync: Download now the updates and send it. Only sudo users can use this option."
"/rss: Feed-Abos anzeigen",
"/rss add (url): Diesen Feed abonnieren",
"/rss remove (id): Diesen Feed deabonnieren",
"/rss sync: Feeds aktualisieren"
},
patterns = {
"^/rss$",