diff --git a/libs/feedparser.lua b/libs/feedparser.lua
new file mode 100644
index 0000000..2d66b8c
--- /dev/null
+++ b/libs/feedparser.lua
@@ -0,0 +1,391 @@
+local LOM = assert(require("lxp.lom"), "LuaExpat doesn't seem to be installed. feedparser kind of needs it to work...")
+local XMLElement = require "feedparser.XMLElement"
+local dateparser = require "feedparser.dateparser"
+local URL = require "feedparser.url"
+local tinsert, tremove, tconcat = table.insert, table.remove, table.concat
+local pairs, ipairs = pairs, ipairs
+
+--- feedparser, similar to the Universal Feed Parser for python, but a good deal weaker.
+-- see http://feedparser.org for details about the Universal Feed Parser
+local feedparser= {
+  _DESCRIPTION = "RSS and Atom feed parser",
+  _VERSION = "feedparser 0.71"
+}
+
+local blanky = XMLElement.new() --useful in a whole bunch of places
+
+--- Resolve url against base_url via URL.absolute (note the swapped argument order).
+local function resolve(url, base_url)
+  return URL.absolute(base_url, url)
+end
+
+--- Compute the base uri in effect for el.
+-- Returns base_uri unchanged unless el carries an xml:base attribute, which is resolved against it.
+local function rebase(el, base_uri)
+  local xml_base = el:getAttr('xml:base')
+  if not xml_base then return base_uri end
+  return resolve(xml_base, base_uri)
+end
+
+--- Convert a list of item/entry elements into plain entry tables.
+-- @param entries_el array of entry elements (rss item or atom entry)
+-- @param format_str 'rss' or 'atom'; selects which tag names are recognised
+-- @param base base uri used to resolve relative links
+-- @return array of entry tables (title, link, links, enclosures, summary, content, dates, id, author...)
+local function parse_entries(entries_el, format_str, base)
+  local entries = {}
+  for i, entry_el in ipairs(entries_el) do
+    local entry = {enclosures={}, links={}, contributors={}}
+    local entry_base = rebase(entry_el, base)
+    for i, el in ipairs(entry_el:getChildren('*')) do
+      local tag = el:getTag()
+      local el_base = rebase(el, entry_base)
+      --title
+      if tag == 'title' or tag == 'dc:title' or tag =='rdf:title' then --'dc:title' doesn't occur in atom feeds, but whatever.
+        entry.title=el:getText()
+
+      --link(s)
+      elseif format_str == 'rss' and tag=='link' then
+        entry.link=resolve(el:getText(), el_base)
+        tinsert(entry.links, {href=entry.link})
+
+      elseif (format_str=='atom' and tag == 'link') or
+             (format_str == 'rss' and tag=='atom:link') then
+        local link = {}
+        for i, attr in ipairs{'rel','type', 'href','title'} do
+          link[attr]= (attr=='href') and resolve(el:getAttr(attr), el_base) or el:getAttr(attr) --uri
+        end
+        tinsert(entry.links, link)
+        if link.rel=='enclosure' then
+          tinsert(entry.enclosures, {
+            href=link.href,
+            length=el:getAttr('length'),
+            type=el:getAttr('type')
+          })
+        end
+
+      --rss enclosures
+      elseif format_str == 'rss' and tag=='enclosure' then
+        tinsert(entry.enclosures, {
+          url=el:getAttr('url'),
+          length=el:getAttr('length'),
+          type=el:getAttr('type')
+        })
+
+      --summary
+      elseif (format_str=='atom' and tag=='summary') or
+             (format_str=='rss' and(tag=='description' or tag=='dc:description' or tag=='rdf:description')) then
+        entry.summary=el:getText()
+        --TODO: summary_detail
+
+      --content
+      elseif (format_str=='atom' and tag=='content') or
+             (format_str=='rss' and (tag=='body' or tag=='xhtml:body' or tag == 'fullitem' or tag=='content:encoded')) then
+        entry.content=el:getText()
+        --TODO: content_detail
+
+      --published
+      elseif (format_str == 'atom' and (tag=='published' or tag=='issued')) or
+             (format_str == 'rss' and (tag=='dcterms:issued' or tag=='atom:published' or tag=='atom:issued')) then
+        entry.published = el:getText()
+        entry.published_parsed=dateparser.parse(entry.published)
+
+      --updated
+      elseif (format_str=='atom' and (tag=='updated' or tag=='modified')) or
+             (format_str=='rss' and (tag=='dc:date' or tag=='pubDate' or tag=='dcterms:modified')) then
+        entry.updated=el:getText()
+        entry.updated_parsed=dateparser.parse(entry.updated)
+
+      elseif tag=='created' or tag=='atom:created' or tag=='dcterms:created' then
+        entry.created=el:getText()
+        entry.created_parsed=dateparser.parse(entry.created)
+
+      --id
+      elseif (format_str =='atom' and tag=='id') or
+             (format_str=='rss' and tag=='guid') then
+        entry.id=resolve(el:getText(), el_base) -- this is a uri, right?...
+
+      --author
+      elseif format_str=='rss' and (tag=='author' or tag=='dc:creator') then --author tag should give the author's email. should I respect this?
+        entry.author=(el:getChild('name') or el):getText()
+        entry.author_detail={
+          name=entry.author
+        }
+      elseif format_str=='atom' and tag=='author' then
+        entry.author=(el:getChild('name') or el):getText()
+        entry.author_detail = {
+          name=entry.author,
+          email=(el:getChild('email') or blanky):getText()
+        }
+        local author_url = (el:getChild('url') or blanky):getText()
+        if author_url and author_url ~= "" then entry.author_detail.href=resolve(author_url, rebase(el:getChild('url'), el_base)) end
+
+      elseif tag=='category' or tag=='dc:subject' then
+        --todo
+
+      elseif tag=='source' then
+        --todo
+      end
+    end
+
+    --wrap up rss guid
+    if format_str == 'rss' and (not entry.id) and entry_el:getAttr('rdf:about') then
+      entry.id=resolve(entry_el:getAttr('rdf:about'), entry_base) --uri
+    end
+
+    --wrap up entry.link
+    for i, link in ipairs(entry.links) do --ipairs: document order decides which link wins
+      if link.rel=="alternate" or (not link.rel) or link.rel=="" then
+        entry.link=link.href --already resolved.
+        break
+      end
+    end
+    if not entry.link and format_str=='rss' then
+      entry.link=entry.id
+    end
+    tinsert(entries, entry)
+  end
+  return entries
+end
+
+--- Build a person table from an atom person construct (author/contributor).
+-- @param person_el element that may contain name, email and url children
+-- @param base_uri base uri used to resolve the url child
+-- @return table with name, email and, when a url child exists, href
+local function atom_person_construct(person_el, base_uri)
+  local dude ={
+    name= (person_el:getChild('name') or blanky):getText(),
+    email=(person_el:getChild('email') or blanky):getText()
+  }
+  local url_el = person_el:getChild('url')
+  if url_el then dude.href=resolve(url_el:getText(), rebase(url_el, base_uri)) end
+  return dude
+end
+
+--- Parse an atom feed.
+-- @param root root feed element
+-- @param base_uri (optional) base uri for relative links
+-- @return table with feed, format, version and entries fields
+local function parse_atom(root, base_uri)
+  local res = {}
+  local feed = {
+    links = {},
+    contributors={},
+    language = root:getAttr('lang') or root:getAttr('xml:lang')
+  }
+  local root_base = rebase(root, base_uri)
+  res.feed=feed
+  res.format='atom'
+  local version=(root:getAttr('version') or ''):lower()
+  if version=="1.0" or root:getAttr('xmlns')=='http://www.w3.org/2005/Atom' then res.version='atom10'
+  elseif version=="0.3" then res.version='atom03'
+  else res.version='atom' end
+
+  for i, el in ipairs(root:getChildren('*')) do
+    local tag = el:getTag()
+    local el_base=rebase(el, root_base)
+    if tag == 'title' or tag == 'dc:title' or tag == 'atom10:title' or tag == 'atom03:title' then
+      feed.title=el:getText() --sanitize!
+      --todo: feed.title_detail
+
+    --link stuff
+    elseif tag=='link' then
+      local link = {}
+      for i, attr in ipairs{'rel','type', 'href','title'} do
+        link[attr]= (attr=='href') and resolve(el:getAttr(attr), el_base) or el:getAttr(attr)
+      end
+      tinsert(feed.links, link)
+
+    --subtitle
+    elseif tag == 'subtitle' then
+      feed.subtitle=el:getText() --sanitize!
+    elseif not feed.subtitle and (tag == 'tagline' or tag =='atom03:tagline' or tag=='dc:description') then
+      feed.subtitle=el:getText() --sanitize!
+
+    --rights
+    elseif tag == 'copyright' or tag == 'rights' then
+      feed.rights=el:getText() --sanitize!
+
+    --generator
+    elseif tag == 'generator' then
+      feed.generator=el:getText() --sanitize!
+    elseif tag == 'admin:generatorAgent' then
+      feed.generator = feed.generator or el:getAttr('rdf:resource')
+
+    --info
+    elseif tag == 'info' then --whatever, nobody cared, anyway.
+      feed.info = el:getText()
+
+    --id
+    elseif tag=='id' then
+      feed.id=resolve(el:getText(), el_base) --this is a url, right?.,,
+
+    --updated
+    elseif tag == 'updated' or tag == 'dc:date' or tag == 'modified' or tag=='rss:pubDate' then
+      feed.updated = el:getText()
+      feed.updated_parsed=dateparser.parse(feed.updated)
+
+    --author
+    elseif tag=='author' or tag=='atom:author' then
+      feed.author_detail=atom_person_construct(el, el_base)
+      feed.author=feed.author_detail.name
+
+    --contributors
+    elseif tag=='contributor' or tag=='atom:contributor' then
+      tinsert(feed.contributors, atom_person_construct(el, el_base))
+
+    --icon
+    elseif tag=='icon' then
+      feed.icon=resolve(el:getText(), el_base)
+
+    --logo
+    elseif tag=='logo' then
+      feed.logo=resolve(el:getText(), el_base)
+
+    --language
+    elseif tag=='language' or tag=='dc:language' then
+      feed.language=feed.language or el:getText()
+
+    --licence
+    end
+  end
+  --feed.link (already resolved)
+  for i, link in ipairs(feed.links) do --ipairs: document order decides which link wins
+    if link.rel=='alternate' or not link.rel or link.rel=='' then
+      feed.link=link.href
+      break
+    end
+  end
+
+  res.entries=parse_entries(root:getChildren('entry'),'atom', root_base)
+  return res
+end
+
+--- Parse an rss (or rdf) feed.
+-- @param root root rss or rdf:RDF element
+-- @param base_uri (optional) base uri for relative links
+-- @return table with feed, format, version and entries fields, or nil, error_message when there is no channel
+local function parse_rss(root, base_uri)
+
+  local channel = root:getChild({'channel', 'rdf:channel'})
+  if not channel then return nil, "can't parse that." end --must come before rebase(), which indexes channel
+  local channel_base = rebase(channel, base_uri)
+
+  local feed = {links = {}, contributors={}}
+  local res = {
+    feed=feed,
+    format='rss',
+    entries={}
+  }
+
+  --this isn't quite right at all.
+  if root:getTag():lower()=='rdf:rdf' then
+    res.version='rss10'
+  else
+    res.version='rss20'
+  end
+
+  for i, el in ipairs(channel:getChildren('*')) do
+    local el_base=rebase(el, channel_base)
+    local tag = el:getTag()
+
+    if tag=='link' then
+      feed.link=resolve(el:getText(), el_base)
+      tinsert(feed.links, {href=feed.link})
+
+    --title
+    elseif tag == 'title' or tag == 'dc:title' then
+      feed.title=el:getText() --sanitize!
+
+    --subtitle
+    elseif tag == 'description' or tag =='dc:description' or tag=='itunes:subtitle' then
+      feed.subtitle=el:getText() --sanitize!
+
+    --rights
+    elseif tag == 'copyright' or tag == 'dc:rights' then
+      feed.rights=el:getText() --sanitize!
+
+    --generator
+    elseif tag == 'generator' then
+      feed.generator=el:getText()
+    elseif tag == 'admin:generatorAgent' then
+      feed.generator = feed.generator or el:getAttr('rdf:resource')
+
+    --info (nobody cares...)
+    elseif tag == 'feedburner:browserFriendly' then
+      feed.info = el:getText()
+
+    --updated
+    elseif tag == 'pubDate' or tag == 'dc:date' or tag == 'dcterms:modified' then
+      feed.updated = el:getText()
+      feed.updated_parsed = dateparser.parse(feed.updated)
+
+    --author
+    elseif tag=='managingEditor' or tag =='dc:creator' or tag=='itunes:author' or tag=='dc:author' then
+      feed.author=tconcat(el:getChildren('text()'))
+      feed.author_detail={name=feed.author} --same key parse_atom and the entries use
+      feed.author_details=feed.author_detail --old key name, kept so existing callers keep working
+    elseif tag=='atom:author' then
+      feed.author_detail = atom_person_construct(el, el_base)
+      feed.author_details = feed.author_detail --old key name, kept so existing callers keep working
+      feed.author = feed.author_detail.name
+
+    --contributors
+    elseif tag == 'dc:contributor' then
+      tinsert(feed.contributors, {name=el:getText()})
+    elseif tag == 'atom:contributor' then
+      tinsert(feed.contributors, atom_person_construct(el, el_base))
+
+    --image
+    elseif tag=='image' or tag=='rdf:image' then
+      feed.image={
+        title=(el:getChild('title') or blanky):getText(), --title may be missing, like the other children
+        link=(el:getChild('link') or blanky):getText(),
+        width=(el:getChild('width') or blanky):getText(),
+        height=(el:getChild('height') or blanky):getText()
+      }
+      local url_el = el:getChild('url')
+      if url_el then feed.image.href = resolve(url_el:getText(), rebase(url_el, el_base)) end
+
+    --language
+    elseif tag=='language' or tag=='dc:language' then
+      feed.language=el:getText()
+
+    --licence
+    --publisher
+    --tags
+    end
+  end
+
+  res.entries=parse_entries(channel:getChildren('item'),'rss', channel_base)
+  return res
+end
+
+
+--- parse feed xml
+-- @param xml_string feed xml, as a string
+-- @param base_url (optional) source url of the feed. useful when resolving relative links found in feed contents
+-- @return table with parsed feed info, or nil, error_message on error.
+-- the format of the returned table is much like that on http://feedparser.org, with the major difference that
+-- dates are parsed into unixtime. Most other fields are very much the same.
+function feedparser.parse(xml_string, base_url)
+  local lom, err = LOM.parse(xml_string)
+  if not lom then return nil, "couldn't parse xml. lxp says: " .. (err or "nothing") end --parentheses: '..' binds tighter than 'or'
+  local rootElement = XMLElement.new(lom)
+  local root_tag = rootElement:getTag():lower()
+  if root_tag=='rdf:rdf' or root_tag=='rss' then
+    return parse_rss(rootElement, base_url)
+  elseif root_tag=='feed' then
+    return parse_atom(rootElement, base_url)
+  else
+    return nil, "unknown feed format"
+  end
+end
+
+--for the sake of backwards-compatibility, feedparser will export a global reference for lua < 5.3
+if _VERSION:sub(-3) < "5.3" then
+  _G.feedparser=feedparser
+end
+
+
+return feedparser
\ No newline at end of file
diff --git a/plugins/rss.lua b/plugins/rss.lua
index c06aa94..65cd90f 100644
--- a/plugins/rss.lua
+++ b/plugins/rss.lua
@@ -1,3 +1,48 @@
+-- Load the bundled feedparser; assert surfaces loadfile's error message instead of a bare "attempt to call a nil value".
+-- Deliberately left global, exactly as before, in case other plugins reference it.
+feedparser = assert(loadfile("./libs/feedparser.lua"))()
+
+-- Named character references understood by unescape_for_rss.
+-- rsquo intentionally maps to a plain ASCII apostrophe.
+local RSS_ENTITIES = {
+  amp = "&", lt = "<", gt = ">", apos = "'", quot = '"',
+  lsaquo = "‹", raquo = "»", ldquo = "“", rdquo = "”",
+  rsquo = "'", ndash = "–", euro = "€", szlig = "ß",
+  auml = "ä", Auml = "Ä", ouml = "ö", Ouml = "Ö", uuml = "ü", Uuml = "Ü"
+}
+
+--- Encode a Unicode code point as UTF-8.
+-- @param cp code point (number) or nil
+-- @return the encoded string, or nil when cp is missing or outside 0..0x10FFFF
+local function utf8_char(cp)
+  if not cp or cp < 0 or cp > 0x10FFFF then return nil end
+  if cp < 0x80 then return string.char(cp) end
+  if cp < 0x800 then
+    return string.char(0xC0 + math.floor(cp / 0x40), 0x80 + cp % 0x40)
+  end
+  if cp < 0x10000 then
+    return string.char(0xE0 + math.floor(cp / 0x1000), 0x80 + math.floor(cp / 0x40) % 0x40, 0x80 + cp % 0x40)
+  end
+  return string.char(0xF0 + math.floor(cp / 0x40000), 0x80 + math.floor(cp / 0x1000) % 0x40,
+    0x80 + math.floor(cp / 0x40) % 0x40, 0x80 + cp % 0x40)
+end
+
+--- Decode HTML character references in feed text.
+-- Handles decimal (&#228;), hexadecimal (&#xE4;) and the named references in RSS_ENTITIES;
+-- anything else is left untouched. One pass only, so "&amp;lt;" becomes "&lt;", not "<".
+-- @param str text taken from a feed entry
+-- @return the decoded text
+local function unescape_for_rss(str)
+  local decoded = string.gsub(str, '&(#?%w+);', function(ref)
+    local hex = string.match(ref, '^#[xX](%x+)$')
+    if hex then return utf8_char(tonumber(hex, 16)) end
+    local dec = string.match(ref, '^#(%d+)$')
+    if dec then return utf8_char(tonumber(dec)) end
+    return RSS_ENTITIES[ref] -- nil keeps the original text
+  end)
+  return decoded
+end
+
 local function get_base_redis(id, option, extra)
   local ex = ''
   if option ~= nil then
@@ -27,7 +72,7 @@ local function get_rss(url, prot)
     res, code = https.request(url)
   end
   if code ~= 200 then
-    return nil, "Error while doing the petition to " .. url
+    return nil, "Fehler beim Erreichen von " .. url
   end
   local parsed = feedparser.parse(res)
   if parsed == nil then
@@ -92,14 +137,14 @@ end
 
 local function unsubscribe(id, n)
   if #n > 3 then
-    return "I don't think that you have that many subscriptions."
+    return "Ich glaube nicht, dass du so viele Abonnements hast."
   end
   n = tonumber(n)
 
   local uhash = get_base_redis(id)
   local subs = redis:smembers(uhash)
   if n < 1 or n > #subs then
-    return "Subscription id out of range!"
+    return "Abonnement-ID außerhalb des gültigen Bereichs!"
   end
   local sub = subs[n]
   local lhash = get_base_redis(sub, "subs")
@@ -123,6 +168,7 @@ local function cron()
   local keys = redis:keys(get_base_redis("*", "subs"))
   for k,v in pairs(keys) do
     local base = string.match(v, "rss:(.+):subs") -- Get the URL base
+    --print('RSS: '..base)
     local prot = redis:get(get_base_redis(base, "protocol"))
     local last = redis:get(get_base_redis(base, "last_entry"))
     local url = prot .. "://" .. base
@@ -132,12 +178,15 @@ local function cron()
     end
     local newentr = get_new_entries(last, parsed.entries)
     local subscribers = {}
-    local text = '' -- Send only one message with all updates
+    local text = '' -- Send one message per feed with the latest entries
     for k2, v2 in pairs(newentr) do
-      local title = v2.title or 'No title'
-      local link = v2.link or v2.id or 'No Link'
-      --text = string.gsub(text, "\n", "")
-      text = text .. '[RSS] '.. title .. '\n(' .. link .. ')\n\n'
+      local title = v2.title or 'Kein Titel'
+      local link = v2.link or v2.id or 'Kein Link'
+      -- Entries may carry neither content nor summary; fall back to an empty preview.
+      local raw = v2.content or v2.summary or ''
+      -- The extra parentheses drop gsub's second result (the match count).
+      local content = string.sub(unescape_for_rss((raw:gsub("%b<>", ""))), 1, 250) .. '...'
+      text = text .. '[RSS] '.. title .. '\n'..content..'\n\n(' .. link .. ')\n\n'
     end
     if text ~= '' then
       local newlast = newentr[1].id
@@ -173,12 +222,12 @@ end
 
 
 return {
-  description = "Manage User/Chat RSS subscriptions. If you are in a chat group, the RSS subscriptions will be of that chat. If you are in an one-to-one talk with the bot, the RSS subscriptions will be yours.",
+  description = "RSS-Feed Reader",
   usage = {
-    "/rss: Get your rss (or chat rss) subscriptions",
-    "/rss add (url): Subscribe to that url",
-    "/rss remove (id): Unsubscribe of that id",
-    "/rss sync: Download now the updates and send it. Only sudo users can use this option."
+    "/rss: Feed-Abos anzeigen",
+    "/rss add (url): Diesen Feed abonnieren",
+    "/rss remove (id): Diesen Feed deabonnieren",
+    "/rss sync: Feeds aktualisieren"
   },
   patterns = {
     "^/rss$",