-- 369 lines, 12 KiB, Lua
local LOM = assert(require("lxp.lom"), "LuaExpat doesn't seem to be installed. feedparser kind of needs it to work...")
|
|
local XMLElement = require "feedparser.XMLElement"
|
|
local dateparser = require "feedparser.dateparser"
|
|
local URL = require "feedparser.url"
|
|
local tinsert, tremove, tconcat = table.insert, table.remove, table.concat
|
|
local pairs, ipairs = pairs, ipairs
|
|
|
|
--- feedparser: an RSS/Atom feed parser modelled on the Universal Feed Parser
-- for Python, though considerably less capable.
-- See http://feedparser.org for details about the Universal Feed Parser.
local feedparser = {
  _DESCRIPTION = "RSS and Atom feed parser",
  _VERSION = "feedparser 0.71",
}

-- Shared empty element; stands in for optional child elements that are absent
-- so that `(el:getChild(x) or blanky):getText()` never indexes nil.
local blanky = XMLElement.new()
|
|
|
|
--- Resolve `url` against `base_url`.
-- Thin wrapper around URL.absolute with the argument order swapped, so call
-- sites read "resolve this url relative to that base".
-- @param url the (possibly relative) URL to resolve
-- @param base_url the base URL to resolve against
-- @return whatever URL.absolute returns for (base_url, url)
local function resolve(url, base_url)
  local make_absolute = URL.absolute
  return make_absolute(base_url, url)
end
|
|
|
|
--- Compute the base URI in effect for an element.
-- When `el` carries an xml:base attribute, that value is resolved against the
-- inherited `base_uri`; otherwise the inherited base is passed through as-is.
-- @param el element to inspect (must provide :getAttr)
-- @param base_uri base URI inherited from the enclosing element
-- @return the base URI that applies to `el` and its children
local function rebase(el, base_uri)
  local override = el:getAttr('xml:base')
  if override then
    return resolve(override, base_uri)
  end
  return base_uri
end
|
|
|
|
--- Convert a list of entry elements into feedparser-style entry tables.
-- Each entry table starts with empty `enclosures`, `links` and `contributors`
-- lists and is then filled from the entry's child elements: title, link(s),
-- enclosures, summary, content, published/updated/created (raw text plus a
-- `*_parsed` value from dateparser.parse), id and author/author_detail.
-- @param entries_el array of entry elements (atom <entry> or rss <item>)
-- @param format_str 'atom' or 'rss'; selects which tag names are recognised
-- @param base base URI inherited from the feed, used to resolve relative URIs
-- @return array of entry tables, in the same order as `entries_el`
local function parse_entries(entries_el, format_str, base)
  local is_atom = (format_str == 'atom')
  local is_rss = (format_str == 'rss')
  local entries = {}

  for _, entry_el in ipairs(entries_el) do
    local entry = {enclosures = {}, links = {}, contributors = {}}
    local entry_base = rebase(entry_el, base)

    for _, el in ipairs(entry_el:getChildren('*')) do
      local tag = el:getTag()
      local el_base = rebase(el, entry_base) -- honour xml:base on the child itself

      --title
      if tag == 'title' or tag == 'dc:title' or tag == 'rdf:title' then
        --'dc:title' doesn't occur in atom feeds, but whatever.
        entry.title = el:getText()

      --link(s)
      elseif is_rss and tag == 'link' then
        entry.link = resolve(el:getText(), el_base)
        tinsert(entry.links, {href = entry.link})

      elseif (is_atom and tag == 'link') or (is_rss and tag == 'atom:link') then
        local href = el:getAttr('href')
        local link = {
          rel = el:getAttr('rel'),
          type = el:getAttr('type'),
          -- fall back to the raw attribute if resolution yields nothing
          href = resolve(href, el_base) or href, --uri
          title = el:getAttr('title'),
        }
        tinsert(entry.links, link)
        if link.rel == 'enclosure' then
          tinsert(entry.enclosures, {
            href = link.href,
            length = el:getAttr('length'),
            type = el:getAttr('type'),
          })
        end

      --rss enclosures
      elseif is_rss and tag == 'enclosure' then
        local url = el:getAttr('url')
        tinsert(entry.enclosures, {
          -- `url` is the raw attribute, kept for existing callers
          url = url,
          -- `href` mirrors the key used for atom enclosures (resolved URI)
          href = url and (resolve(url, el_base) or url) or nil,
          length = el:getAttr('length'),
          type = el:getAttr('type'),
        })

      --summary
      elseif (is_atom and tag == 'summary') or
             (is_rss and (tag == 'description' or tag == 'dc:description' or tag == 'rdf:description')) then
        entry.summary = el:getText()
        --TODO: summary_detail

      --content
      elseif (is_atom and tag == 'content') or
             (is_rss and (tag == 'body' or tag == 'xhtml:body' or tag == 'fullitem' or tag == 'content:encoded')) then
        entry.content = el:getText()
        --TODO: content_detail

      --published
      elseif (is_atom and (tag == 'published' or tag == 'issued')) or
             (is_rss and (tag == 'dcterms:issued' or tag == 'atom:published' or tag == 'atom:issued')) then
        entry.published = el:getText()
        entry.published_parsed = dateparser.parse(entry.published)

      --updated
      elseif (is_atom and (tag == 'updated' or tag == 'modified')) or
             (is_rss and (tag == 'dc:date' or tag == 'pubDate' or tag == 'dcterms:modified')) then
        entry.updated = el:getText()
        entry.updated_parsed = dateparser.parse(entry.updated)

      --created
      elseif tag == 'created' or tag == 'atom:created' or tag == 'dcterms:created' then
        entry.created = el:getText()
        entry.created_parsed = dateparser.parse(entry.created)

      --id
      elseif (is_atom and tag == 'id') or (is_rss and tag == 'guid') then
        -- NOTE(review): the id is treated as a URI and resolved against the
        -- base; confirm this is desired for guids that are not URLs.
        entry.id = resolve(el:getText(), el_base)

      --author
      elseif is_rss and (tag == 'author' or tag == 'dc:creator') then
        --author tag should give the author's email. should I respect this?
        entry.author = (el:getChild('name') or el):getText()
        entry.author_detail = {
          name = entry.author,
        }

      elseif is_atom and tag == 'author' then
        entry.author = (el:getChild('name') or el):getText()
        entry.author_detail = {
          name = entry.author,
          email = (el:getChild('email') or blanky):getText(),
        }
        local url_el = el:getChild('url')
        local author_url = (url_el or blanky):getText()
        if url_el and author_url and author_url ~= "" then
          entry.author_detail.href = resolve(author_url, rebase(url_el, el_base))
        end

      elseif tag == 'category' or tag == 'dc:subject' then
        --todo

      elseif tag == 'source' then
        --todo
      end
    end

    --wrap up rss guid: fall back to the item's rdf:about attribute
    if is_rss and (not entry.id) and entry_el:getAttr('rdf:about') then
      entry.id = resolve(entry_el:getAttr('rdf:about'), entry_base) --uri
    end

    --wrap up entry.link: the first link that is an "alternate" (or has no rel).
    --ipairs guarantees the links are examined in insertion order.
    for _, link in ipairs(entry.links) do
      if link.rel == "alternate" or (not link.rel) or link.rel == "" then
        entry.link = link.href --already resolved.
        break
      end
    end
    if not entry.link and is_rss then
      entry.link = entry.id
    end

    tinsert(entries, entry)
  end

  return entries
end
|
|
|
|
--- Build a person table from an Atom person construct (author/contributor).
-- @param person_el element whose children may include name, email and url
-- @param base_uri base URI in effect for `person_el`
-- @return table with `name` and `email` (taken from the shared blank element
--         when the child is missing) and, if a url child exists, `href`
--         resolved against the url element's own base
local function atom_person_construct(person_el, base_uri)
  local name_el = person_el:getChild('name') or blanky
  local name = name_el:getText()
  local email_el = person_el:getChild('email') or blanky
  local email = email_el:getText()

  local person = {name = name, email = email}

  local url_el = person_el:getChild('url')
  if url_el then
    local url_base = rebase(url_el, base_uri)
    person.href = resolve(url_el:getText(), url_base)
  end
  return person
end
|
|
|
|
--- Parse an Atom feed whose root element is <feed>.
-- Fills `res.feed` from the root's children (title, links, subtitle, rights,
-- generator, info, id, updated, author, contributors, icon, logo, language)
-- and `res.entries` from the root's <entry> children via parse_entries.
-- @param root root element of the document
-- @param base_uri (optional) URL the feed was fetched from
-- @return table with fields `feed`, `format` ('atom'), `version`
--         ('atom10', 'atom03' or 'atom') and `entries`
local function parse_atom(root, base_uri)
  local feed = {
    links = {},
    contributors = {},
    language = root:getAttr('lang') or root:getAttr('xml:lang'),
  }
  local res = {feed = feed, format = 'atom'}
  local root_base = rebase(root, base_uri)

  -- version comes from the version attribute, or the Atom 1.0 namespace
  local version = (root:getAttr('version') or ''):lower()
  if version == "1.0" or root:getAttr('xmlns') == 'http://www.w3.org/2005/Atom' then
    res.version = 'atom10'
  elseif version == "0.3" then
    res.version = 'atom03'
  else
    res.version = 'atom'
  end

  for _, el in ipairs(root:getChildren('*')) do
    local tag = el:getTag()
    local el_base = rebase(el, root_base)

    --title
    if tag == 'title' or tag == 'dc:title' or tag == 'atom10:title' or tag == 'atom03:title' then
      feed.title = el:getText() --sanitize!
      --todo: feed.title_detail

    --link stuff
    elseif tag == 'link' then
      local href = el:getAttr('href')
      local link = {
        rel = el:getAttr('rel'),
        type = el:getAttr('type'),
        -- fall back to the raw attribute if resolution yields nothing
        href = resolve(href, el_base) or href,
        title = el:getAttr('title'),
      }
      tinsert(feed.links, link)

    --subtitle
    elseif tag == 'subtitle' then
      feed.subtitle = el:getText() --sanitize!
    elseif not feed.subtitle and (tag == 'tagline' or tag == 'atom03:tagline' or tag == 'dc:description') then
      -- older/alternative tags only fill in when no subtitle was seen yet
      feed.subtitle = el:getText() --sanitize!

    --rights
    elseif tag == 'copyright' or tag == 'rights' then
      feed.rights = el:getText() --sanitize!

    --generator
    elseif tag == 'generator' then
      feed.generator = el:getText() --sanitize!
    elseif tag == 'admin:generatorAgent' then
      feed.generator = feed.generator or el:getAttr('rdf:resource')

    --info
    elseif tag == 'info' then --whatever, nobody cared, anyway.
      feed.info = el:getText()

    --id
    elseif tag == 'id' then
      -- NOTE(review): the id is treated as a URL and resolved; confirm.
      feed.id = resolve(el:getText(), el_base)

    --updated
    elseif tag == 'updated' or tag == 'dc:date' or tag == 'modified' or tag == 'rss:pubDate' then
      feed.updated = el:getText()
      feed.updated_parsed = dateparser.parse(feed.updated)

    --author
    elseif tag == 'author' or tag == 'atom:author' then
      feed.author_detail = atom_person_construct(el, el_base)
      feed.author = feed.author_detail.name

    --contributors
    elseif tag == 'contributor' or tag == 'atom:contributor' then
      tinsert(feed.contributors, atom_person_construct(el, el_base))

    --icon
    elseif tag == 'icon' then
      feed.icon = resolve(el:getText(), el_base)

    --logo
    elseif tag == 'logo' then
      feed.logo = resolve(el:getText(), el_base)

    --language (an attribute on the root element takes precedence)
    elseif tag == 'language' or tag == 'dc:language' then
      feed.language = feed.language or el:getText()

    --licence
    end
  end

  --feed.link (already resolved): the first link that is an "alternate" or has
  --no rel. ipairs guarantees the links are examined in insertion order.
  for _, link in ipairs(feed.links) do
    if link.rel == 'alternate' or not link.rel or link.rel == '' then
      feed.link = link.href
      break
    end
  end

  res.entries = parse_entries(root:getChildren('entry'), 'atom', root_base)
  return res
end
|
|
|
|
--- Parse an RSS (or RDF/RSS 1.0) feed.
-- Fills `res.feed` from the channel's children (link, title, subtitle, rights,
-- generator, info, updated, author, contributors, image, language) and
-- `res.entries` from the channel's <item> children via parse_entries.
-- @param root root element of the document (<rss> or <rdf:RDF>)
-- @param base_uri (optional) URL the feed was fetched from
-- @return table with fields `feed`, `format` ('rss'), `version`
--         ('rss10' or 'rss20') and `entries`;
--         or nil, error_message when the root has no channel element
local function parse_rss(root, base_uri)
  local channel = root:getChild({'channel', 'rdf:channel'})
  -- Bail out before touching `channel`: rebase() would index a nil value.
  if not channel then return nil, "can't parse that." end
  local channel_base = rebase(channel, base_uri)

  local feed = {links = {}, contributors = {}}
  local res = {
    feed = feed,
    format = 'rss',
    entries = {},
  }

  --this isn't quite right at all.
  if root:getTag():lower() == 'rdf:rdf' then
    res.version = 'rss10'
  else
    res.version = 'rss20'
  end

  for _, el in ipairs(channel:getChildren('*')) do
    local el_base = rebase(el, channel_base)
    local tag = el:getTag()

    --link
    if tag == 'link' then
      feed.link = resolve(el:getText(), el_base)
      tinsert(feed.links, {href = feed.link})

    --title
    elseif tag == 'title' or tag == 'dc:title' then
      feed.title = el:getText() --sanitize!

    --subtitle
    elseif tag == 'description' or tag == 'dc:description' or tag == 'itunes:subtitle' then
      feed.subtitle = el:getText() --sanitize!

    --rights
    elseif tag == 'copyright' or tag == 'dc:rights' then
      feed.rights = el:getText() --sanitize!

    --generator
    elseif tag == 'generator' then
      feed.generator = el:getText()
    elseif tag == 'admin:generatorAgent' then
      feed.generator = feed.generator or el:getAttr('rdf:resource')

    --info (nobody cares...)
    elseif tag == 'feedburner:browserFriendly' then
      feed.info = el:getText()

    --updated
    elseif tag == 'pubDate' or tag == 'dc:date' or tag == 'dcterms:modified' then
      feed.updated = el:getText()
      feed.updated_parsed = dateparser.parse(feed.updated)

    --author
    elseif tag == 'managingEditor' or tag == 'dc:creator' or tag == 'itunes:author' or tag == 'dc:author' then
      feed.author = tconcat(el:getChildren('text()'))
      -- `author_detail` matches the key used by parse_atom and by entries;
      -- `author_details` is the historical key, kept for existing callers.
      feed.author_detail = {name = feed.author}
      feed.author_details = feed.author_detail
    elseif tag == 'atom:author' then
      feed.author_detail = atom_person_construct(el, el_base)
      feed.author_details = feed.author_detail
      feed.author = feed.author_detail.name

    --contributors
    elseif tag == 'dc:contributor' then
      tinsert(feed.contributors, {name = el:getText()})
    elseif tag == 'atom:contributor' then
      tinsert(feed.contributors, atom_person_construct(el, el_base))

    --image
    elseif tag == 'image' or tag == 'rdf:image' then
      feed.image = {
        -- every child is optional here, including title
        title = (el:getChild('title') or blanky):getText(),
        link = (el:getChild('link') or blanky):getText(),
        width = (el:getChild('width') or blanky):getText(),
        height = (el:getChild('height') or blanky):getText(),
      }
      local url_el = el:getChild('url')
      if url_el then
        feed.image.href = resolve(url_el:getText(), rebase(url_el, el_base))
      end

    --language
    elseif tag == 'language' or tag == 'dc:language' then
      feed.language = el:getText()

    --licence
    --publisher
    --tags
    end
  end

  res.entries = parse_entries(channel:getChildren('item'), 'rss', channel_base)
  return res
end
|
|
|
|
|
|
--- parse feed xml
-- @param xml_string feed xml, as a string
-- @param base_url (optional) source url of the feed. useful when resolving relative links found in feed contents
-- @return table with parsed feed info, or nil, error_message on error.
-- the format of the returned table is much like that on http://feedparser.org, with the major difference that
-- dates are parsed into unixtime. Most other fields are very much the same.
function feedparser.parse(xml_string, base_url)
  local lom, err = LOM.parse(xml_string)
  if not lom then
    -- `..` binds tighter than `or`, so the fallback must be parenthesised;
    -- otherwise a nil `err` raises instead of producing the message.
    return nil, "couldn't parse xml. lxp says: " .. tostring(err or "nothing")
  end

  local rootElement = XMLElement.new(lom)
  local root_tag = rootElement:getTag():lower()

  -- dispatch on the (case-insensitive) root tag
  if root_tag == 'rdf:rdf' or root_tag == 'rss' then
    return parse_rss(rootElement, base_url)
  elseif root_tag == 'feed' then
    return parse_atom(rootElement, base_url)
  else
    return nil, "unknown feed format"
  end
end
|
|
|
|
-- Backwards compatibility: on Lua versions older than 5.3 the module is also
-- published as the global `feedparser`, in addition to being returned.
local lua_version = _VERSION:sub(-3) -- last three characters of _VERSION, e.g. "5.1"
if lua_version < "5.3" then
  _G.feedparser = feedparser
end

return feedparser