# rss.py

from xml.dom import minidom
import urllib

def load(rssURL):
  """Fetch the resource at rssURL and return it parsed as a minidom Document.

  rssURL is handed unchanged to urlopen. On Python 3 that is
  urllib.request.urlopen, which requires a URL with a scheme
  (http://, file://, ...); on Python 2 it is urllib.urlopen.

  Errors raised by urlopen or by the XML parser propagate to the caller.
  """
  # Function-scope import so this works on both Python 2 and Python 3
  # without touching the module-level imports.
  try:
    from urllib.request import urlopen  # Python 3
  except ImportError:
    from urllib import urlopen  # Python 2
  stream = urlopen(rssURL)
  try:
    return minidom.parse(stream)
  finally:
    # Close the connection explicitly instead of leaving it to the
    # garbage collector, including when parsing fails.
    stream.close()

# Namespaces used by default when looking up RSS elements, in this order.
DEFAULT_NAMESPACES = (
  None,                                      # RSS 0.91, 0.92, 0.93, 0.94, 2.0
  'http://purl.org/rss/1.0/',                # RSS 1.0
  'http://my.netscape.com/rdf/simple/0.9/',  # RSS 0.90
)

def getElementsByTagName(node, tagName, possibleNamespaces=DEFAULT_NAMESPACES):
  """Return the elements named tagName found under node.

  Each namespace in possibleNamespaces is tried in order through
  node.getElementsByTagNameNS; the first non-empty result is returned
  as-is. If no namespace yields a match, an empty list is returned.
  """
  for candidate in possibleNamespaces:
    matches = node.getElementsByTagNameNS(candidate, tagName)
    if len(matches) == 0:
      # Nothing in this namespace; try the next one.
      continue
    return matches
  return []

def first(node, tagName, possibleNamespaces=DEFAULT_NAMESPACES):
  """Return the first element named tagName under node, or None.

  The lookup is delegated to getElementsByTagName with the same
  arguments; None is returned when that lookup finds nothing.
  """
  matches = getElementsByTagName(node, tagName, possibleNamespaces)
  if len(matches) == 0:
    return None
  # A falsy first match is also reported as None.
  return matches[0] or None

def textOf(node):
  """Return the text contained in node as a single string.

  Text and CDATA children are concatenated in document order. Child
  elements (e.g. inline markup) contribute their own text recursively
  rather than raising AttributeError for the missing .data attribute.
  Comments and other node types are ignored.

  Returns "" when node is None (or otherwise falsy) or holds no text.
  """
  if not node:
    return ""
  pieces = []
  for child in node.childNodes:
    kind = child.nodeType
    if kind in (child.TEXT_NODE, child.CDATA_SECTION_NODE):
      pieces.append(child.data)
    elif kind == child.ELEMENT_NODE:
      # Elements have no .data; gather the text nested inside them.
      pieces.append(textOf(child))
  return "".join(pieces)

# Namespace tuple for Dublin Core elements; the script section passes it
# when looking up 'date' and 'creator'.
DUBLIN_CORE = (
  'http://purl.org/dc/elements/1.1/',
)

if __name__ == '__main__':
  # Script mode: print title, link, description, date and author for
  # every <item> of the feed whose URL is given as the first argument.
  import sys

  if len(sys.argv) < 2:
    # Exit with a readable message instead of an IndexError traceback.
    sys.exit('usage: %s <rss-url>' % sys.argv[0])

  rssDocument = load(sys.argv[1])

  # (label printed, element name, namespaces to search)
  fields = (
    ('title', 'title', DEFAULT_NAMESPACES),
    ('link', 'link', DEFAULT_NAMESPACES),
    ('description', 'description', DEFAULT_NAMESPACES),
    ('date', 'date', DUBLIN_CORE),
    ('author', 'creator', DUBLIN_CORE),
  )

  for item in getElementsByTagName(rssDocument, 'item'):
    for label, tagName, namespaces in fields:
      # A single parenthesised argument prints identically whether
      # print is a statement (Python 2) or a function (Python 3).
      print('%s: %s' % (label, textOf(first(item, tagName, namespaces))))
    # Blank line between items.
    print('')