Monday, 26 September 2011

XML Parsing: Feed Parsing & DOM Parsing

>>> import feedparser
>>> d = feedparser.parse("http://feedparser.org/docs/examples/atom10.xml")
>>> d['feed']['title']             # feed data is a dictionary
u'Sample Feed'
>>> d.feed.title                   # get values attr-style or dict-style
u'Sample Feed'
>>> d.channel.title                # use RSS or Atom terminology anywhere
u'Sample Feed'
>>> d.feed.link                    # resolves relative links
u'http://example.org/'
>>> d.feed.subtitle                 # parses escaped HTML
u'For documentation <em>only</em>'
>>> d.channel.description          # RSS terminology works here too
u'For documentation <em>only</em>'
>>> len(d['entries'])              # entries are a list
1
>>> d['entries'][0]['title']       # each entry is a dictionary
u'First entry title'
>>> d.entries[0].title             # attr-style works here too
u'First entry title'
>>> d['items'][0].title            # RSS terminology works here too
u'First entry title'
>>> e = d.entries[0]
>>> e.link                         # easy access to alternate link
u'http://example.org/entry/3'
>>> e.links[1].rel                 # full access to all Atom links
u'related'
>>> e.links[0].href                # resolves relative links here too
u'http://example.org/entry/3'
>>> e.author_detail.name           # author data is a dictionary
u'Mark Pilgrim'
>>> e.updated_parsed              # parses all date formats
(2005, 11, 9, 11, 56, 34, 2, 313, 0)
>>> e.content[0].value             # sanitizes dangerous HTML
u'<div>Watch out for <em>nasty tricks</em></div>'
>>> d.version                      # reports feed type and version
u'atom10'
>>> d.encoding                     # auto-detects character encoding
u'utf-8'
>>> d.headers.get('Content-type')  # full access to all HTTP headers
u'application/xml' 
 
 
XML DOM Parsing. 
import contextlib
try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3: urlopen lives in urllib.request

from xml.dom import minidom

# Sample schedule document: a <Schedule> root holding four <Game> elements
# whose data is carried entirely in attributes (gameId, Week, GameDate,
# AwayTeam, HomeTeam, GameTime). Parsed from memory with minidom.parseString.
src = """<Schedule Season="2010" Timezone="Eastern">
  <Game gameId="1" Week="1" GameDate="2010-09-09" AwayTeam="MIN" HomeTeam="NO" GameTime="8:30 PM"/>
  <Game gameId="2" Week="1" GameDate="2010-09-12" AwayTeam="MIA" HomeTeam="BUF" GameTime="1:00 PM"/>
  <Game gameId="3" Week="1" GameDate="2010-09-12" AwayTeam="DET" HomeTeam="CHI" GameTime="1:00 PM"/>
  <Game gameId="4" Week="1" GameDate="2010-09-12" AwayTeam="OAK" HomeTeam="TEN" GameTime="1:00 PM"/>
</Schedule>
"""

def test_print(dom):
    for node in dom.getElementsByTagName('Game'):
        print node.getAttribute('AwayTeam'),
        print node.getAttribute('HomeTeam'),
        print node.getAttribute('Week'),
        print node.getAttribute('gameId'),
        print node.getAttribute('GameDate'),
        print node.getAttribute('GameTime')
    print ''

# 1. Parse XML that is already held in memory as a string.
dom = minidom.parseString(src)
test_print(dom)

# 2. Parse a file by passing its path.
dom = minidom.parse('data.xml')
test_print(dom)

# 3. Parse an already-open file object. The `with` block closes the handle
#    even if parsing raises; binary mode hands the raw bytes to the parser so
#    it can apply the document's own encoding declaration.
with open('data.xml', 'rb') as f:
    dom = minidom.parse(f)
test_print(dom)

# 4. Parse straight from an HTTP response (any object with read() works).
#    contextlib.closing releases the connection once the document is parsed,
#    on both Python 2 and Python 3.
url = 'http://api.fantasyfootballnerd.com/ffnScheduleXML.php?apiKey=1'
with contextlib.closing(urllib2.urlopen(url)) as response:
    dom = minidom.parse(response)
test_print(dom)