[wikia-pywikibot] / BeautifulSoup.py Repository:

Annotation of /BeautifulSoup.py

Parent Directory Parent Directory Revision Log Revision Log


Revision 4 - View Download

1 : dantman 4 """Beautiful Soup
2 :     Elixir and Tonic
3 :     "The Screen-Scraper's Friend"
4 :     http://www.crummy.com/software/BeautifulSoup/
5 :    
6 :     Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 :     tree representation. It provides methods and Pythonic idioms that make
8 :     it easy to navigate, search, and modify the tree.
9 :    
10 :     A well-formed XML/HTML document yields a well-formed data
11 :     structure. An ill-formed XML/HTML document yields a correspondingly
12 :     ill-formed data structure. If your document is only locally
13 :     well-formed, you can use this library to find and process the
14 :     well-formed part of it.
15 :    
16 :     Beautiful Soup works with Python 2.2 and up. It has no external
17 :     dependencies, but you'll have more success at converting data to UTF-8
18 :     if you also install these three packages:
19 :    
20 :     * chardet, for auto-detecting character encodings
21 :     http://chardet.feedparser.org/
22 :     * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 :     by stock Python.
24 :     http://cjkpython.i18n.org/
25 :    
26 :     Beautiful Soup defines classes for two main parsing strategies:
27 :    
28 :     * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 :     language that kind of looks like XML.
30 :    
31 :     * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 :     or invalid. This class has web browser-like heuristics for
33 :     obtaining a sensible parse tree in the face of common HTML errors.
34 :    
35 :     Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 :     the encoding of an HTML or XML document, and converting it to
37 :     Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38 :    
39 :     For more than you ever wanted to know about Beautiful Soup, see the
40 :     documentation:
41 :     http://www.crummy.com/software/BeautifulSoup/documentation.html
42 :    
43 :     Here, have some legalese:
44 :    
45 :     Copyright (c) 2004-2007, Leonard Richardson
46 :    
47 :     All rights reserved.
48 :    
49 :     Redistribution and use in source and binary forms, with or without
50 :     modification, are permitted provided that the following conditions are
51 :     met:
52 :    
53 :     * Redistributions of source code must retain the above copyright
54 :     notice, this list of conditions and the following disclaimer.
55 :    
56 :     * Redistributions in binary form must reproduce the above
57 :     copyright notice, this list of conditions and the following
58 :     disclaimer in the documentation and/or other materials provided
59 :     with the distribution.
60 :    
61 :     * Neither the name of the the Beautiful Soup Consortium and All
62 :     Night Kosher Bakery nor the names of its contributors may be
63 :     used to endorse or promote products derived from this software
64 :     without specific prior written permission.
65 :    
66 :     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 :     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 :     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 :     A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 :     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 :     EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 :     PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 :     PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 :     LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 :     NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 :     SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77 :    
78 :     """
79 :     from __future__ import generators
80 :    
# Module metadata: author contact, release number, copyright line and
# licence name for this copy of the library.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "3.0.5"
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
__license__ = "New-style BSD"
85 :    
86 :     from sgmllib import SGMLParser, SGMLParseError
87 :     import codecs
88 :     import types
89 :     import re
90 :     import sgmllib
91 :     try:
92 :     from htmlentitydefs import name2codepoint
93 :     except ImportError:
94 :     name2codepoint = {}
95 :    
#This hack makes Beautiful Soup able to parse XML with namespaces.
#It replaces sgmllib's module-level tag-name pattern with one that also
#accepts ':' inside a name. NOTE: the assignment rebinds an attribute of
#the imported sgmllib module itself, so it is visible to every other
#user of sgmllib in the same process.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

# Encoding used as the default `encoding` argument of the __str__,
# __repr__, prettify and renderContents methods defined below.
DEFAULT_OUTPUT_ENCODING = "utf-8"
100 :    
101 :     # First, the classes that represent markup elements.
102 :    
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Each element carries two sets of links:

     * parent / previousSibling / nextSibling describe the tree.
     * previous / next chain every element in document order.

    The tree-editing methods (replaceWith, insert, append) additionally
    read and write self.contents, so they are only usable on elements
    that define a `contents` list.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If `parent` already has children, this element is linked as the
        next sibling of the parent's current last child. The element is
        NOT added to parent.contents here.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _childIndex(self, child):
        """Returns the index of `child` in self.contents, or None.

        The comparison is by identity. list.index() and list.remove()
        compare with ==, and elements may define == structurally, so
        they can pick an identical-looking sibling instead of the
        element that was actually asked for.
        """
        for i, candidate in enumerate(self.contents):
            if candidate is child:
                return i
        return None

    def replaceWith(self, replaceWith):
        """Puts `replaceWith` in the tree where this element is, and
        detaches this element.

        `replaceWith` may be a plain string, a new element, or an
        element that is already somewhere in the tree (including a
        sibling of this element).

        Raises ValueError if this element is not actually listed in its
        parent's contents, and AttributeError if it has no parent.
        """
        oldParent = self.parent
        myIndex = oldParent._childIndex(self)
        if myIndex is None:
            raise ValueError("element is not among its parent's contents")
        self.extract()
        # No index fix-up is needed here when replaceWith is an earlier
        # sibling: insert() already compensates for the slot that the
        # moved child vacates.
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            index = self.parent._childIndex(self)
            # A stale parent pointer (element no longer listed) is
            # tolerated, as it was before.
            if index is not None:
                del self.parent.contents[index]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        # Identity tests rather than truthiness: a piece of text can be
        # empty, and an empty string is false.
        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts `newChild` into self.contents at index `position`.

        A plain string is wrapped in a NavigableString first. A
        `position` past the end appends. If `newChild` is already in a
        tree it is extracted from there first; when it is already a
        child of this element, `position` is taken to refer to the
        child list as it is before the move.
        """
        if isinstance(newChild, basestring) \
           and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if getattr(newChild, 'parent', None) is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self._childIndex(newChild)
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element following the new child in document order is
            # the next sibling of the nearest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None: # This is the last element in the document.
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            nextChild.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # Keyword criteria used to be dropped here, so e.g.
        # findNext(id='x') matched on name/attrs/text only.
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls `method` with a limit of 1 and returns its single
        result, or None when nothing matched."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        `name` may be a ready-made SoupStrainer; otherwise one is built
        from name, attrs, text and the keyword criteria. Matching stops
        once `limit` results have been collected (no limit if falsy).
        """

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators end by yielding None; skip it (and any
            # other false element) rather than searching it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #
    #Each one yields the neighbours in order and then, as its final
    #item, the None that marks the end of the chain. The loops test for
    #None explicitly so that an empty (false) string element does not
    #cut the walk short.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Returns `str` with every %SOUP-ENCODING% placeholder replaced
        by `encoding` ("utf-8" when no encoding is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding the result is `s` (converted with str() first
        if it is not a string) encoded to that encoding; without one
        the result is a Unicode string."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
389 :    
class NavigableString(unicode, PageElement):
    """A piece of text in the tree: a Unicode string that also carries
    the PageElement navigation links."""

    def __getnewargs__(self):
        # NOTE(review): this hands back the text encoded with the
        # default output encoding; presumably re-creating the object
        # from those bytes only works for ASCII text -- confirm before
        # relying on copy/pickle with non-ASCII content.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError("'%s' object has no attribute '%s'"
                                 % (self.__class__.__name__, attr))

    def __unicode__(self):
        """Returns the rendered form of this string as Unicode."""
        # str(self) is encoded with DEFAULT_OUTPUT_ENCODING, so it has to
        # be decoded with that same encoding. A bare unicode(...) would
        # use the interpreter's default codec and fail on non-ASCII text.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text encoded in `encoding`, or the Unicode
        object itself when `encoding` is None (or otherwise false)."""
        if encoding:
            return self.encode(encoding)
        else:
            return self
412 :    
class CData(NavigableString):
    """Text that is rendered inside a <![CDATA[...]]> wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the bare text first, then put the wrapper around it.
        body = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % body
417 :    
class ProcessingInstruction(NavigableString):
    """Text that is rendered between <? and ?>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # A %SOUP-ENCODING% placeholder in the text is replaced by the
        # encoding the output is being rendered in.
        text = self
        if text.find("%SOUP-ENCODING%") != -1:
            text = self.substituteEncoding(text, encoding)
        rendered = self.toEncoding(text, encoding)
        return "<?%s?>" % rendered
424 :    
class Comment(NavigableString):
    """Text that is rendered between <!-- and -->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % body
428 :    
class Declaration(NavigableString):
    """Text that is rendered between <! and >."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        body = NavigableString.__str__(self, encoding)
        return "<!%s>" % body
432 :    
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    Attributes are kept twice: `attrs` is the ordered list of
    (name, value) pairs, and `attrMap` is a dictionary view of it that
    is built on first use by _getAttrMap(). __setitem__ and __delitem__
    keep the two in step.
    """

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    # An entity reference inside an attribute value: numeric (decimal
    # or hex) or named. Group 1 is the text between '&' and ';'.
    _ENTITY_REFERENCE = re.compile(r"&(#\d+|#x[0-9a-fA-F]+|\w+);")

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities
            try:
                if len(x) > 1 and x[1] == 'x':
                    return unichr(int(x[2:], 16))
                else:
                    return unichr(int(x[1:]))
            except (ValueError, OverflowError):
                # The number is not a code point unichr() can produce.
                # Leave the reference as written instead of aborting.
                return u'&%s;' % x

        elif self.escapeUnrecognizedEntities:
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        `parser` supplies isSelfClosingTag() and the entity-conversion
        flags; `attrs` is a list of (name, value) pairs whose values
        have their entity references converted.
        """

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        self.attrs = [(k, self._ENTITY_REFERENCE.sub(self._convertEntities,
                                                     val))
                      for (k, val) in self.attrs]

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Tells whether the tag has a 'key' attribute."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "x in tag tests x against the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        attrMap = self._getAttrMap()
        found = False
        for i, (name, _) in enumerate(self.attrs):
            if name == key:
                # Every occurrence is updated: bad HTML can define the
                # same attribute more than once.
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        attrMap[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Rebuild the list in place instead of removing while iterating:
        # removing during iteration skips the element after each hit, so
        # a repeated attribute (bad HTML can define the same attribute
        # multiple times) was not fully deleted.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.foo and tag.fooTag both return the first 'foo' tag found
        beneath this one (or None). Names starting with a double
        underscore are never looked up this way and raise
        AttributeError."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.find(tag[:-3])
        elif not tag.startswith('__'):
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(None)

    # '<', '>', or an '&' that does not start an entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           + r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + r")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # A numeric reference is used because it is
                            # understood by both HTML and XML
                            # consumers; the previous "&squot;" is not
                            # a defined entity.
                            val = val.replace("'", "&#39;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # A hidden tag renders as its contents only.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with pretty-printing switched on."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        "Pre-3.x spelling of findAll(text=...)."
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        "Pre-3.x spelling of find(text=...)."
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly. A plain attribute
        # lookup of a not-yet-created 'attrMap' would fall through to
        # __getattr__ and search the whole subtree for a tag of that
        # name. As before, an empty map is rebuilt on every call.
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        # Iterating the list itself, rather than a precomputed index
        # range, means the walk simply ends if children are removed
        # while it is in progress. The generator finishes by returning:
        # an explicit `raise StopIteration` inside a generator is an
        # error under PEP 479.
        for child in self.contents:
            yield child

    def recursiveChildGenerator(self):
        "Yields every element beneath this tag, depth-first, in order."
        # Explicit stack of (tag, index of the next child to visit).
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Descend into `a` now; remember where to resume
                        # in the current tag if it has children left.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
786 :    
787 :     # Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Record the criteria an element has to satisfy.

        name   -- criterion for the tag name.
        attrs  -- mapping of attribute name to criterion; a plain string
                  is shorthand for a criterion on the 'class' attribute.
        text   -- criterion for text nodes.
        kwargs -- further attribute criteria, merged over ``attrs``.

        See _matches() for the kinds of criterion that are understood.
        """
        self.name = name
        if isString(attrs):
            kwargs['class'] = attrs
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        elif not attrs:
            # Always keep a dictionary of our own: searchTag() calls
            # self.attrs.items(), and the shared default argument must not
            # end up stored on (and mutated through) an instance.
            attrs = {}
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Match a tag against the name and attribute criteria.

        ``markupName`` is either a Tag object or a tag name; in the latter
        case ``markupAttrs`` supplies the attributes, as a mapping or as a
        sequence of (key, value) pairs.  Returns the Tag (or the name that
        was passed in) on a match, None otherwise.
        """
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name criterion is called with (name, attrs) when we
        # were not handed a Tag object.
        callFunctionWithTagData = callable(self.name) \
                                  and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    # Build the attribute lookup once, on first use.
                    if markupAttrMap is None:
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Match ``markup`` (a list, a Tag or a string) against this
        strainer.  Returns the matching element, or None.

        Raises Exception when ``markup`` is of none of those kinds.
        """
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if isList(markup) and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isString(markup):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Test one value (a Tag, a string or None) against one criterion.

        The criterion may be True (anything but None matches), a callable
        (called with ``markup``), an object with a ``match`` attribute
        (treated as a compiled regular expression and searched), a list or
        mapping (membership test), or any other value (compared for
        equality).
        """
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isString(markup):
                markup = unicode(markup)
            #Now we know that markup is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif isList(matchAgainst):
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # It's a mapping: the string is the key to look for.
                # (Asking the string for has_key() can only fail.)
                result = matchAgainst.has_key(markup)
            elif matchAgainst and isString(markup):
                # Coerce the criterion to the same string type as the
                # markup before the equality test below.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
906 :    
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Initialise this instance, not a throwaway temporary list.
        list.__init__(self)
        self.source = source
913 :    
914 :     # Now, some helper functions.
915 :    
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike.

    Anything that has an ``__iter__`` attribute counts as listlike, as
    does any list or tuple."""
    return hasattr(l, '__iter__') or isinstance(l, (list, tuple))
921 :    
def isString(s):
    """Tell whether ``s`` is stringlike, on any 2.x version of Python.

    Interpreters that lack the ``unicode`` or ``basestring`` names raise
    NameError on the first test; the check then falls back to plain
    ``str``."""
    try:
        stringlike = isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        stringlike = isinstance(s, str)
    return stringlike
929 :    
def buildTagMap(default, *args):
    """Fold any number of maps, lists and scalars into one dictionary.

    Entries of a map are copied as they are; each item of a list, and
    each scalar, becomes a key mapped to ``default``.  Later arguments
    override earlier ones.  Used to build the SELF_CLOSING_TAGS,
    NESTABLE_TAGS, and NESTING_RESET_TAGS maps out of lists and partial
    maps."""
    tagMap = {}
    for piece in args:
        if hasattr(piece, 'items'):
            # A map: merge its entries.
            for name, value in piece.items():
                tagMap[name] = value
            continue
        if isList(piece):
            names = piece
        else:
            # A scalar is handled as a one-item list.
            names = [piece]
        for name in names:
            tagMap[name] = default
    return tagMap
948 :    
949 :     # Now, the parser classes.
950 :    
951 :     class BeautifulStoneSoup(Tag, SGMLParser):
952 :    
953 :     """This class contains the basic parser and search code. It defines
954 :     a parser that knows nothing about tag behavior except for the
955 :     following:
956 :    
957 :     You can't close a tag without closing all the tags it encloses.
958 :     That is, "<foo><bar></foo>" actually means
959 :     "<foo><bar></bar></foo>".
960 :    
961 :     [Another possible explanation is "<foo><bar /></foo>", but since
962 :     this class defines no SELF_CLOSING_TAGS, it will never use that
963 :     explanation.]
964 :    
965 :     This class is useful for parsing XML or made-up markup languages,
966 :     or when BeautifulSoup makes an assumption counter to what you were
967 :     expecting."""
968 :    
# Tag-behaviour tables.  All of them are empty in this class.
SELF_CLOSING_TAGS = {}
NESTABLE_TAGS = {}
RESET_NESTING_TAGS = {}
QUOTE_TAGS = {}

# (compiled regex, replacement function) pairs that _feed() applies to
# the markup before parsing.  The first inserts a space before "/>", the
# second removes whitespace directly after "<!".  Raw strings keep the
# backslash in "\s" for the regex engine.
MARKUP_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
                   lambda x: x.group(1) + ' />'),
                  (re.compile(r'<!\s+([^<>]*)>'),
                   lambda x: '<!' + x.group(1) + '>')
                  ]

ROOT_TAG_NAME = u'[document]'

# Accepted values for convertEntities / smartQuotesTo.
HTML_ENTITIES = "html"
XML_ENTITIES = "xml"
XHTML_ENTITIES = "xhtml"
# TODO: This only exists for backwards-compatibility
ALL_ENTITIES = XHTML_ENTITIES

# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
993 :    
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
             markupMassage=True, smartQuotesTo=XML_ENTITIES,
             convertEntities=None, selfClosingTags=None):
    """The Soup object is initialized as the 'root tag', and the
    provided markup (which can be a string or a file-like object)
    is fed into the underlying parser.

    sgmllib will process most bad HTML, and the BeautifulSoup
    class has some tricks for dealing with some HTML that kills
    sgmllib, but Beautiful Soup can nonetheless choke or lose data
    if your data uses self-closing tags or declarations
    incorrectly.

    By default, Beautiful Soup uses regexes to sanitize input,
    avoiding the vast majority of these problems. If the problems
    don't apply to you, pass in False for markupMassage, and
    you'll get better performance.

    The default parser massage techniques fix the two most common
    instances of invalid HTML that choke sgmllib:

     <br/> (No space between name of closing tag and tag close)
     <! --Comment--> (Extraneous whitespace in declaration)

    You can pass in a custom list of (RE object, replace method)
    tuples to get Beautiful Soup to scrub your input the way you
    want.

    Other arguments:

    parseOnlyThese  -- optional strainer consulted for top-level text
                       and tags (see endData and unknown_starttag).
    fromEncoding    -- encoding to try first when the markup is
                       converted to Unicode.
    smartQuotesTo   -- handed to UnicodeDammit; forced to None whenever
                       convertEntities is set.
    convertEntities -- one of HTML_ENTITIES, XHTML_ENTITIES or
                       XML_ENTITIES, or a false value for no conversion.
    selfClosingTags -- extra self-closing tag names for this instance."""

    self.parseOnlyThese = parseOnlyThese
    self.fromEncoding = fromEncoding
    self.smartQuotesTo = smartQuotesTo
    self.convertEntities = convertEntities

    # Set the rules for how we'll deal with the entities we
    # encounter.  Start with every conversion switched off so the
    # flags are defined whatever value convertEntities has.
    self.convertXMLEntities = False
    self.convertHTMLEntities = False
    self.escapeUnrecognizedEntities = False
    if self.convertEntities:
        # It doesn't make sense to convert encoded characters to
        # entities even while you're converting entities to Unicode.
        # Just convert it all to Unicode.
        self.smartQuotesTo = None
        if convertEntities == self.HTML_ENTITIES:
            self.convertHTMLEntities = True
            self.escapeUnrecognizedEntities = True
        elif convertEntities == self.XHTML_ENTITIES:
            self.convertXMLEntities = True
            self.convertHTMLEntities = True
        elif convertEntities == self.XML_ENTITIES:
            self.convertXMLEntities = True

    self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
    SGMLParser.__init__(self)

    if hasattr(markup, 'read'):        # It's a file-type object.
        markup = markup.read()
    self.markup = markup
    self.markupMassage = markupMassage
    try:
        self._feed()
    except StopParsing:
        # Raised to abandon a parse pass; nothing more to do here.
        pass
    self.markup = None                 # The markup can now be GCed
1062 :    
def convert_charref(self, name):
    """Convert a decimal character reference in the ASCII range only.

    Returns ``self.convert_codepoint(n)`` for a reference whose number
    is between 0 and 127, and None for anything else, non-numeric names
    included.  This method fixes a bug in Python's SGMLParser."""
    try:
        codepoint = int(name)
    except ValueError:
        return None
    # ASCII ends at 127, not 255
    if codepoint < 0 or codepoint > 127:
        return None
    return self.convert_codepoint(codepoint)
1072 :    
def _feed(self, inDocumentEncoding=None):
    """Convert ``self.markup`` to Unicode, massage it and parse it.

    ``inDocumentEncoding`` is offered to UnicodeDammit as a second
    candidate encoding, after ``self.fromEncoding``.  The object is
    reset before the markup goes to SGMLParser; afterwards pending text
    is flushed and every tag still open is closed.
    """
    document = self.markup
    if not isinstance(document, unicode):
        converter = UnicodeDammit(document,
                                  [self.fromEncoding, inDocumentEncoding],
                                  smartQuotesTo=self.smartQuotesTo)
        document = converter.unicode
        self.originalEncoding = converter.originalEncoding
    elif not hasattr(self, 'originalEncoding'):
        # Already Unicode: there is no original encoding to report.
        self.originalEncoding = None

    if document and self.markupMassage:
        # Any non-list true value selects the default massage rules.
        if not isList(self.markupMassage):
            self.markupMassage = self.MARKUP_MASSAGE
        for pattern, replacement in self.markupMassage:
            document = pattern.sub(replacement, document)
        # TODO: We get rid of markupMassage so that the
        # soup object can be deepcopied later on. Some
        # Python installations can't copy regexes. If anyone
        # was relying on the existence of markupMassage, this
        # might cause problems.
        # NOTE(review): after this deletion a second _feed() pass no
        # longer finds markupMassage on the instance -- confirm whether
        # a re-parse is meant to skip the massage step.
        del self.markupMassage
    self.reset()

    SGMLParser.feed(self, document)
    # Close out any unfinished strings and close all the open tags.
    self.endData()
    while self.currentTag.name != self.ROOT_TAG_NAME:
        self.popTag()
1104 :    
def __getattr__(self, methodName):
    """This method routes method call requests to either the SGMLParser
    superclass or the Tag superclass, depending on the method name.

    Names starting with 'start_', 'end_' or 'do_' go to SGMLParser;
    other names that do not start with a double underscore go to Tag.
    Anything else raises AttributeError."""
    #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

    if methodName.startswith('start_') or methodName.startswith('end_') \
           or methodName.startswith('do_'):
        return SGMLParser.__getattr__(self, methodName)
    elif not methodName.startswith('__'):
        return Tag.__getattr__(self, methodName)
    else:
        # Name the missing attribute so the failure can be diagnosed.
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, methodName))
1117 :    
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser.

    Both the class-wide SELF_CLOSING_TAGS table and this instance's
    instanceSelfClosingTags table are consulted."""
    return name in self.SELF_CLOSING_TAGS \
           or name in self.instanceSelfClosingTags
1123 :    
def reset(self):
    """Re-initialise this object as an empty, hidden root tag and clear
    all parser state, leaving the object itself as the only entry on
    the tag stack."""
    rootName = self.ROOT_TAG_NAME
    Tag.__init__(self, self, rootName)
    self.hidden = 1
    SGMLParser.reset(self)
    # Fresh, empty parse state.  currentTag has to be None before the
    # root is pushed so that pushTag() does not append it to a parent.
    self.currentTag = None
    self.currentData, self.tagStack, self.quoteStack = [], [], []
    self.pushTag(self)
1133 :    
def popTag(self):
    """Remove the top entry of the tag stack and return the tag that is
    current afterwards.

    Before the switch, the tag that is current at call time gets a
    ``string`` attribute if its contents consist of exactly one
    NavigableString, so that soup.tag.string is shorthand for
    soup.tag.contents[0].
    """
    self.tagStack.pop()
    closing = self.currentTag
    children = closing.contents
    if len(children) == 1 and isinstance(children[0], NavigableString):
        closing.string = children[0]

    # With an empty stack the current tag is left as it was.
    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
1147 :    
def pushTag(self, tag):
    """Make ``tag`` the current tag.

    The tag is appended to the contents of the tag that was current so
    far (if there is one) and pushed onto the tag stack.
    """
    parent = self.currentTag
    if parent:
        parent.contents.append(tag)
    self.tagStack.append(tag)
    self.currentTag = tag
1154 :    
def endData(self, containerClass=NavigableString):
    """Turn the buffered character data into one string node.

    The chunks collected in ``self.currentData`` are joined, wrapped in
    ``containerClass`` and appended to the current tag.  Nothing happens
    when the buffer is empty.
    """
    if not self.currentData:
        return
    text = ''.join(self.currentData)
    if not text.translate(self.STRIP_ASCII_SPACES):
        # Only ASCII whitespace: collapse it to a single newline or
        # a single space.
        if '\n' in text:
            text = '\n'
        else:
            text = ' '
    self.currentData = []

    # With a strainer in force, top-level text is dropped unless the
    # strainer has a text criterion and the text satisfies it.
    strainer = self.parseOnlyThese
    if strainer and len(self.tagStack) <= 1 and \
           (not strainer.text or not strainer.search(text)):
        return

    node = containerClass(text)
    node.setup(self.currentTag, self.previous)
    # Link the new node in after the previously created element.
    if self.previous:
        self.previous.next = node
    self.previous = node
    self.currentTag.contents.append(node)
1174 :    
1175 :    
def _popToTag(self, name, inclusivePop=True):
    """Pop the tag stack back to the most recent tag called ``name``.

    With ``inclusivePop`` true that tag is popped as well; otherwise
    popping stops just short of it.  Returns whatever the last popTag()
    call returned, or None when nothing was popped.  The root tag is
    never popped."""
    #print "Popping to %s" % name
    if name == self.ROOT_TAG_NAME:
        return None

    depth = len(self.tagStack)
    numPops = 0
    # Search from the top of the stack downwards; index 0 (the root)
    # is not examined.
    index = depth - 1
    while index > 0:
        if self.tagStack[index].name == name:
            numPops = depth - index
            break
        index -= 1
    if not inclusivePop:
        numPops -= 1

    mostRecentTag = None
    while numPops > 0:
        mostRecentTag = self.popTag()
        numPops -= 1
    return mostRecentTag
1197 :    
def _smartPop(self, name):

    """We need to pop up to the previous tag of this type, unless
    one of this tag's nesting reset triggers comes between this
    tag and the previous tag of this type, OR unless this tag is a
    generic nesting trigger and another generic nesting trigger
    comes between this tag and the previous tag of this type.

    Examples:
     <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
     <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
     <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

     <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
     <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
     <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
    """

    nestingResetTriggers = self.NESTABLE_TAGS.get(name)
    isNestable = nestingResetTriggers is not None
    isResetNesting = name in self.RESET_NESTING_TAGS
    popTo = None
    inclusive = True
    # Walk the stack from the innermost open tag outwards; the root at
    # index 0 is never considered.
    for i in range(len(self.tagStack)-1, 0, -1):
        p = self.tagStack[i]
        if (not p or p.name == name) and not isNestable:
            #Non-nestable tags get popped to the top or to their
            #last occurrence.
            popTo = name
            break
        if isNestable:
            resetsNesting = p.name in nestingResetTriggers
        else:
            resetsNesting = isResetNesting \
                            and p.name in self.RESET_NESTING_TAGS
        if resetsNesting:
            #If we encounter one of the nesting reset triggers
            #peculiar to this tag, or we encounter another tag
            #that causes nesting to reset, pop up to but not
            #including that tag.
            popTo = p.name
            inclusive = False
            break
    if popTo:
        self._popToTag(popTo, inclusive)
1243 :    
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Handle a start tag and return the Tag created for it.

    Inside a quoted section (non-empty quoteStack) the tag is not real:
    it is turned back into text and None is returned.  None is also
    returned when a parseOnlyThese strainer rejects a top-level tag.
    """
    #print "Start tag %s: %s" % (name, attrs)
    if self.quoteStack:
        #This is not a real tag.
        #print "<%s> is not real!" % name
        attrs = ''.join([' %s="%s"' % (x, y) for (x, y) in attrs])
        self.handle_data('<%s%s>' % (name, attrs))
        return
    self.endData()

    if not self.isSelfClosingTag(name) and not selfClosing:
        self._smartPop(name)

    # With a strainer in force, a top-level tag is kept only if the
    # strainer has no text criterion and accepts the tag.
    if self.parseOnlyThese and len(self.tagStack) <= 1 \
           and (self.parseOnlyThese.text
                or not self.parseOnlyThese.searchTag(name, attrs)):
        return

    tag = Tag(self, name, attrs, self.currentTag, self.previous)
    if self.previous:
        self.previous.next = tag
    self.previous = tag
    self.pushTag(tag)
    if selfClosing or self.isSelfClosingTag(name):
        self.popTag()
    if name in self.QUOTE_TAGS:
        #print "Beginning quote (%s)" % name
        self.quoteStack.append(name)
        self.literal = 1
    return tag
1273 :    
def unknown_endtag(self, name):
    """Handle an end tag.

    Inside a quoted section, an end tag other than the one that closes
    the quote is turned back into text.  Otherwise pending text is
    flushed, the tag stack is popped back through ``name``, and a quote
    opened by ``name`` is closed.
    """
    #print "End tag %s" % name
    insideQuote = bool(self.quoteStack)
    if insideQuote and self.quoteStack[-1] != name:
        #This is not a real end tag.
        #print "</%s> is not real!" % name
        self.handle_data('</%s>' % name)
        return
    self.endData()
    self._popToTag(name)
    if self.quoteStack and self.quoteStack[-1] == name:
        self.quoteStack.pop()
        # Stay in literal mode only while some quote is still open.
        self.literal = len(self.quoteStack) != 0
1286 :    
def handle_data(self, data):
    """Buffer a chunk of character data; endData() joins the buffered
    chunks into a single string later."""
    pending = self.currentData
    pending.append(data)
1289 :    
def _toStringSubclass(self, text, subclass):
    """Add ``text`` to the tree as an instance of ``subclass`` (a
    NavigableString subclass).

    Text buffered so far is flushed first, so that ``text`` ends up in
    a node of its own."""
    self.endData()            # flush what is pending, as ordinary text
    self.handle_data(text)    # buffer just the special text ...
    self.endData(subclass)    # ... and flush it as the requested class
1296 :    
def handle_pi(self, text):
    """Store a processing instruction as a ProcessingInstruction object.

    An instruction whose text starts with "xml" is replaced by a fixed
    XML declaration containing a %SOUP-ENCODING% slot, into which an
    encoding will be plugged later."""
    if text.startswith("xml"):
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
1304 :    
def handle_comment(self, text):
    """Store the text of a comment in the tree as a Comment object."""
    self._toStringSubclass(text, Comment)
1308 :    
def handle_charref(self, ref):
    """Handle character references as data.

    With entity conversion enabled, a decimal ("65") or hexadecimal
    ("x41") reference is replaced by the character it denotes.  A
    reference that is malformed, or whose code point cannot be turned
    into a character, is passed through verbatim as "&#...;" -- as is
    every reference when conversion is off."""
    data = None
    if self.convertEntities:
        try:
            if ref[:1] in ('x', 'X'):
                codepoint = int(ref[1:], 16)
            else:
                codepoint = int(ref)
            data = unichr(codepoint)
        except (ValueError, OverflowError):
            # Not a usable number: fall back to the literal reference
            # instead of aborting the whole parse.
            data = None
    if data is None:
        data = '&#%s;' % ref
    self.handle_data(data)
1316 :    
def handle_entityref(self, ref):
    """Handle an entity reference as data, converting known HTML
    and/or XML entity references to the corresponding Unicode
    characters when the matching conversion flags are set."""
    data = None
    if self.convertHTMLEntities:
        codepoint = name2codepoint.get(ref)
        if codepoint is not None:
            data = unichr(codepoint)

    if not data:
        if self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
               not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&amp;%s" % ref

        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
    self.handle_data(data)
1359 :    
def handle_decl(self, data):
    """Store a declaration (a DOCTYPE and the like) in the tree as a
    Declaration object."""
    self._toStringSubclass(data, Declaration)
1363 :    
def parse_declaration(self, i):
    """Parse the declaration starting at offset ``i`` of the raw data
    and return the offset at which parsing should continue.

    A CDATA section becomes a CData object; an unterminated one runs to
    the end of the data.  Any other declaration goes to SGMLParser, and
    if that raises SGMLParseError the rest of the data is treated as
    plain text."""
    rawdata = self.rawdata
    if rawdata[i:i+9] == '<![CDATA[':
        end = rawdata.find(']]>', i)
        if end == -1:
            end = len(rawdata)
        self._toStringSubclass(rawdata[i+9:end], CData)
        # Continue after the "]]>" terminator.
        return end + 3

    try:
        return SGMLParser.parse_declaration(self, i)
    except SGMLParseError:
        remainder = self.rawdata[i:]
        self.handle_data(remainder)
        return i + len(remainder)
1383 :    
1384 :     class BeautifulSoup(BeautifulStoneSoup):
1385 :    
1386 :     """This parser knows the following facts about HTML:
1387 :    
1388 :     * Some tags have no closing tag and should be interpreted as being
1389 :     closed as soon as they are encountered.
1390 :    
1391 :     * The text inside some tags (ie. 'script') may contain tags which
1392 :     are not really part of the document and which should be parsed
1393 :     as text, not tags. If you want to parse the text as tags, you can
1394 :     always fetch it and parse it explicitly.
1395 :    
1396 :     * Tag nesting rules:
1397 :    
1398 :     Most tags can't be nested at all. For instance, the occurrence of
1399 :     a <p> tag should implicitly close the previous <p> tag.
1400 :    
1401 :     <p>Para1<p>Para2
1402 :     should be transformed into:
1403 :     <p>Para1</p><p>Para2
1404 :    
1405 :     Some tags can be nested arbitrarily. For instance, the occurrence
1406 :     of a <blockquote> tag should _not_ implicitly close the previous
1407 :     <blockquote> tag.
1408 :    
1409 :     Alice said: <blockquote>Bob said: <blockquote>Blah
1410 :     should NOT be transformed into:
1411 :     Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1412 :    
1413 :     Some tags can be nested, but the nesting is reset by the
1414 :     interposition of other tags. For instance, a <tr> tag should
1415 :     implicitly close the previous <tr> tag within the same <table>,
1416 :     but not close a <tr> tag in another table.
1417 :    
1418 :     <table><tr>Blah<tr>Blah
1419 :     should be transformed into:
1420 :     <table><tr>Blah</tr><tr>Blah
1421 :     but,
1422 :     <tr>Blah<table><tr>Blah
1423 :     should NOT be transformed into
1424 :     <tr>Blah<table></tr><tr>Blah
1425 :    
1426 :     Differing assumptions about tag nesting rules are a major source
1427 :     of problems with the BeautifulSoup class. If BeautifulSoup is not
1428 :     treating as nestable a tag your page author treats as nestable,
1429 :     try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1430 :     BeautifulStoneSoup before writing your own subclass."""
1431 :    
def __init__(self, *args, **kwargs):
    """Same arguments as BeautifulStoneSoup.__init__, except that
    smartQuotesTo defaults to HTML_ENTITIES when the caller does not
    supply it."""
    # smartQuotesTo is the fifth positional parameter of
    # BeautifulStoneSoup.__init__.  Injecting the keyword when it was
    # already passed positionally would raise a "multiple values"
    # TypeError.
    if 'smartQuotesTo' not in kwargs and len(args) < 5:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
1436 :    
SELF_CLOSING_TAGS = buildTagMap(None,
                                ['br', 'hr', 'input', 'img', 'meta',
                                 'spacer', 'link', 'frame', 'base'])

# Tags whose content is kept as text: unknown_starttag() turns start
# tags found inside them back into character data.
QUOTE_TAGS = {'script' : None, 'textarea' : None}

#According to the HTML standard, each of these inline tags can
#contain another tag of the same type. Furthermore, it's common
#to actually use these tags this way.
NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                        'center']

#According to the HTML standard, these block tags can contain
#another tag of the same type. Furthermore, it's common
#to actually use these tags this way.
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

#Lists can contain other lists, but there are restrictions.
NESTABLE_LIST_TAGS = { 'ol' : [],
                       'ul' : [],
                       'li' : ['ul', 'ol'],
                       'dl' : [],
                       'dd' : ['dl'],
                       'dt' : ['dl'] }

#Tables can contain other tables, but there are restrictions.
NESTABLE_TABLE_TAGS = {'table' : [],
                       'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                       'td' : ['tr'],
                       'th' : ['tr'],
                       'thead' : ['table'],
                       'tbody' : ['table'],
                       'tfoot' : ['table'],
                       }

NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']

#If one of these tags is encountered, all tags up to the next tag of
#this type are popped.
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                 NON_NESTABLE_BLOCK_TAGS,
                                 NESTABLE_LIST_TAGS,
                                 NESTABLE_TABLE_TAGS)

NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                            NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

# Used to detect the charset in a META tag; see start_meta.  Group 1
# is everything up to and including "charset=", group 3 the charset
# itself.  A raw string keeps the backslash in "\s" for the regex engine.
CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)")
1486 :    
def start_meta(self, attrs):
    """Beautiful Soup can detect a charset included in a META tag,
    try to convert the document to that charset, and re-parse the
    document from the beginning.

    ``attrs`` is the list of (name, value) pairs of the META tag.  When
    the charset is rewritten, the 'content' entry of that list is
    replaced in place.  Raises StopParsing after a re-parse with the
    declared charset."""
    httpEquiv = None
    contentType = None
    contentTypeIndex = None
    tagNeedsEncodingSubstitution = False

    for i in range(0, len(attrs)):
        key, value = attrs[i]
        key = key.lower()
        if key == 'http-equiv':
            httpEquiv = value
        elif key == 'content':
            contentType = value
            contentTypeIndex = i

    if httpEquiv and contentType: # It's an interesting meta tag.
        match = self.CHARSET_RE.search(contentType)
        if match:
            if getattr(self, 'declaredHTMLEncoding', None) or \
                   (self.originalEncoding == self.fromEncoding):
                # This is our second pass through the document, or
                # else an encoding was specified explicitly and it
                # worked. Rewrite the meta tag.
                def rewrite(match):
                    return match.group(1) + "%SOUP-ENCODING%"
                # Substitute in the content attribute itself.  The loop
                # variable ``value`` holds whichever attribute came
                # last, which need not be 'content'.
                newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                           newAttr)
                tagNeedsEncodingSubstitution = True
            else:
                # This is our first pass through the document.
                # Go through it again with the new information.
                newCharset = match.group(3)
                if newCharset and newCharset != self.originalEncoding:
                    self.declaredHTMLEncoding = newCharset
                    self._feed(self.declaredHTMLEncoding)
                    raise StopParsing
    tag = self.unknown_starttag("meta", attrs)
    if tag and tagNeedsEncodingSubstitution:
        tag.containsSubstitutions = True
1530 :    
class StopParsing(Exception):
    """Raised to abandon the parse pass in progress.

    BeautifulStoneSoup.__init__ catches it around its call to _feed()."""
1533 :    
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Each name is listed once; repeated entries would collapse into a
    # single key of NESTABLE_TAGS anyway.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    # BeautifulSoup's nestable tags plus the extra ones above, each
    # mapped to an empty list.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1569 :    
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # buildTagMap's first argument is the default value, so the tag
    # name has to come after it.  buildTagMap('noscript') passes no
    # tags at all and builds an empty map.
    RESET_NESTING_TAGS = buildTagMap(None, 'noscript')
    NESTABLE_TAGS = {}
1582 :    
class BeautifulSOAP(BeautifulStoneSoup):
    """Parser that mirrors single-string child tags as parent attributes.

    When a tag whose only child is a single string is popped, that
    string is also stored on the tag's parent as an attribute named
    after the tag:

     <foo><bar>baz</bar></foo>
      =>
     <foo bar="baz"><bar>baz</bar></foo>

    so fooTag['bar'] can be used instead of fooTag.barTag.string.

    This is handy for scraping formats that favour subelements over
    attributes, such as SOAP messages. The input tree is modified, so
    do not print the modified version back out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Copy a lone string child up to the parent, then pop as usual.

        The copy only happens when there is a parent on the stack and
        the parent does not already have an attribute with the tag's
        name."""
        stack = self.tagStack
        if len(stack) > 1:
            child, parent = stack[-1], stack[-2]
            # Make sure parent.attrMap exists before it is consulted.
            parent._getAttrMap()
            holdsOneString = (isinstance(child, Tag)
                              and len(child.contents) == 1
                              and isinstance(child.contents[0],
                                             NavigableString))
            if holdsOneString and child.name not in parent.attrMap:
                parent[child.name] = child.contents[0]
        BeautifulStoneSoup.popTag(self)
1613 :    
1614 :     #Enterprise class names! It has come to our attention that some people
1615 :     #think the names of the Beautiful Soup parser classes are too silly
1616 :     #and "unprofessional" for use in enterprise screen-scraping. We feel
1617 :     #your pain! For such-minded folk, the Beautiful Soup Consortium And
1618 :     #All-Night Kosher Bakery recommends renaming this file to
1619 :     #"RobustParser.py" (or, in cases of extreme enterprisiness,
1620 :     #"RobustParserBeanInterface.class") and using the following
1621 :     #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""


class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""


class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""


class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""


class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1632 :    
1633 :     ######################################################
1634 :     #
1635 :     # Bonus library: Unicode, Dammit
1636 :     #
1637 :     # This class forces XML data into a standard format (usually to UTF-8
1638 :     # or Unicode). It is heavily based on code from Mark Pilgrim's
1639 :     # Universal Feed Parser. It does not rewrite the XML or HTML to
1640 :     # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1641 :     # (XML) and BeautifulSoup.start_meta (HTML).
1642 :    
# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
# chardet is optional: when it is missing the name is bound to None so
# that UnicodeDammit can test for it before attempting auto-detection.
# Only ImportError is caught, so Ctrl-C or a genuinely broken install
# is not silently hidden.
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
# Both are imported purely for their side effect of registering codecs,
# so a missing package is simply ignored.
try:
    import cjkcodecs.aliases
except ImportError:
    pass
try:
    import iconv_codec
except ImportError:
    pass
1663 :    
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    Attributes set by the constructor:
      unicode          -- the decoded document, or None if every
                          attempted encoding failed
      originalEncoding -- name of the codec that decoded the document;
                          None for empty/unicode input or on failure
      triedEncodings   -- codec names that were attempted, in order
      markup           -- the document as currently held (replaced by
                          the decoded text once a codec succeeds)
      smartQuotesTo    -- 'xml', any other true value (HTML entities),
                          or a false value to leave smart quotes alone
    """

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    # Matches any single byte in the range that windows-1252 uses for
    # smart quotes and friends. Compiled once instead of on every
    # conversion attempt.
    _SMART_QUOTE_RE = re.compile("([\x80-\x9f])")

    # Pulls the declared encoding out of a leading XML declaration.
    _XML_ENCODING_RE = re.compile(
        r"""^<\?.*encoding=['"](.*?)['"].*\?>""")

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml'):
        """Detect the encoding of `markup` and decode it.

        markup            -- the document, as a byte string or unicode
        overrideEncodings -- encodings to try before anything detected
                             from the document (only iterated, never
                             mutated, so the shared default is safe)
        smartQuotesTo     -- how to rewrite MS smart quotes; see the
                             class docstring

        Encodings are tried in this order: overrideEncodings, the
        encoding declared in the document, the encoding sniffed from
        its first bytes, chardet's guess (if chardet is installed),
        and finally utf-8 and windows-1252.
        """
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = self._tryEncodings(overrideEncodings)
        if not u:
            u = self._tryEncodings((documentEncoding, sniffedEncoding))

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            u = self._tryEncodings(("utf-8", "windows-1252"))
        self.unicode = u
        if not u: self.originalEncoding = None

    def _tryEncodings(self, encodings):
        """Try each encoding in turn; return the first true result of
        _convertFrom, or the last (false) result if none succeeds.
        Returns None when `encodings` is empty."""
        u = None
        for proposed in encodings:
            u = self._convertFrom(proposed)
            if u:
                break
        return u

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity.

        Returns a numeric character reference when smartQuotesTo is
        'xml' and a named entity otherwise. Entries of MS_CHARS that
        are plain strings are returned unchanged; an unknown character
        yields None."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the `proposed` encoding.

        Returns the decoded text on success, recording it in
        self.markup and the codec in self.originalEncoding. Returns
        None if the encoding is empty, was already tried, or fails to
        decode the document."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in ("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
            markup = self._SMART_QUOTE_RE.sub(
                lambda match: self._subMSChar(match.group(1)), markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
        except Exception:
            # Any decoding failure just means "not this encoding".
            # print "That didn't work!"
            return None
        self.markup = u
        self.originalEncoding = proposed
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases

        A byte order mark at the start of `data` takes precedence over
        `encoding` and is stripped before decoding.'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            # The trailing-zero test keeps a UTF-32LE BOM from being
            # mistaken for a UTF-16LE one.
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data):
        """Given a document, tries to detect its XML encoding.

        Returns a tuple (xml_data, xml_encoding, sniffed_xml_encoding):
          xml_data             -- the document; translated to ASCII if
                                  it looked like EBCDIC, re-encoded as
                                  UTF-8 if a UTF-16/UTF-32 signature or
                                  a UTF-8 BOM was recognised
          xml_encoding         -- encoding named in the XML declaration
                                  (lower-cased), or None
          sniffed_xml_encoding -- encoding implied by the first bytes,
                                  or None
        Any error while sniffing is treated as "no declaration found"."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
            xml_encoding_match = self._XML_ENCODING_RE.match(xml_data)
        except Exception:
            xml_encoding_match = None
        if xml_encoding_match:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            # A declaration that names only a generic multi-byte family
            # is replaced by the concrete encoding that was sniffed.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to a name Python's codec registry accepts.

        Tries, in order: the CHARSET_ALIASES translation (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores. If none of those is a known
        codec, `charset` is returned unchanged."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return `charset` if codecs.lookup accepts it, else None.
        A false `charset` is returned as-is."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Translation table for _ebcdic_to_ascii; built on first use and
    # cached on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the byte string `s` through the EBCDIC-to-ASCII
        table and return the result."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Replacements for bytes 0x80-0x9f. A tuple is (HTML entity name,
    # hex code point); a plain string is substituted literally.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # U+0178; the code point used to be empty, which
                 # produced the malformed reference '&#x;' in XML mode.
                 '\x9f' : ('Yuml', '178'),}
1911 :    
1912 :     #######################################################################
1913 :    
1914 :    
# When run as a script, behave as an HTML pretty-printer: parse
# standard input and print the prettified document.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin.read())
    print(soup.prettify())

svn@nadir-point.com
Subversion  TortoiseSVN  ViewVC