bitbake: bitbake/bs4: Upgrade 4.3.2 -> 4.4.1 (python 3 version)

Upgrade to 4.4.1 which has been run through 2to3 as per the maintainers
recommendation for v3 use.

(Bitbake rev: 2f4b98af93c971a8c466ffaf3c09cca0edb6e3ad)

Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
This commit is contained in:
Richard Purdie, 2016-05-06 09:06:51 +01:00
parent 4f8959324d
commit 822eabf32d
15 changed files with 972 additions and 361 deletions

View File

@ -17,8 +17,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.3.2" __version__ = "4.4.1"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
__license__ = "MIT" __license__ = "MIT"
__all__ = ['BeautifulSoup'] __all__ = ['BeautifulSoup']
@ -45,7 +45,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is # The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it. # running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag): class BeautifulSoup(Tag):
""" """
@ -69,7 +69,7 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then like HTML's <br> tag), call handle_starttag and then
handle_endtag. handle_endtag.
""" """
ROOT_TAG_NAME = u'[document]' ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they # If the end-user gives no indication which tree builder they
# want, look for one with these features. # want, look for one with these features.
@ -77,8 +77,11 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
def __init__(self, markup="", features=None, builder=None, def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs): parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
"""The Soup object is initialized as the 'root tag', and the """The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object) provided markup (which can be a string or a file-like object)
is fed into the underlying parser.""" is fed into the underlying parser."""
@ -114,9 +117,9 @@ class BeautifulSoup(Tag):
del kwargs['isHTML'] del kwargs['isHTML']
warnings.warn( warnings.warn(
"BS4 does not respect the isHTML argument to the " "BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. You can pass in features='html' " "BeautifulSoup constructor. Suggest you use "
"or features='xml' to get a builder capable of handling " "features='lxml' for HTML and features='lxml-xml' for "
"one or the other.") "XML.")
def deprecated_argument(old_name, new_name): def deprecated_argument(old_name, new_name):
if old_name in kwargs: if old_name in kwargs:
@ -135,12 +138,13 @@ class BeautifulSoup(Tag):
"fromEncoding", "from_encoding") "fromEncoding", "from_encoding")
if len(kwargs) > 0: if len(kwargs) > 0:
arg = kwargs.keys().pop() arg = list(kwargs.keys()).pop()
raise TypeError( raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg) "__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None: if builder is None:
if isinstance(features, basestring): original_features = features
if isinstance(features, str):
features = [features] features = [features]
if features is None or len(features) == 0: if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES features = self.DEFAULT_BUILDER_FEATURES
@ -151,6 +155,16 @@ class BeautifulSoup(Tag):
"requested: %s. Do you need to install a parser library?" "requested: %s. Do you need to install a parser library?"
% ",".join(features)) % ",".join(features))
builder = builder_class() builder = builder_class()
if not (original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES):
if builder.is_xml:
markup_type = "XML"
else:
markup_type = "HTML"
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
parser=builder.NAME,
markup_type=markup_type))
self.builder = builder self.builder = builder
self.is_xml = builder.is_xml self.is_xml = builder.is_xml
self.builder.soup = self self.builder.soup = self
@ -164,7 +178,7 @@ class BeautifulSoup(Tag):
# involving passing non-markup to Beautiful Soup. # involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup, # Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants. # just in case that's what the user really wants.
if (isinstance(markup, unicode) if (isinstance(markup, str)
and not os.path.supports_unicode_filenames): and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8") possible_filename = markup.encode("utf8")
else: else:
@ -172,25 +186,30 @@ class BeautifulSoup(Tag):
is_file = False is_file = False
try: try:
is_file = os.path.exists(possible_filename) is_file = os.path.exists(possible_filename)
except Exception, e: except Exception as e:
# This is almost certainly a problem involving # This is almost certainly a problem involving
# characters not valid in filenames on this # characters not valid in filenames on this
# system. Just let it go. # system. Just let it go.
pass pass
if is_file: if is_file:
if isinstance(markup, str):
markup = markup.encode("utf8")
warnings.warn( warnings.warn(
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
if markup[:5] == "http:" or markup[:6] == "https:": if markup[:5] == "http:" or markup[:6] == "https:":
# TODO: This is ugly but I couldn't get it to work in # TODO: This is ugly but I couldn't get it to work in
# Python 3 otherwise. # Python 3 otherwise.
if ((isinstance(markup, bytes) and not b' ' in markup) if ((isinstance(markup, bytes) and not b' ' in markup)
or (isinstance(markup, unicode) and not u' ' in markup)): or (isinstance(markup, str) and not ' ' in markup)):
if isinstance(markup, str):
markup = markup.encode("utf8")
warnings.warn( warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
for (self.markup, self.original_encoding, self.declared_html_encoding, for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in ( self.contains_replacement_characters) in (
self.builder.prepare_markup(markup, from_encoding)): self.builder.prepare_markup(
markup, from_encoding, exclude_encodings=exclude_encodings)):
self.reset() self.reset()
try: try:
self._feed() self._feed()
@ -203,6 +222,16 @@ class BeautifulSoup(Tag):
self.markup = None self.markup = None
self.builder.soup = None self.builder.soup = None
def __copy__(self):
return type(self)(self.encode(), builder=self.builder)
def __getstate__(self):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and not self.builder.picklable:
del d['builder']
return d
def _feed(self): def _feed(self):
# Convert the document to Unicode. # Convert the document to Unicode.
self.builder.reset() self.builder.reset()
@ -229,9 +258,7 @@ class BeautifulSoup(Tag):
def new_string(self, s, subclass=NavigableString): def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup.""" """Create a new NavigableString associated with this soup."""
navigable = subclass(s) return subclass(s)
navigable.setup()
return navigable
def insert_before(self, successor): def insert_before(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_before().") raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
@ -259,7 +286,7 @@ class BeautifulSoup(Tag):
def endData(self, containerClass=NavigableString): def endData(self, containerClass=NavigableString):
if self.current_data: if self.current_data:
current_data = u''.join(self.current_data) current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains # If whitespace is not preserved, and this string contains
# nothing but ASCII spaces, replace it with a single space # nothing but ASCII spaces, replace it with a single space
# or newline. # or newline.
@ -290,14 +317,49 @@ class BeautifulSoup(Tag):
def object_was_parsed(self, o, parent=None, most_recent_element=None): def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree.""" """Add an object to the parse tree."""
parent = parent or self.currentTag parent = parent or self.currentTag
most_recent_element = most_recent_element or self._most_recent_element previous_element = most_recent_element or self._most_recent_element
o.setup(parent, most_recent_element)
next_element = previous_sibling = next_sibling = None
if isinstance(o, Tag):
next_element = o.next_element
next_sibling = o.next_sibling
previous_sibling = o.previous_sibling
if not previous_element:
previous_element = o.previous_element
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
if most_recent_element is not None:
most_recent_element.next_element = o
self._most_recent_element = o self._most_recent_element = o
parent.contents.append(o) parent.contents.append(o)
if parent.next_sibling:
# This node is being inserted into an element that has
# already been parsed. Deal with any dangling references.
index = parent.contents.index(o)
if index == 0:
previous_element = parent
previous_sibling = None
else:
previous_element = previous_sibling = parent.contents[index-1]
if index == len(parent.contents)-1:
next_element = parent.next_sibling
next_sibling = None
else:
next_element = next_sibling = parent.contents[index+1]
o.previous_element = previous_element
if previous_element:
previous_element.next_element = o
o.next_element = next_element
if next_element:
next_element.previous_element = o
o.next_sibling = next_sibling
if next_sibling:
next_sibling.previous_sibling = o
o.previous_sibling = previous_sibling
if previous_sibling:
previous_sibling.next_sibling = o
def _popToTag(self, name, nsprefix=None, inclusivePop=True): def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent """Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag instance of the given tag. If inclusivePop is false, pops the tag
@ -367,9 +429,9 @@ class BeautifulSoup(Tag):
encoding_part = '' encoding_part = ''
if eventual_encoding != None: if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part prefix = '<?xml version="1.0"%s?>\n' % encoding_part
else: else:
prefix = u'' prefix = ''
if not pretty_print: if not pretty_print:
indent_level = None indent_level = None
else: else:
@ -403,4 +465,4 @@ class FeatureNotFound(ValueError):
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
soup = BeautifulSoup(sys.stdin) soup = BeautifulSoup(sys.stdin)
print soup.prettify() print(soup.prettify())

View File

@ -80,9 +80,12 @@ builder_registry = TreeBuilderRegistry()
class TreeBuilder(object): class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree.""" """Turn a document into a Beautiful Soup object tree."""
NAME = "[Unknown tree builder]"
ALTERNATE_NAMES = []
features = [] features = []
is_xml = False is_xml = False
picklable = False
preserve_whitespace_tags = set() preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents. # tag when and only when it has no contents.
@ -153,13 +156,13 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', []) universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get( tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None) tag_name.lower(), None)
for attr in attrs.keys(): for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific): if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string # We have a "class"-type attribute whose string
# value is a whitespace-separated list of # value is a whitespace-separated list of
# values. Split it into a list. # values. Split it into a list.
value = attrs[attr] value = attrs[attr]
if isinstance(value, basestring): if isinstance(value, str):
values = whitespace_re.split(value) values = whitespace_re.split(value)
else: else:
# html5lib sometimes calls setAttributes twice # html5lib sometimes calls setAttributes twice

View File

@ -2,6 +2,7 @@ __all__ = [
'HTML5TreeBuilder', 'HTML5TreeBuilder',
] ]
from pdb import set_trace
import warnings import warnings
from bs4.builder import ( from bs4.builder import (
PERMISSIVE, PERMISSIVE,
@ -9,7 +10,10 @@ from bs4.builder import (
HTML_5, HTML_5,
HTMLTreeBuilder, HTMLTreeBuilder,
) )
from bs4.element import NamespacedAttribute from bs4.element import (
NamespacedAttribute,
whitespace_re,
)
import html5lib import html5lib
from html5lib.constants import namespaces from html5lib.constants import namespaces
from bs4.element import ( from bs4.element import (
@ -22,11 +26,20 @@ from bs4.element import (
class HTML5TreeBuilder(HTMLTreeBuilder): class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree.""" """Use html5lib to build a tree."""
features = ['html5lib', PERMISSIVE, HTML_5, HTML] NAME = "html5lib"
def prepare_markup(self, markup, user_specified_encoding): features = [NAME, PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding,
document_declared_encoding=None, exclude_encodings=None):
# Store the user-specified encoding for use later on. # Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding self.user_specified_encoding = user_specified_encoding
# document_declared_encoding and exclude_encodings aren't used
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
if exclude_encodings:
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
yield (markup, None, None, False) yield (markup, None, None, False)
# These methods are defined by Beautiful Soup. # These methods are defined by Beautiful Soup.
@ -37,7 +50,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, encoding=self.user_specified_encoding) doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer. # Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode): if isinstance(markup, str):
# We need to special-case this because html5lib sets # We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input. # charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None doc.original_encoding = None
@ -51,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
@ -101,7 +114,16 @@ class AttrList(object):
def __iter__(self): def __iter__(self):
return list(self.attrs.items()).__iter__() return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value): def __setitem__(self, name, value):
"set attr", name, value # If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
list_attr = HTML5TreeBuilder.cdata_list_attributes
if (name in list_attr['*']
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
value = whitespace_re.split(value)
self.element[name] = value self.element[name] = value
def items(self): def items(self):
return list(self.attrs.items()) return list(self.attrs.items())
@ -124,7 +146,7 @@ class Element(html5lib.treebuilders._base.Node):
def appendChild(self, node): def appendChild(self, node):
string_child = child = None string_child = child = None
if isinstance(node, basestring): if isinstance(node, str):
# Some other piece of code decided to pass in a string # Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the # instead of creating a TextElement object to contain the
# string. # string.
@ -139,7 +161,7 @@ class Element(html5lib.treebuilders._base.Node):
else: else:
child = node.element child = node.element
if not isinstance(child, basestring) and child.parent is not None: if not isinstance(child, str) and child.parent is not None:
node.element.extract() node.element.extract()
if (string_child and self.element.contents if (string_child and self.element.contents
@ -152,7 +174,7 @@ class Element(html5lib.treebuilders._base.Node):
old_element.replace_with(new_element) old_element.replace_with(new_element)
self.soup._most_recent_element = new_element self.soup._most_recent_element = new_element
else: else:
if isinstance(node, basestring): if isinstance(node, str):
# Create a brand new NavigableString from this string. # Create a brand new NavigableString from this string.
child = self.soup.new_string(node) child = self.soup.new_string(node)
@ -161,6 +183,12 @@ class Element(html5lib.treebuilders._base.Node):
# immediately after the parent, if it has no children.) # immediately after the parent, if it has no children.)
if self.element.contents: if self.element.contents:
most_recent_element = self.element._last_descendant(False) most_recent_element = self.element._last_descendant(False)
elif self.element.next_element is not None:
# Something from further ahead in the parse tree is
# being inserted into this earlier element. This is
# very annoying because it means an expensive search
# for the last element in the tree.
most_recent_element = self.soup._last_descendant()
else: else:
most_recent_element = self.element most_recent_element = self.element
@ -172,6 +200,7 @@ class Element(html5lib.treebuilders._base.Node):
return AttrList(self.element) return AttrList(self.element)
def setAttributes(self, attributes): def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0: if attributes is not None and len(attributes) > 0:
converted_attributes = [] converted_attributes = []
@ -183,7 +212,7 @@ class Element(html5lib.treebuilders._base.Node):
self.soup.builder._replace_cdata_list_attribute_values( self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes) self.name, attributes)
for name, value in attributes.items(): for name, value in list(attributes.items()):
self.element[name] = value self.element[name] = value
# The attributes may contain variables that need substitution. # The attributes may contain variables that need substitution.
@ -218,6 +247,9 @@ class Element(html5lib.treebuilders._base.Node):
def reparentChildren(self, new_parent): def reparentChildren(self, new_parent):
"""Move all of this tag's children into another tag.""" """Move all of this tag's children into another tag."""
# print "MOVE", self.element.contents
# print "FROM", self.element
# print "TO", new_parent.element
element = self.element element = self.element
new_parent_element = new_parent.element new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children # Determine what this tag's next_element will be once all the children
@ -236,17 +268,28 @@ class Element(html5lib.treebuilders._base.Node):
new_parents_last_descendant_next_element = new_parent_element.next_element new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents to_append = element.contents
append_after = new_parent.element.contents append_after = new_parent_element.contents
if len(to_append) > 0: if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling # Set the first child's previous_element and previous_sibling
# to elements within the new parent # to elements within the new parent
first_child = to_append[0] first_child = to_append[0]
first_child.previous_element = new_parents_last_descendant if new_parents_last_descendant:
first_child.previous_element = new_parents_last_descendant
else:
first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child first_child.previous_sibling = new_parents_last_child
if new_parents_last_descendant:
new_parents_last_descendant.next_element = first_child
else:
new_parent_element.next_element = first_child
if new_parents_last_child:
new_parents_last_child.next_sibling = first_child
# Fix the last child's next_element and next_sibling # Fix the last child's next_element and next_sibling
last_child = to_append[-1] last_child = to_append[-1]
last_child.next_element = new_parents_last_descendant_next_element last_child.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element:
new_parents_last_descendant_next_element.previous_element = last_child
last_child.next_sibling = None last_child.next_sibling = None
for child in to_append: for child in to_append:
@ -257,6 +300,10 @@ class Element(html5lib.treebuilders._base.Node):
element.contents = [] element.contents = []
element.next_element = final_next_element element.next_element = final_next_element
# print "DONE WITH MOVE"
# print "FROM", self.element
# print "TO", new_parent_element
def cloneNode(self): def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace) tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace) node = Element(tag, self.soup, self.namespace)

View File

@ -4,10 +4,16 @@ __all__ = [
'HTMLParserTreeBuilder', 'HTMLParserTreeBuilder',
] ]
from HTMLParser import ( from html.parser import HTMLParser
HTMLParser,
HTMLParseError, try:
) from html.parser import HTMLParseError
except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
pass
import sys import sys
import warnings import warnings
@ -19,10 +25,10 @@ import warnings
# At the end of this file, we monkeypatch HTMLParser so that # At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2. # strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3] major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = ( CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
major > 3 CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
or (major == 3 and minor > 2) CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
or (major == 3 and minor == 2 and release >= 3))
from bs4.element import ( from bs4.element import (
CData, CData,
@ -63,7 +69,8 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_charref(self, name): def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once # XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed. # it's fixed in all supported versions.
# http://bugs.python.org/issue13633
if name.startswith('x'): if name.startswith('x'):
real_name = int(name.lstrip('x'), 16) real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'): elif name.startswith('X'):
@ -72,9 +79,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
real_name = int(name) real_name = int(name)
try: try:
data = unichr(real_name) data = chr(real_name)
except (ValueError, OverflowError), e: except (ValueError, OverflowError) as e:
data = u"\N{REPLACEMENT CHARACTER}" data = "\N{REPLACEMENT CHARACTER}"
self.handle_data(data) self.handle_data(data)
@ -113,14 +120,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_pi(self, data): def handle_pi(self, data):
self.soup.endData() self.soup.endData()
if data.endswith("?") and data.lower().startswith("xml"):
# "An XHTML processing instruction using the trailing '?'
# will cause the '?' to be included in data." - HTMLParser
# docs.
#
# Strip the question mark so we don't end up with two
# question marks.
data = data[:-1]
self.soup.handle_data(data) self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction) self.soup.endData(ProcessingInstruction)
@ -128,26 +127,31 @@ class BeautifulSoupHTMLParser(HTMLParser):
class HTMLParserTreeBuilder(HTMLTreeBuilder): class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False is_xml = False
features = [HTML, STRICT, HTMLPARSER] picklable = True
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT: if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
kwargs['convert_charrefs'] = False
self.parser_args = (args, kwargs) self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None, def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None): document_declared_encoding=None, exclude_encodings=None):
""" """
:return: A 4-tuple (markup, original encoding, encoding :return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER). replaced with REPLACEMENT CHARACTER).
""" """
if isinstance(markup, unicode): if isinstance(markup, str):
yield (markup, None, None, False) yield (markup, None, None, False)
return return
try_encodings = [user_specified_encoding, document_declared_encoding] try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True) dammit = UnicodeDammit(markup, try_encodings, is_html=True,
exclude_encodings=exclude_encodings)
yield (dammit.markup, dammit.original_encoding, yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding, dammit.declared_html_encoding,
dammit.contains_replacement_characters) dammit.contains_replacement_characters)
@ -158,7 +162,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup parser.soup = self.soup
try: try:
parser.feed(markup) parser.feed(markup)
except HTMLParseError, e: except HTMLParseError as e:
warnings.warn(RuntimeWarning( warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e raise e

View File

@ -4,10 +4,15 @@ __all__ = [
] ]
from io import BytesIO from io import BytesIO
from StringIO import StringIO from io import StringIO
import collections import collections
from lxml import etree from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute from bs4.element import (
Comment,
Doctype,
NamespacedAttribute,
ProcessingInstruction,
)
from bs4.builder import ( from bs4.builder import (
FAST, FAST,
HTML, HTML,
@ -25,8 +30,11 @@ class LXMLTreeBuilderForXML(TreeBuilder):
is_xml = True is_xml = True
NAME = "lxml-xml"
ALTERNATE_NAMES = ["xml"]
# Well, it's permissive by XML parser standards. # Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE] features = [NAME, LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512 CHUNK_SIZE = 512
@ -70,6 +78,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
return (None, tag) return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None, def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None): document_declared_encoding=None):
""" """
:yield: A series of 4-tuples. :yield: A series of 4-tuples.
@ -78,12 +87,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
Each 4-tuple represents a strategy for parsing the document. Each 4-tuple represents a strategy for parsing the document.
""" """
if isinstance(markup, unicode): if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on # We were given Unicode. Maybe lxml can parse Unicode on
# this system? # this system?
yield markup, None, document_declared_encoding, False yield markup, None, document_declared_encoding, False
if isinstance(markup, unicode): if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and # No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8. # tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8", yield (markup.encode("utf8"), "utf8",
@ -95,14 +104,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# the document as each one in turn. # the document as each one in turn.
is_html = not self.is_xml is_html = not self.is_xml
try_encodings = [user_specified_encoding, document_declared_encoding] try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector(markup, try_encodings, is_html) detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
for encoding in detector.encodings: for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False) yield (detector.markup, encoding, document_declared_encoding, False)
def feed(self, markup): def feed(self, markup):
if isinstance(markup, bytes): if isinstance(markup, bytes):
markup = BytesIO(markup) markup = BytesIO(markup)
elif isinstance(markup, unicode): elif isinstance(markup, str):
markup = StringIO(markup) markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty, # Call feed() at least once, even if the markup is empty,
@ -117,7 +127,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0: if len(data) != 0:
self.parser.feed(data) self.parser.feed(data)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(str(e))
def close(self): def close(self):
@ -135,12 +145,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.append(None) self.nsmaps.append(None)
elif len(nsmap) > 0: elif len(nsmap) > 0:
# A new namespace mapping has come into play. # A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items()) inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
self.nsmaps.append(inverted_nsmap) self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the # Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later. # tag, so we can recreate it later.
attrs = attrs.copy() attrs = attrs.copy()
for prefix, namespace in nsmap.items(): for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute( attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/") "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace attrs[attribute] = namespace
@ -149,7 +159,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and # from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects. # turn then into NamespacedAttribute objects.
new_attrs = {} new_attrs = {}
for attr, value in attrs.items(): for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr) namespace, attr = self._getNsTag(attr)
if namespace is None: if namespace is None:
new_attrs[attr] = value new_attrs[attr] = value
@ -189,7 +199,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.pop() self.nsmaps.pop()
def pi(self, target, data): def pi(self, target, data):
pass self.soup.endData()
self.soup.handle_data(target + ' ' + data)
self.soup.endData(ProcessingInstruction)
def data(self, content): def data(self, content):
self.soup.handle_data(content) self.soup.handle_data(content)
@ -207,12 +219,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE] NAME = LXML
ALTERNATE_NAMES = ["lxml-html"]
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
is_xml = False is_xml = False
def default_parser(self, encoding): def default_parser(self, encoding):
@ -224,10 +239,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding) self.parser = self.parser_for(encoding)
self.parser.feed(markup) self.parser.feed(markup)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(str(e))
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment return '<html><body>%s</body></html>' % fragment

View File

@ -3,12 +3,14 @@
This library converts a bytestream to Unicode through any means This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and XML, but it does not rewrite the Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job. XML or HTML to reflect a new encoding; that's the tree builder's job.
""" """
__license__ = "MIT"
from pdb import set_trace
import codecs import codecs
from htmlentitydefs import codepoint2name from html.entities import codepoint2name
import re import re
import logging import logging
import string import string
@ -56,7 +58,7 @@ class EntitySubstitution(object):
reverse_lookup = {} reverse_lookup = {}
characters_for_re = [] characters_for_re = []
for codepoint, name in list(codepoint2name.items()): for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint) character = chr(codepoint)
if codepoint != 34: if codepoint != 34:
# There's no point in turning the quotation mark into # There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which # &quot;, unless it happens within an attribute value, which
@ -212,8 +214,11 @@ class EncodingDetector:
5. Windows-1252. 5. Windows-1252.
""" """
def __init__(self, markup, override_encodings=None, is_html=False): def __init__(self, markup, override_encodings=None, is_html=False,
exclude_encodings=None):
self.override_encodings = override_encodings or [] self.override_encodings = override_encodings or []
exclude_encodings = exclude_encodings or []
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None self.chardet_encoding = None
self.is_html = is_html self.is_html = is_html
self.declared_encoding = None self.declared_encoding = None
@ -224,6 +229,8 @@ class EncodingDetector:
def _usable(self, encoding, tried): def _usable(self, encoding, tried):
if encoding is not None: if encoding is not None:
encoding = encoding.lower() encoding = encoding.lower()
if encoding in self.exclude_encodings:
return False
if encoding not in tried: if encoding not in tried:
tried.add(encoding) tried.add(encoding)
return True return True
@ -266,6 +273,9 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data): def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies.""" """If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None encoding = None
if isinstance(data, str):
# Unicode data cannot have a byte-order mark.
return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
and (data[2:4] != '\x00\x00'): and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be' encoding = 'utf-16be'
@ -306,7 +316,7 @@ class EncodingDetector:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None: if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode( declared_encoding = declared_encoding_match.groups()[0].decode(
'ascii') 'ascii', 'replace')
if declared_encoding: if declared_encoding:
return declared_encoding.lower() return declared_encoding.lower()
return None return None
@ -331,18 +341,19 @@ class UnicodeDammit:
] ]
def __init__(self, markup, override_encodings=[], def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False): smart_quotes_to=None, is_html=False, exclude_encodings=[]):
self.smart_quotes_to = smart_quotes_to self.smart_quotes_to = smart_quotes_to
self.tried_encodings = [] self.tried_encodings = []
self.contains_replacement_characters = False self.contains_replacement_characters = False
self.is_html = is_html self.is_html = is_html
self.detector = EncodingDetector(markup, override_encodings, is_html) self.detector = EncodingDetector(
markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with. # Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '': if isinstance(markup, str) or markup == '':
self.markup = markup self.markup = markup
self.unicode_markup = unicode(markup) self.unicode_markup = str(markup)
self.original_encoding = None self.original_encoding = None
return return
@ -425,7 +436,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"): def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode. '''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases''' %encoding is a string recognized by encodings.aliases'''
return unicode(data, encoding, errors) return str(data, encoding, errors)
@property @property
def declared_html_encoding(self): def declared_html_encoding(self):

View File

@ -1,7 +1,10 @@
"""Diagnostic functions, mainly for use when doing tech support.""" """Diagnostic functions, mainly for use when doing tech support."""
__license__ = "MIT"
import cProfile import cProfile
from StringIO import StringIO from io import StringIO
from HTMLParser import HTMLParser from html.parser import HTMLParser
import bs4 import bs4
from bs4 import BeautifulSoup, __version__ from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry from bs4.builder import builder_registry
@ -17,8 +20,8 @@ import cProfile
def diagnose(data): def diagnose(data):
"""Diagnostic suite for isolating common problems.""" """Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__ print("Diagnostic running on Beautiful Soup %s" % __version__)
print "Python version %s" % sys.version print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"] basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers: for name in basic_parsers:
@ -27,44 +30,53 @@ def diagnose(data):
break break
else: else:
basic_parsers.remove(name) basic_parsers.remove(name)
print ( print((
"I noticed that %s is not installed. Installing it may help." % "I noticed that %s is not installed. Installing it may help." %
name) name))
if 'lxml' in basic_parsers: if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"]) basic_parsers.append(["lxml", "xml"])
from lxml import etree try:
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) from lxml import etree
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
except ImportError as e:
print (
"lxml is not installed or couldn't be imported.")
if 'html5lib' in basic_parsers: if 'html5lib' in basic_parsers:
import html5lib try:
print "Found html5lib version %s" % html5lib.__version__ import html5lib
print("Found html5lib version %s" % html5lib.__version__)
except ImportError as e:
print (
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'): if hasattr(data, 'read'):
data = data.read() data = data.read()
elif os.path.exists(data): elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data print('"%s" looks like a filename. Reading data from the file.' % data)
data = open(data).read() data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"): elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return return
print print()
for parser in basic_parsers: for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser print("Trying to parse your markup with %s" % parser)
success = False success = False
try: try:
soup = BeautifulSoup(data, parser) soup = BeautifulSoup(data, parser)
success = True success = True
except Exception, e: except Exception as e:
print "%s could not parse the markup." % parser print("%s could not parse the markup." % parser)
traceback.print_exc() traceback.print_exc()
if success: if success:
print "Here's what %s did with the markup:" % parser print("Here's what %s did with the markup:" % parser)
print soup.prettify() print(soup.prettify())
print "-" * 80 print("-" * 80)
def lxml_trace(data, html=True, **kwargs): def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing. """Print out the lxml events that occur during parsing.
@ -74,7 +86,7 @@ def lxml_trace(data, html=True, **kwargs):
""" """
from lxml import etree from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print("%s, %4s, %s" % (event, element.tag, element.text)) print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser): class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else.""" """Announces HTMLParser parse events, without doing anything else."""
@ -156,9 +168,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000): def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark.""" """Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__ print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements) data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data) print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False success = False
@ -167,24 +179,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser) soup = BeautifulSoup(data, parser)
b = time.time() b = time.time()
success = True success = True
except Exception, e: except Exception as e:
print "%s could not parse the markup." % parser print("%s could not parse the markup." % parser)
traceback.print_exc() traceback.print_exc()
if success: if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree from lxml import etree
a = time.time() a = time.time()
etree.HTML(data) etree.HTML(data)
b = time.time() b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a) print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib import html5lib
parser = html5lib.HTMLParser() parser = html5lib.HTMLParser()
a = time.time() a = time.time()
parser.parse(data) parser.parse(data)
b = time.time() b = time.time()
print "Raw html5lib parsed the markup in %.2fs." % (b-a) print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"): def profile(num_elements=100000, parser="lxml"):

View File

@ -1,3 +1,6 @@
__license__ = "MIT"
from pdb import set_trace
import collections import collections
import re import re
import sys import sys
@ -21,22 +24,22 @@ def _alias(attr):
return alias return alias
class NamespacedAttribute(unicode): class NamespacedAttribute(str):
def __new__(cls, prefix, name, namespace=None): def __new__(cls, prefix, name, namespace=None):
if name is None: if name is None:
obj = unicode.__new__(cls, prefix) obj = str.__new__(cls, prefix)
elif prefix is None: elif prefix is None:
# Not really namespaced. # Not really namespaced.
obj = unicode.__new__(cls, name) obj = str.__new__(cls, name)
else: else:
obj = unicode.__new__(cls, prefix + ":" + name) obj = str.__new__(cls, prefix + ":" + name)
obj.prefix = prefix obj.prefix = prefix
obj.name = name obj.name = name
obj.namespace = namespace obj.namespace = namespace
return obj return obj
class AttributeValueWithCharsetSubstitution(unicode): class AttributeValueWithCharsetSubstitution(str):
"""A stand-in object for a character encoding specified in HTML.""" """A stand-in object for a character encoding specified in HTML."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@ -47,7 +50,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
""" """
def __new__(cls, original_value): def __new__(cls, original_value):
obj = unicode.__new__(cls, original_value) obj = str.__new__(cls, original_value)
obj.original_value = original_value obj.original_value = original_value
return obj return obj
@ -70,9 +73,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
match = cls.CHARSET_RE.search(original_value) match = cls.CHARSET_RE.search(original_value)
if match is None: if match is None:
# No substitution necessary. # No substitution necessary.
return unicode.__new__(unicode, original_value) return str.__new__(str, original_value)
obj = unicode.__new__(cls, original_value) obj = str.__new__(cls, original_value)
obj.original_value = original_value obj.original_value = original_value
return obj return obj
@ -152,7 +155,7 @@ class PageElement(object):
def format_string(self, s, formatter='minimal'): def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter.""" """Format the given string using the given formatter."""
if not callable(formatter): if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter) formatter = self._formatter_for_name(formatter)
if formatter is None: if formatter is None:
output = s output = s
@ -185,24 +188,40 @@ class PageElement(object):
return self.HTML_FORMATTERS.get( return self.HTML_FORMATTERS.get(
name, HTMLAwareEntitySubstitution.substitute_xml) name, HTMLAwareEntitySubstitution.substitute_xml)
def setup(self, parent=None, previous_element=None): def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and """Sets up the initial relations between this element and
other elements.""" other elements."""
self.parent = parent self.parent = parent
self.previous_element = previous_element self.previous_element = previous_element
if previous_element is not None: if previous_element is not None:
self.previous_element.next_element = self self.previous_element.next_element = self
self.next_element = None
self.previous_sibling = None self.next_element = next_element
self.next_sibling = None if self.next_element:
if self.parent is not None and self.parent.contents: self.next_element.previous_element = self
self.previous_sibling = self.parent.contents[-1]
self.next_sibling = next_sibling
if self.next_sibling:
self.next_sibling.previous_sibling = self
if (not previous_sibling
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling
if previous_sibling:
self.previous_sibling.next_sibling = self self.previous_sibling.next_sibling = self
nextSibling = _alias("next_sibling") # BS3 nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with): def replace_with(self, replace_with):
if not self.parent:
raise ValueError(
"Cannot replace one element with another when the"
"element to be replaced is not part of a tree.")
if replace_with is self: if replace_with is self:
return return
if replace_with is self.parent: if replace_with is self.parent:
@ -216,6 +235,10 @@ class PageElement(object):
def unwrap(self): def unwrap(self):
my_parent = self.parent my_parent = self.parent
if not self.parent:
raise ValueError(
"Cannot replace an element with its contents when that"
"element is not part of a tree.")
my_index = self.parent.index(self) my_index = self.parent.index(self)
self.extract() self.extract()
for child in reversed(self.contents[:]): for child in reversed(self.contents[:]):
@ -240,17 +263,20 @@ class PageElement(object):
last_child = self._last_descendant() last_child = self._last_descendant()
next_element = last_child.next_element next_element = last_child.next_element
if self.previous_element is not None: if (self.previous_element is not None and
self.previous_element is not next_element):
self.previous_element.next_element = next_element self.previous_element.next_element = next_element
if next_element is not None: if next_element is not None and next_element is not self.previous_element:
next_element.previous_element = self.previous_element next_element.previous_element = self.previous_element
self.previous_element = None self.previous_element = None
last_child.next_element = None last_child.next_element = None
self.parent = None self.parent = None
if self.previous_sibling is not None: if (self.previous_sibling is not None
and self.previous_sibling is not self.next_sibling):
self.previous_sibling.next_sibling = self.next_sibling self.previous_sibling.next_sibling = self.next_sibling
if self.next_sibling is not None: if (self.next_sibling is not None
and self.next_sibling is not self.previous_sibling):
self.next_sibling.previous_sibling = self.previous_sibling self.next_sibling.previous_sibling = self.previous_sibling
self.previous_sibling = self.next_sibling = None self.previous_sibling = self.next_sibling = None
return self return self
@ -263,16 +289,18 @@ class PageElement(object):
last_child = self last_child = self
while isinstance(last_child, Tag) and last_child.contents: while isinstance(last_child, Tag) and last_child.contents:
last_child = last_child.contents[-1] last_child = last_child.contents[-1]
if not accept_self and last_child == self: if not accept_self and last_child is self:
last_child = None last_child = None
return last_child return last_child
# BS3: Not part of the API! # BS3: Not part of the API!
_lastRecursiveChild = _last_descendant _lastRecursiveChild = _last_descendant
def insert(self, position, new_child): def insert(self, position, new_child):
if new_child is None:
raise ValueError("Cannot insert None into a tag.")
if new_child is self: if new_child is self:
raise ValueError("Cannot insert a tag into itself.") raise ValueError("Cannot insert a tag into itself.")
if (isinstance(new_child, basestring) if (isinstance(new_child, str)
and not isinstance(new_child, NavigableString)): and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child) new_child = NavigableString(new_child)
@ -478,6 +506,10 @@ class PageElement(object):
def _find_all(self, name, attrs, text, limit, generator, **kwargs): def _find_all(self, name, attrs, text, limit, generator, **kwargs):
"Iterates over a generator looking for things that match." "Iterates over a generator looking for things that match."
if text is None and 'string' in kwargs:
text = kwargs['string']
del kwargs['string']
if isinstance(name, SoupStrainer): if isinstance(name, SoupStrainer):
strainer = name strainer = name
else: else:
@ -489,7 +521,7 @@ class PageElement(object):
result = (element for element in generator result = (element for element in generator
if isinstance(element, Tag)) if isinstance(element, Tag))
return ResultSet(strainer, result) return ResultSet(strainer, result)
elif isinstance(name, basestring): elif isinstance(name, str):
# Optimization to find all tags with a given name. # Optimization to find all tags with a given name.
result = (element for element in generator result = (element for element in generator
if isinstance(element, Tag) if isinstance(element, Tag)
@ -548,17 +580,17 @@ class PageElement(object):
# Methods for supporting CSS selectors. # Methods for supporting CSS selectors.
tag_name_re = re.compile('^[a-z0-9]+$') tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
# \---/ \---/\-------------/ \-------/ # \---------------------------/ \---/\-------------/ \-------/
# | | | | # | | | |
# | | | The value # | | | The value
# | | ~,|,^,$,* or = # | | ~,|,^,$,* or =
# | Attribute # | Attribute
# Tag # Tag
attribselect_re = re.compile( attribselect_re = re.compile(
r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' + r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
r'=?"?(?P<value>[^\]"]*)"?\]$' r'=?"?(?P<value>[^\]"]*)"?\]$'
) )
@ -640,7 +672,7 @@ class PageElement(object):
return self.parents return self.parents
class NavigableString(unicode, PageElement): class NavigableString(str, PageElement):
PREFIX = '' PREFIX = ''
SUFFIX = '' SUFFIX = ''
@ -653,15 +685,21 @@ class NavigableString(unicode, PageElement):
passed in to the superclass's __new__ or the superclass won't know passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters. how to handle non-ASCII characters.
""" """
if isinstance(value, unicode): if isinstance(value, str):
return unicode.__new__(cls, value) u = str.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) else:
u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
u.setup()
return u
def __copy__(self): def __copy__(self):
return self """A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree.
"""
return type(self)(self)
def __getnewargs__(self): def __getnewargs__(self):
return (unicode(self),) return (str(self),)
def __getattr__(self, attr): def __getattr__(self, attr):
"""text.string gives you text. This is for backwards """text.string gives you text. This is for backwards
@ -701,23 +739,23 @@ class PreformattedString(NavigableString):
class CData(PreformattedString): class CData(PreformattedString):
PREFIX = u'<![CDATA[' PREFIX = '<![CDATA['
SUFFIX = u']]>' SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString): class ProcessingInstruction(PreformattedString):
PREFIX = u'<?' PREFIX = '<?'
SUFFIX = u'?>' SUFFIX = '>'
class Comment(PreformattedString): class Comment(PreformattedString):
PREFIX = u'<!--' PREFIX = '<!--'
SUFFIX = u'-->' SUFFIX = '-->'
class Declaration(PreformattedString): class Declaration(PreformattedString):
PREFIX = u'<!' PREFIX = '<?'
SUFFIX = u'!>' SUFFIX = '?>'
class Doctype(PreformattedString): class Doctype(PreformattedString):
@ -734,8 +772,8 @@ class Doctype(PreformattedString):
return Doctype(value) return Doctype(value)
PREFIX = u'<!DOCTYPE ' PREFIX = '<!DOCTYPE '
SUFFIX = u'>\n' SUFFIX = '>\n'
class Tag(PageElement): class Tag(PageElement):
@ -759,9 +797,12 @@ class Tag(PageElement):
self.prefix = prefix self.prefix = prefix
if attrs is None: if attrs is None:
attrs = {} attrs = {}
elif attrs and builder.cdata_list_attributes: elif attrs:
attrs = builder._replace_cdata_list_attribute_values( if builder is not None and builder.cdata_list_attributes:
self.name, attrs) attrs = builder._replace_cdata_list_attribute_values(
self.name, attrs)
else:
attrs = dict(attrs)
else: else:
attrs = dict(attrs) attrs = dict(attrs)
self.attrs = attrs self.attrs = attrs
@ -778,6 +819,18 @@ class Tag(PageElement):
parserClass = _alias("parser_class") # BS3 parserClass = _alias("parser_class") # BS3
def __copy__(self):
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
"""
clone = type(self)(None, self.builder, self.name, self.namespace,
self.nsprefix, self.attrs)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
for child in self.contents:
clone.append(child.__copy__())
return clone
@property @property
def is_empty_element(self): def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag) """Is this tag an empty-element tag? (aka a self-closing tag)
@ -843,7 +896,7 @@ class Tag(PageElement):
for string in self._all_strings(True): for string in self._all_strings(True):
yield string yield string
def get_text(self, separator=u"", strip=False, def get_text(self, separator="", strip=False,
types=(NavigableString, CData)): types=(NavigableString, CData)):
""" """
Get all child strings, concatenated using the given separator. Get all child strings, concatenated using the given separator.
@ -915,7 +968,7 @@ class Tag(PageElement):
def __contains__(self, x): def __contains__(self, x):
return x in self.contents return x in self.contents
def __nonzero__(self): def __bool__(self):
"A tag is non-None even if it has no contents." "A tag is non-None even if it has no contents."
return True return True
@ -971,15 +1024,25 @@ class Tag(PageElement):
as defined in __eq__.""" as defined in __eq__."""
return not self == other return not self == other
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): def __repr__(self, encoding="unicode-escape"):
"""Renders this tag as a string.""" """Renders this tag as a string."""
return self.encode(encoding) if PY3K:
# "The return value must be a string object", i.e. Unicode
return self.decode()
else:
# "The return value must be a string object", i.e. a bytestring.
# By convention, the return value of __repr__ should also be
# an ASCII string.
return self.encode(encoding)
def __unicode__(self): def __unicode__(self):
return self.decode() return self.decode()
def __str__(self): def __str__(self):
return self.encode() if PY3K:
return self.decode()
else:
return self.encode()
if PY3K: if PY3K:
__str__ = __repr__ = __unicode__ __str__ = __repr__ = __unicode__
@ -1014,7 +1077,7 @@ class Tag(PageElement):
# First off, turn a string formatter into a function. This # First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again. # will stop the lookup from happening over and over again.
if not callable(formatter): if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter) formatter = self._formatter_for_name(formatter)
attrs = [] attrs = []
@ -1025,8 +1088,8 @@ class Tag(PageElement):
else: else:
if isinstance(val, list) or isinstance(val, tuple): if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val) val = ' '.join(val)
elif not isinstance(val, basestring): elif not isinstance(val, str):
val = unicode(val) val = str(val)
elif ( elif (
isinstance(val, AttributeValueWithCharsetSubstitution) isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None): and eventual_encoding is not None):
@ -1034,7 +1097,7 @@ class Tag(PageElement):
text = self.format_string(val, formatter) text = self.format_string(val, formatter)
decoded = ( decoded = (
unicode(key) + '=' str(key) + '='
+ EntitySubstitution.quoted_attribute_value(text)) + EntitySubstitution.quoted_attribute_value(text))
attrs.append(decoded) attrs.append(decoded)
close = '' close = ''
@ -1103,16 +1166,22 @@ class Tag(PageElement):
formatter="minimal"): formatter="minimal"):
"""Renders the contents of this tag as a Unicode string. """Renders the contents of this tag as a Unicode string.
:param indent_level: Each line of the rendering will be
indented this many spaces.
:param eventual_encoding: The tag is destined to be :param eventual_encoding: The tag is destined to be
encoded into this encoding. This method is _not_ encoded into this encoding. This method is _not_
responsible for performing that encoding. This information responsible for performing that encoding. This information
is passed in so that it can be substituted in if the is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's document contains a <META> tag that mentions the document's
encoding. encoding.
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
""" """
# First off, turn a string formatter into a function. This # First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again. # will stop the lookup from happening over and over again.
if not callable(formatter): if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter) formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None) pretty_print = (indent_level is not None)
@ -1137,7 +1206,17 @@ class Tag(PageElement):
def encode_contents( def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"): formatter="minimal"):
"""Renders the contents of this tag as a bytestring.""" """Renders the contents of this tag as a bytestring.
:param indent_level: Each line of the rendering will be
indented this many spaces.
:param eventual_encoding: The bytestring will be in this encoding.
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
"""
contents = self.decode_contents(indent_level, encoding, formatter) contents = self.decode_contents(indent_level, encoding, formatter)
return contents.encode(encoding) return contents.encode(encoding)
@ -1201,26 +1280,57 @@ class Tag(PageElement):
_selector_combinators = ['>', '+', '~'] _selector_combinators = ['>', '+', '~']
_select_debug = False _select_debug = False
def select(self, selector, _candidate_generator=None): def select_one(self, selector):
"""Perform a CSS selection operation on the current element.""" """Perform a CSS selection operation on the current element."""
value = self.select(selector, limit=1)
if value:
return value[0]
return None
def select(self, selector, _candidate_generator=None, limit=None):
"""Perform a CSS selection operation on the current element."""
# Handle grouping selectors if ',' exists, ie: p,a
if ',' in selector:
context = []
for partial_selector in selector.split(','):
partial_selector = partial_selector.strip()
if partial_selector == '':
raise ValueError('Invalid group selection syntax: %s' % selector)
candidates = self.select(partial_selector, limit=limit)
for candidate in candidates:
if candidate not in context:
context.append(candidate)
if limit and len(context) >= limit:
break
return context
tokens = selector.split() tokens = selector.split()
current_context = [self] current_context = [self]
if tokens[-1] in self._selector_combinators: if tokens[-1] in self._selector_combinators:
raise ValueError( raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1]) 'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug: if self._select_debug:
print 'Running CSS selector "%s"' % selector print('Running CSS selector "%s"' % selector)
for index, token in enumerate(tokens): for index, token in enumerate(tokens):
if self._select_debug: new_context = []
print ' Considering token "%s"' % token new_context_ids = set([])
recursive_candidate_generator = None
tag_name = None
if tokens[index-1] in self._selector_combinators: if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it. # This token was consumed by the previous combinator. Skip it.
if self._select_debug: if self._select_debug:
print ' Token was consumed by the previous combinator.' print(' Token was consumed by the previous combinator.')
continue continue
if self._select_debug:
print(' Considering token "%s"' % token)
recursive_candidate_generator = None
tag_name = None
# Each operation corresponds to a checker function, a rule # Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the # for determining whether a candidate matches the
# selector. Candidates are generated by the active # selector. Candidates are generated by the active
@ -1256,35 +1366,38 @@ class Tag(PageElement):
"A pseudo-class must be prefixed with a tag name.") "A pseudo-class must be prefixed with a tag name.")
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = [] found = []
if pseudo_attributes is not None: if pseudo_attributes is None:
pseudo_type = pseudo
pseudo_value = None
else:
pseudo_type, pseudo_value = pseudo_attributes.groups() pseudo_type, pseudo_value = pseudo_attributes.groups()
if pseudo_type == 'nth-of-type': if pseudo_type == 'nth-of-type':
try: try:
pseudo_value = int(pseudo_value) pseudo_value = int(pseudo_value)
except: except:
raise NotImplementedError(
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
if pseudo_value < 1:
raise ValueError(
'nth-of-type pseudo-class value must be at least 1.')
class Counter(object):
def __init__(self, destination):
self.count = 0
self.destination = destination
def nth_child_of_type(self, tag):
self.count += 1
if self.count == self.destination:
return True
if self.count > self.destination:
# Stop the generator that's sending us
# these things.
raise StopIteration()
return False
checker = Counter(pseudo_value).nth_child_of_type
else:
raise NotImplementedError( raise NotImplementedError(
'Only the following pseudo-classes are implemented: nth-of-type.') 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
if pseudo_value < 1:
raise ValueError(
'nth-of-type pseudo-class value must be at least 1.')
class Counter(object):
def __init__(self, destination):
self.count = 0
self.destination = destination
def nth_child_of_type(self, tag):
self.count += 1
if self.count == self.destination:
return True
if self.count > self.destination:
# Stop the generator that's sending us
# these things.
raise StopIteration()
return False
checker = Counter(pseudo_value).nth_child_of_type
else:
raise NotImplementedError(
'Only the following pseudo-classes are implemented: nth-of-type.')
elif token == '*': elif token == '*':
# Star selector -- matches everything # Star selector -- matches everything
@ -1311,7 +1424,6 @@ class Tag(PageElement):
else: else:
raise ValueError( raise ValueError(
'Unsupported or invalid CSS selector: "%s"' % token) 'Unsupported or invalid CSS selector: "%s"' % token)
if recursive_candidate_generator: if recursive_candidate_generator:
# This happens when the selector looks like "> foo". # This happens when the selector looks like "> foo".
# #
@ -1325,14 +1437,14 @@ class Tag(PageElement):
next_token = tokens[index+1] next_token = tokens[index+1]
def recursive_select(tag): def recursive_select(tag):
if self._select_debug: if self._select_debug:
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
print '-' * 40 print('-' * 40)
for i in tag.select(next_token, recursive_candidate_generator): for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug: if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
yield i yield i
if self._select_debug: if self._select_debug:
print '-' * 40 print('-' * 40)
_use_candidate_generator = recursive_select _use_candidate_generator = recursive_select
elif _candidate_generator is None: elif _candidate_generator is None:
# By default, a tag's candidates are all of its # By default, a tag's candidates are all of its
@ -1343,7 +1455,7 @@ class Tag(PageElement):
check = "[any]" check = "[any]"
else: else:
check = tag_name check = tag_name
print ' Default candidate generator, tag name="%s"' % check print(' Default candidate generator, tag name="%s"' % check)
if self._select_debug: if self._select_debug:
# This is redundant with later code, but it stops # This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the # a bunch of bogus tags from cluttering up the
@ -1361,12 +1473,11 @@ class Tag(PageElement):
else: else:
_use_candidate_generator = _candidate_generator _use_candidate_generator = _candidate_generator
new_context = [] count = 0
new_context_ids = set([])
for tag in current_context: for tag in current_context:
if self._select_debug: if self._select_debug:
print " Running candidate generator on %s %s" % ( print(" Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs)) tag.name, repr(tag.attrs)))
for candidate in _use_candidate_generator(tag): for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag): if not isinstance(candidate, Tag):
continue continue
@ -1381,21 +1492,24 @@ class Tag(PageElement):
break break
if checker is None or result: if checker is None or result:
if self._select_debug: if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
if id(candidate) not in new_context_ids: if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once, # If a tag matches a selector more than once,
# don't include it in the context more than once. # don't include it in the context more than once.
new_context.append(candidate) new_context.append(candidate)
new_context_ids.add(id(candidate)) new_context_ids.add(id(candidate))
if limit and len(new_context) >= limit:
break
elif self._select_debug: elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
current_context = new_context current_context = new_context
if self._select_debug: if self._select_debug:
print "Final verdict:" print("Final verdict:")
for i in current_context: for i in current_context:
print " %s %s" % (i.name, i.attrs) print(" %s %s" % (i.name, i.attrs))
return current_context return current_context
# Old names for backwards compatibility # Old names for backwards compatibility
@ -1439,7 +1553,7 @@ class SoupStrainer(object):
else: else:
attrs = kwargs attrs = kwargs
normalized_attrs = {} normalized_attrs = {}
for key, value in attrs.items(): for key, value in list(attrs.items()):
normalized_attrs[key] = self._normalize_search_value(value) normalized_attrs[key] = self._normalize_search_value(value)
self.attrs = normalized_attrs self.attrs = normalized_attrs
@ -1448,7 +1562,7 @@ class SoupStrainer(object):
def _normalize_search_value(self, value): def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a # Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None. # regular expression, a boolean, or None.
if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match')
or isinstance(value, bool) or value is None): or isinstance(value, bool) or value is None):
return value return value
@ -1461,7 +1575,7 @@ class SoupStrainer(object):
new_value = [] new_value = []
for v in value: for v in value:
if (hasattr(v, '__iter__') and not isinstance(v, bytes) if (hasattr(v, '__iter__') and not isinstance(v, bytes)
and not isinstance(v, unicode)): and not isinstance(v, str)):
# This is almost certainly the user's mistake. In the # This is almost certainly the user's mistake. In the
# interests of avoiding infinite loops, we'll let # interests of avoiding infinite loops, we'll let
# it through as-is rather than doing a recursive call. # it through as-is rather than doing a recursive call.
@ -1473,7 +1587,7 @@ class SoupStrainer(object):
# Otherwise, convert it into a Unicode string. # Otherwise, convert it into a Unicode string.
# The unicode(str()) thing is so this will do the same thing on Python 2 # The unicode(str()) thing is so this will do the same thing on Python 2
# and Python 3. # and Python 3.
return unicode(str(value)) return str(str(value))
def __str__(self): def __str__(self):
if self.text: if self.text:
@ -1527,7 +1641,7 @@ class SoupStrainer(object):
found = None found = None
# If given a list of items, scan it for a text element that # If given a list of items, scan it for a text element that
# matches. # matches.
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
for element in markup: for element in markup:
if isinstance(element, NavigableString) \ if isinstance(element, NavigableString) \
and self.search(element): and self.search(element):
@ -1540,7 +1654,7 @@ class SoupStrainer(object):
found = self.search_tag(markup) found = self.search_tag(markup)
# If it's text, make sure the text matches. # If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \ elif isinstance(markup, NavigableString) or \
isinstance(markup, basestring): isinstance(markup, str):
if not self.name and not self.attrs and self._matches(markup, self.text): if not self.name and not self.attrs and self._matches(markup, self.text):
found = markup found = markup
else: else:
@ -1554,7 +1668,7 @@ class SoupStrainer(object):
if isinstance(markup, list) or isinstance(markup, tuple): if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute # This should only happen when searching a multi-valued attribute
# like 'class'. # like 'class'.
if (isinstance(match_against, unicode) if (isinstance(match_against, str)
and ' ' in match_against): and ' ' in match_against):
# A bit of a special case. If they try to match "foo # A bit of a special case. If they try to match "foo
# bar" on a multivalue attribute's value, only accept # bar" on a multivalue attribute's value, only accept
@ -1589,7 +1703,7 @@ class SoupStrainer(object):
# None matches None, False, an empty string, an empty list, and so on. # None matches None, False, an empty string, an empty list, and so on.
return not match_against return not match_against
if isinstance(match_against, unicode): if isinstance(match_against, str):
# Exact string match # Exact string match
return markup == match_against return markup == match_against

View File

@ -1,5 +1,8 @@
"""Helper classes for tests.""" """Helper classes for tests."""
__license__ = "MIT"
import pickle
import copy import copy
import functools import functools
import unittest import unittest
@ -43,6 +46,16 @@ class SoupTest(unittest.TestCase):
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
def assertConnectedness(self, element):
"""Ensure that next_element and previous_element are properly
set for all descendants of the given element.
"""
earlier = None
for e in element.descendants:
if earlier:
self.assertEqual(e, earlier.next_element)
self.assertEqual(earlier, e.previous_element)
earlier = e
class HTMLTreeBuilderSmokeTest(object): class HTMLTreeBuilderSmokeTest(object):
@ -54,6 +67,15 @@ class HTMLTreeBuilderSmokeTest(object):
markup in these tests, there's not much room for interpretation. markup in these tests, there's not much room for interpretation.
""" """
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.__class__, BeautifulSoup)
self.assertEqual(loaded.decode(), tree.decode())
def assertDoctypeHandled(self, doctype_fragment): def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly.""" """Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment) doctype_str, soup = self._document_with_doctype(doctype_fragment)
@ -114,6 +136,11 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""), soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b"")) markup.replace(b"\n", b""))
def test_processing_instruction(self):
markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_deepcopy(self): def test_deepcopy(self):
"""Make sure you can copy the tree builder. """Make sure you can copy the tree builder.
@ -155,6 +182,23 @@ class HTMLTreeBuilderSmokeTest(object):
def test_nested_formatting_elements(self): def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>") self.assertSoupEquals("<em><em></em></em>")
def test_double_head(self):
html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
soup = self.soup(html)
self.assertEqual("text/javascript", soup.find('script')['type'])
def test_comment(self): def test_comment(self):
# Comments are represented as Comment objects. # Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>" markup = "<p>foo<!--foobar-->baz</p>"
@ -221,18 +265,26 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class']) self.assertEqual(["css"], soup.div.div['class'])
def test_multivalued_attribute_on_html(self):
# html5lib uses a different API to set the attributes ot the
# <html> tag. This has caused problems with multivalued
# attributes.
markup = '<html class="a b"></html>'
soup = self.soup(markup)
self.assertEqual(["a", "b"], soup.html['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self): def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self): def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self): def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect) self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@ -243,7 +295,7 @@ class HTMLTreeBuilderSmokeTest(object):
'<p>I said "good day!"</p>') '<p>I said "good day!"</p>')
def test_out_of_range_entity(self): def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}" expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect) self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect) self.assertSoupEquals("&#1000000000;", expect)
@ -253,6 +305,35 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>") soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name) self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name) self.assertEqual("p", soup.p.name)
self.assertConnectedness(soup)
def test_head_tag_between_head_and_body(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<html><head></head>
<link></link>
<body>foo</body>
</html>
"""
soup = self.soup(content)
self.assertNotEqual(None, soup.html.body)
self.assertConnectedness(soup)
def test_multiple_copies_of_a_tag(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<!DOCTYPE html>
<html>
<body>
<article id="a" >
<div><a href="1"></div>
<footer>
<a href="2"></a>
</footer>
</article>
</body>
</html>
"""
soup = self.soup(content)
self.assertConnectedness(soup.article)
def test_basic_namespaces(self): def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the """Parsers don't need to *understand* namespaces, but at the
@ -285,9 +366,9 @@ class HTMLTreeBuilderSmokeTest(object):
# A seemingly innocuous document... but it's in Unicode! And # A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the # it contains characters that can't be represented in the
# encoding found in the declaration! The horror! # encoding found in the declaration! The horror!
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self): def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers.""" """Parsers should be able to work with SoupStrainers."""
@ -327,7 +408,7 @@ class HTMLTreeBuilderSmokeTest(object):
# Both XML and HTML entities are converted to Unicode characters # Both XML and HTML entities are converted to Unicode characters
# during parsing. # during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>" expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected) self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self): def test_smart_quotes_converted_on_the_way_in(self):
@ -337,15 +418,15 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(quote) soup = self.soup(quote)
self.assertEqual( self.assertEqual(
soup.p.string, soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self): def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>") soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self): def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8") expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text) soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected) self.assertEqual(soup.p.encode("utf-8"), expected)
@ -354,7 +435,7 @@ class HTMLTreeBuilderSmokeTest(object):
# easy-to-understand document. # easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use # That's because we're going to encode it into ISO-Latin-1, and use
# that to test. # that to test.
@ -463,11 +544,25 @@ class HTMLTreeBuilderSmokeTest(object):
class XMLTreeBuilderSmokeTest(object): class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.__class__, BeautifulSoup)
self.assertEqual(loaded.decode(), tree.decode())
def test_docstring_generated(self): def test_docstring_generated(self):
soup = self.soup("<root/>") soup = self.soup("<root/>")
self.assertEqual( self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>') soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
def test_xml_declaration(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_real_xhtml_document(self): def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in.""" """A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?> markup = b"""<?xml version="1.0" encoding="utf-8"?>
@ -485,7 +580,7 @@ class XMLTreeBuilderSmokeTest(object):
<script type="text/javascript"> <script type="text/javascript">
</script> </script>
""" """
soup = BeautifulSoup(doc, "xml") soup = BeautifulSoup(doc, "lxml-xml")
# lxml would have stripped this while parsing, but we can add # lxml would have stripped this while parsing, but we can add
# it later. # it later.
soup.script.string = 'console.log("< < hey > > ");' soup.script.string = 'console.log("< < hey > > ");'
@ -493,15 +588,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded) self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self): def test_can_parse_unicode_document(self):
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self): def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual( self.assertEqual(
unicode(soup.rss), markup) str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self): def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>") soup = self.soup("<root/>")
@ -532,17 +627,17 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self): def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup) self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self): def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup) self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self): def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>' markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup) self.assertEqual(str(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5.""" """Smoke test for a tree builder that supports HTML5."""

View File

@ -1,6 +1,7 @@
"""Tests of the builder registry.""" """Tests of the builder registry."""
import unittest import unittest
import warnings
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.builder import ( from bs4.builder import (
@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase):
HTMLParserTreeBuilder) HTMLParserTreeBuilder)
def test_beautifulsoup_constructor_does_lookup(self): def test_beautifulsoup_constructor_does_lookup(self):
# You can pass in a string.
BeautifulSoup("", features="html") with warnings.catch_warnings(record=True) as w:
# Or a list of strings. # This will create a warning about not explicitly
BeautifulSoup("", features=["html", "fast"]) # specifying a parser, but we'll ignore it.
# You can pass in a string.
BeautifulSoup("", features="html")
# Or a list of strings.
BeautifulSoup("", features=["html", "fast"])
# You'll get an exception if BS can't find an appropriate # You'll get an exception if BS can't find an appropriate
# builder. # builder.

View File

@ -5,7 +5,7 @@ import warnings
try: try:
from bs4.builder import HTML5TreeBuilder from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True HTML5LIB_PRESENT = True
except ImportError, e: except ImportError as e:
HTML5LIB_PRESENT = False HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer from bs4.element import SoupStrainer
from bs4.testing import ( from bs4.testing import (
@ -74,12 +74,25 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
def test_reparented_markup(self): def test_reparented_markup(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>' markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode()) self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p'))) self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_ends_with_whitespace(self): def test_reparented_markup_ends_with_whitespace(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n' markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode()) self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p'))) self.assertEqual(2, len(soup.find_all('p')))
def test_processing_instruction(self):
"""Processing instructions become comments."""
markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup)
assert str(soup).startswith("<!--?PITarget PIContent?-->")
def test_cloned_multivalue_node(self):
markup = b"""<a class="my_class"><p></a>"""
soup = self.soup(markup)
a1, a2 = soup.find_all('a')
self.assertEqual(a1, a2)
assert a1 is not a2

View File

@ -1,6 +1,8 @@
"""Tests to ensure that the html.parser tree builder generates good """Tests to ensure that the html.parser tree builder generates good
trees.""" trees."""
from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder from bs4.builder import HTMLParserTreeBuilder
@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
def test_namespaced_public_doctype(self): def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one. # html.parser can't handle namespaced doctypes, so skip this one.
pass pass
def test_builder_is_pickled(self):
"""Unlike most tree builders, HTMLParserTreeBuilder and will
be restored after pickling.
"""
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))

View File

@ -7,7 +7,7 @@ try:
import lxml.etree import lxml.etree
LXML_PRESENT = True LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e: except ImportError as e:
LXML_PRESENT = False LXML_PRESENT = False
LXML_VERSION = (0,) LXML_VERSION = (0,)
@ -62,24 +62,9 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
# if one is installed. # if one is installed.
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />") soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b)) self.assertEqual("<b/>", str(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
def test_real_xhtml_document(self):
"""lxml strips the XML definition from an XHTML doc, which is fine."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b''),
markup.replace(b'\n', b'').replace(
b'<?xml version="1.0" encoding="utf-8"?>', b''))
@skipIf( @skipIf(
not LXML_PRESENT, not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.") "lxml seems not to be present, not testing its XML tree builder.")

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole.""" """Tests of Beautiful Soup as a whole."""
from pdb import set_trace
import logging import logging
import unittest import unittest
import sys import sys
@ -20,6 +21,7 @@ import bs4.dammit
from bs4.dammit import ( from bs4.dammit import (
EntitySubstitution, EntitySubstitution,
UnicodeDammit, UnicodeDammit,
EncodingDetector,
) )
from bs4.testing import ( from bs4.testing import (
SoupTest, SoupTest,
@ -30,7 +32,7 @@ import warnings
try: try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True LXML_PRESENT = True
except ImportError, e: except ImportError as e:
LXML_PRESENT = False LXML_PRESENT = False
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
@ -39,17 +41,43 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest): class TestConstructor(SoupTest):
def test_short_unicode_input(self): def test_short_unicode_input(self):
data = u"<h1>éé</h1>" data = "<h1>éé</h1>"
soup = self.soup(data) soup = self.soup(data)
self.assertEqual(u"éé", soup.h1.string) self.assertEqual("éé", soup.h1.string)
def test_embedded_null(self): def test_embedded_null(self):
data = u"<h1>foo\0bar</h1>" data = "<h1>foo\0bar</h1>"
soup = self.soup(data) soup = self.soup(data)
self.assertEqual(u"foo\0bar", soup.h1.string) self.assertEqual("foo\0bar", soup.h1.string)
def test_exclude_encodings(self):
utf8_data = "Räksmörgås".encode("utf-8")
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
class TestDeprecatedConstructorArguments(SoupTest): class TestWarnings(SoupTest):
def _no_parser_specified(self, s, is_there=True):
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
self.assertTrue(v)
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html.parser")
self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self): def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w: with warnings.catch_warnings(record=True) as w:
@ -117,9 +145,9 @@ class TestEntitySubstitution(unittest.TestCase):
def test_simple_html_substitution(self): def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entites # Unicode characters corresponding to named HTML entites
# are substituted, and no others. # are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar" s = "foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s), self.assertEqual(self.sub.substitute_html(s),
u"foo&forall;\N{SNOWMAN}&otilde;bar") "foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self): def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we # MS smart quotes are a common source of frustration, so we
@ -184,7 +212,7 @@ class TestEncodingConversion(SoupTest):
def setUp(self): def setUp(self):
super(TestEncodingConversion, self).setUp() super(TestEncodingConversion, self).setUp()
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8") self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like. # Just so you know what it looks like.
self.assertEqual( self.assertEqual(
@ -204,7 +232,7 @@ class TestEncodingConversion(SoupTest):
ascii = b"<foo>a</foo>" ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii) soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode() unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode)) self.assertTrue(isinstance(unicode_output, str))
self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally: finally:
@ -216,7 +244,7 @@ class TestEncodingConversion(SoupTest):
# is not set. # is not set.
soup_from_unicode = self.soup(self.unicode_data) soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None) self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self): def test_utf8_in_unicode_out(self):
@ -224,7 +252,7 @@ class TestEncodingConversion(SoupTest):
# attribute is set. # attribute is set.
soup_from_utf8 = self.soup(self.utf8_data) soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
def test_utf8_out(self): def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8. # The internal data structures can be encoded as UTF-8.
@ -235,14 +263,14 @@ class TestEncodingConversion(SoupTest):
PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self): def test_attribute_name_containing_unicode_characters(self):
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase): class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit.""" """Standalone tests of UnicodeDammit."""
def test_unicode_input(self): def test_unicode_input(self):
markup = u"I'm already Unicode! \N{SNOWMAN}" markup = "I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup) dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup) self.assertEqual(dammit.unicode_markup, markup)
@ -250,7 +278,7 @@ class TestUnicodeDammit(unittest.TestCase):
markup = b"<foo>\x91\x92\x93\x94</foo>" markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup) dammit = UnicodeDammit(markup)
self.assertEqual( self.assertEqual(
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self): def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>" markup = b"<foo>\x91\x92\x93\x94</foo>"
@ -271,16 +299,17 @@ class TestUnicodeDammit(unittest.TestCase):
dammit.unicode_markup, """<foo>''""</foo>""") dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self): def test_detect_utf8(self):
utf8 = b"\xc3\xa9" utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8) dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.unicode_markup, u'\xe9')
self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self): def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9" hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self): def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
@ -289,16 +318,36 @@ class TestUnicodeDammit(unittest.TestCase):
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self): def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8") utf8_data = "Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self): def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8") utf8_data = "Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']: for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding]) dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
utf8_data = "Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
# And if we exclude that, there is no valid guess at all.
dammit = UnicodeDammit(
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
self.assertEqual(dammit.original_encoding, None)
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings)
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self): def test_detect_html5_style_meta_tag(self):
for data in ( for data in (
@ -337,7 +386,7 @@ class TestUnicodeDammit(unittest.TestCase):
bs4.dammit.chardet_dammit = noop bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc) dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters) self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup) self.assertTrue("\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser") soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters) self.assertTrue(soup.contains_replacement_characters)
@ -349,17 +398,17 @@ class TestUnicodeDammit(unittest.TestCase):
# A document written in UTF-16LE will have its byte order marker stripped. # A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data) dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding) self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self): def test_detwingle(self):
# Here's a UTF8 document. # Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document. # Here's a Windows-1252 document.
windows_1252 = ( windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together. # Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8 doc = utf8 + windows_1252 + utf8
@ -374,7 +423,7 @@ class TestUnicodeDammit(unittest.TestCase):
fixed = UnicodeDammit.detwingle(doc) fixed = UnicodeDammit.detwingle(doc)
self.assertEqual( self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self): def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending # Each of these characters has a UTF-8 representation ending
@ -382,9 +431,9 @@ class TestUnicodeDammit(unittest.TestCase):
# Windows-1252. But our code knows to skip over multibyte # Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed. # UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in ( for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
): ):
input = tricky_unicode_char.encode("utf8") input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93')) self.assertTrue(input.endswith(b'\x93'))

View File

@ -9,6 +9,7 @@ same markup, but all Beautiful Soup trees can be traversed with the
methods tested here. methods tested here.
""" """
from pdb import set_trace
import copy import copy
import pickle import pickle
import re import re
@ -19,8 +20,10 @@ from bs4.builder import (
HTMLParserTreeBuilder, HTMLParserTreeBuilder,
) )
from bs4.element import ( from bs4.element import (
PY3K,
CData, CData,
Comment, Comment,
Declaration,
Doctype, Doctype,
NavigableString, NavigableString,
SoupStrainer, SoupStrainer,
@ -67,8 +70,14 @@ class TestFind(TreeTest):
self.assertEqual(soup.find("b").string, "2") self.assertEqual(soup.find("b").string, "2")
def test_unicode_text_find(self): def test_unicode_text_find(self):
soup = self.soup(u'<h1>Räksmörgås</h1>') soup = self.soup('<h1>Räksmörgås</h1>')
self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
def test_unicode_attribute_find(self):
soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
str(soup)
self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
def test_find_everything(self): def test_find_everything(self):
"""Test an optimization that finds all tags.""" """Test an optimization that finds all tags."""
@ -87,16 +96,17 @@ class TestFindAll(TreeTest):
"""You can search the tree for text nodes.""" """You can search the tree for text nodes."""
soup = self.soup("<html>Foo<b>bar</b>\xbb</html>") soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
# Exact match. # Exact match.
self.assertEqual(soup.find_all(text="bar"), [u"bar"]) self.assertEqual(soup.find_all(string="bar"), ["bar"])
self.assertEqual(soup.find_all(text="bar"), ["bar"])
# Match any of a number of strings. # Match any of a number of strings.
self.assertEqual( self.assertEqual(
soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"]) soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
# Match a regular expression. # Match a regular expression.
self.assertEqual(soup.find_all(text=re.compile('.*')), self.assertEqual(soup.find_all(text=re.compile('.*')),
[u"Foo", u"bar", u'\xbb']) ["Foo", "bar", '\xbb'])
# Match anything. # Match anything.
self.assertEqual(soup.find_all(text=True), self.assertEqual(soup.find_all(text=True),
[u"Foo", u"bar", u'\xbb']) ["Foo", "bar", '\xbb'])
def test_find_all_limit(self): def test_find_all_limit(self):
"""You can limit the number of items returned by find_all.""" """You can limit the number of items returned by find_all."""
@ -227,8 +237,8 @@ class TestFindAllByAttribute(TreeTest):
["Matching a.", "Matching b."]) ["Matching a.", "Matching b."])
def test_find_all_by_utf8_attribute_value(self): def test_find_all_by_utf8_attribute_value(self):
peace = u"םולש".encode("utf8") peace = "םולש".encode("utf8")
data = u'<a title="םולש"></a>'.encode("utf8") data = '<a title="םולש"></a>'.encode("utf8")
soup = self.soup(data) soup = self.soup(data)
self.assertEqual([soup.a], soup.find_all(title=peace)) self.assertEqual([soup.a], soup.find_all(title=peace))
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
@ -688,7 +698,7 @@ class TestTagCreation(SoupTest):
def test_tag_inherits_self_closing_rules_from_builder(self): def test_tag_inherits_self_closing_rules_from_builder(self):
if XML_BUILDER_PRESENT: if XML_BUILDER_PRESENT:
xml_soup = BeautifulSoup("", "xml") xml_soup = BeautifulSoup("", "lxml-xml")
xml_br = xml_soup.new_tag("br") xml_br = xml_soup.new_tag("br")
xml_p = xml_soup.new_tag("p") xml_p = xml_soup.new_tag("p")
@ -697,7 +707,7 @@ class TestTagCreation(SoupTest):
self.assertEqual(b"<br/>", xml_br.encode()) self.assertEqual(b"<br/>", xml_br.encode())
self.assertEqual(b"<p/>", xml_p.encode()) self.assertEqual(b"<p/>", xml_p.encode())
html_soup = BeautifulSoup("", "html") html_soup = BeautifulSoup("", "html.parser")
html_br = html_soup.new_tag("br") html_br = html_soup.new_tag("br")
html_p = html_soup.new_tag("p") html_p = html_soup.new_tag("p")
@ -773,6 +783,14 @@ class TestTreeModification(SoupTest):
new_a = a.unwrap() new_a = a.unwrap()
self.assertEqual(a, new_a) self.assertEqual(a, new_a)
def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
a = soup.a
a.extract()
self.assertEqual(None, a.parent)
self.assertRaises(ValueError, a.unwrap)
self.assertRaises(ValueError, a.replace_with, soup.c)
def test_replace_tag_with_itself(self): def test_replace_tag_with_itself(self):
text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
soup = self.soup(text) soup = self.soup(text)
@ -1067,6 +1085,31 @@ class TestTreeModification(SoupTest):
self.assertEqual(foo_2, soup.a.string) self.assertEqual(foo_2, soup.a.string)
self.assertEqual(bar_2, soup.b.string) self.assertEqual(bar_2, soup.b.string)
def test_extract_multiples_of_same_tag(self):
soup = self.soup("""
<html>
<head>
<script>foo</script>
</head>
<body>
<script>bar</script>
<a></a>
</body>
<script>baz</script>
</html>""")
[soup.script.extract() for i in soup.find_all("script")]
self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
soup = self.soup(
'<html>\n'
'<body>hi</body>\n'
'</html>')
soup.find('body').extract()
self.assertEqual(None, soup.find('body'))
def test_clear(self): def test_clear(self):
"""Tag.clear()""" """Tag.clear()"""
soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>") soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
@ -1287,27 +1330,72 @@ class TestPersistence(SoupTest):
def test_unicode_pickle(self): def test_unicode_pickle(self):
# A tree containing Unicode characters can be pickled. # A tree containing Unicode characters can be pickled.
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
loaded = pickle.loads(dumped) loaded = pickle.loads(dumped)
self.assertEqual(loaded.decode(), soup.decode()) self.assertEqual(loaded.decode(), soup.decode())
def test_copy_navigablestring_is_not_attached_to_tree(self):
html = "<b>Foo<a></a></b><b>Bar</b>"
soup = self.soup(html)
s1 = soup.find(string="Foo")
s2 = copy.copy(s1)
self.assertEqual(s1, s2)
self.assertEqual(None, s2.parent)
self.assertEqual(None, s2.next_element)
self.assertNotEqual(None, s1.next_sibling)
self.assertEqual(None, s2.next_sibling)
self.assertEqual(None, s2.previous_element)
def test_copy_navigablestring_subclass_has_same_type(self):
html = "<b><!--Foo--></b>"
soup = self.soup(html)
s1 = soup.string
s2 = copy.copy(s1)
self.assertEqual(s1, s2)
self.assertTrue(isinstance(s2, Comment))
def test_copy_entire_soup(self):
html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
soup = self.soup(html)
soup_copy = copy.copy(soup)
self.assertEqual(soup, soup_copy)
def test_copy_tag_copies_contents(self):
html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
soup = self.soup(html)
div = soup.div
div_copy = copy.copy(div)
# The two tags look the same, and evaluate to equal.
self.assertEqual(str(div), str(div_copy))
self.assertEqual(div, div_copy)
# But they're not the same object.
self.assertFalse(div is div_copy)
# And they don't have the same relation to the parse tree. The
# copy is not associated with a parse tree at all.
self.assertEqual(None, div_copy.parent)
self.assertEqual(None, div_copy.previous_element)
self.assertEqual(None, div_copy.find(string='Bar').next_element)
self.assertNotEqual(None, div.find(string='Bar').next_element)
class TestSubstitutions(SoupTest): class TestSubstitutions(SoupTest):
def test_default_formatter_is_minimal(self): def test_default_formatter_is_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter="minimal") decoded = soup.decode(formatter="minimal")
# The < is converted back into &lt; but the e-with-acute is left alone. # The < is converted back into &lt; but the e-with-acute is left alone.
self.assertEqual( self.assertEqual(
decoded, decoded,
self.document_for( self.document_for(
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_html(self): def test_formatter_html(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter="html") decoded = soup.decode(formatter="html")
self.assertEqual( self.assertEqual(
@ -1315,49 +1403,49 @@ class TestSubstitutions(SoupTest):
self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>")) self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
def test_formatter_minimal(self): def test_formatter_minimal(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter="minimal") decoded = soup.decode(formatter="minimal")
# The < is converted back into &lt; but the e-with-acute is left alone. # The < is converted back into &lt; but the e-with-acute is left alone.
self.assertEqual( self.assertEqual(
decoded, decoded,
self.document_for( self.document_for(
u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>")) "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
def test_formatter_null(self): def test_formatter_null(self):
markup = u"<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>" markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter=None) decoded = soup.decode(formatter=None)
# Neither the angle brackets nor the e-with-acute are converted. # Neither the angle brackets nor the e-with-acute are converted.
# This is not valid HTML, but it's what the user wanted. # This is not valid HTML, but it's what the user wanted.
self.assertEqual(decoded, self.assertEqual(decoded,
self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
def test_formatter_custom(self): def test_formatter_custom(self):
markup = u"<b>&lt;foo&gt;</b><b>bar</b>" markup = "<b>&lt;foo&gt;</b><b>bar</b>"
soup = self.soup(markup) soup = self.soup(markup)
decoded = soup.decode(formatter = lambda x: x.upper()) decoded = soup.decode(formatter = lambda x: x.upper())
# Instead of normal entity conversion code, the custom # Instead of normal entity conversion code, the custom
# callable is called on every string. # callable is called on every string.
self.assertEqual( self.assertEqual(
decoded, decoded,
self.document_for(u"<b><FOO></b><b>BAR</b>")) self.document_for("<b><FOO></b><b>BAR</b>"))
def test_formatter_is_run_on_attribute_values(self): def test_formatter_is_run_on_attribute_values(self):
markup = u'<a href="http://a.com?a=b&c=é">e</a>' markup = '<a href="http://a.com?a=b&c=é">e</a>'
soup = self.soup(markup) soup = self.soup(markup)
a = soup.a a = soup.a
expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>' expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
self.assertEqual(expect_minimal, a.decode()) self.assertEqual(expect_minimal, a.decode())
self.assertEqual(expect_minimal, a.decode(formatter="minimal")) self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>' expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
self.assertEqual(expect_html, a.decode(formatter="html")) self.assertEqual(expect_html, a.decode(formatter="html"))
self.assertEqual(markup, a.decode(formatter=None)) self.assertEqual(markup, a.decode(formatter=None))
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
def test_formatter_skips_script_tag_for_html_documents(self): def test_formatter_skips_script_tag_for_html_documents(self):
@ -1366,7 +1454,7 @@ class TestSubstitutions(SoupTest):
console.log("< < hey > > "); console.log("< < hey > > ");
</script> </script>
""" """
encoded = BeautifulSoup(doc).encode() encoded = BeautifulSoup(doc, 'html.parser').encode()
self.assertTrue(b"< < hey > >" in encoded) self.assertTrue(b"< < hey > >" in encoded)
def test_formatter_skips_style_tag_for_html_documents(self): def test_formatter_skips_style_tag_for_html_documents(self):
@ -1375,7 +1463,7 @@ class TestSubstitutions(SoupTest):
console.log("< < hey > > "); console.log("< < hey > > ");
</style> </style>
""" """
encoded = BeautifulSoup(doc).encode() encoded = BeautifulSoup(doc, 'html.parser').encode()
self.assertTrue(b"< < hey > >" in encoded) self.assertTrue(b"< < hey > >" in encoded)
def test_prettify_leaves_preformatted_text_alone(self): def test_prettify_leaves_preformatted_text_alone(self):
@ -1383,24 +1471,24 @@ class TestSubstitutions(SoupTest):
# Everything outside the <pre> tag is reformatted, but everything # Everything outside the <pre> tag is reformatted, but everything
# inside is left alone. # inside is left alone.
self.assertEqual( self.assertEqual(
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>', '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
soup.div.prettify()) soup.div.prettify())
def test_prettify_accepts_formatter(self): def test_prettify_accepts_formatter(self):
soup = BeautifulSoup("<html><body>foo</body></html>") soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
pretty = soup.prettify(formatter = lambda x: x.upper()) pretty = soup.prettify(formatter = lambda x: x.upper())
self.assertTrue("FOO" in pretty) self.assertTrue("FOO" in pretty)
def test_prettify_outputs_unicode_by_default(self): def test_prettify_outputs_unicode_by_default(self):
soup = self.soup("<a></a>") soup = self.soup("<a></a>")
self.assertEqual(unicode, type(soup.prettify())) self.assertEqual(str, type(soup.prettify()))
def test_prettify_can_encode_data(self): def test_prettify_can_encode_data(self):
soup = self.soup("<a></a>") soup = self.soup("<a></a>")
self.assertEqual(bytes, type(soup.prettify("utf-8"))) self.assertEqual(bytes, type(soup.prettify("utf-8")))
def test_html_entity_substitution_off_by_default(self): def test_html_entity_substitution_off_by_default(self):
markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
soup = self.soup(markup) soup = self.soup(markup)
encoded = soup.b.encode("utf-8") encoded = soup.b.encode("utf-8")
self.assertEqual(encoded, markup.encode('utf-8')) self.assertEqual(encoded, markup.encode('utf-8'))
@ -1444,45 +1532,53 @@ class TestEncoding(SoupTest):
"""Test the ability to encode objects into strings.""" """Test the ability to encode objects into strings."""
def test_unicode_string_can_be_encoded(self): def test_unicode_string_can_be_encoded(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual(soup.b.string.encode("utf-8"), self.assertEqual(soup.b.string.encode("utf-8"),
u"\N{SNOWMAN}".encode("utf-8")) "\N{SNOWMAN}".encode("utf-8"))
def test_tag_containing_unicode_string_can_be_encoded(self): def test_tag_containing_unicode_string_can_be_encoded(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual( self.assertEqual(
soup.b.encode("utf-8"), html.encode("utf-8")) soup.b.encode("utf-8"), html.encode("utf-8"))
def test_encoding_substitutes_unrecognized_characters_by_default(self): def test_encoding_substitutes_unrecognized_characters_by_default(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>") self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
def test_encoding_can_be_made_strict(self): def test_encoding_can_be_made_strict(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertRaises( self.assertRaises(
UnicodeEncodeError, soup.encode, "ascii", errors="strict") UnicodeEncodeError, soup.encode, "ascii", errors="strict")
def test_decode_contents(self): def test_decode_contents(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
def test_encode_contents(self): def test_encode_contents(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual( self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
encoding="utf8")) encoding="utf8"))
def test_deprecated_renderContents(self): def test_deprecated_renderContents(self):
html = u"<b>\N{SNOWMAN}</b>" html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html) soup = self.soup(html)
self.assertEqual( self.assertEqual(
u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
def test_repr(self):
html = "<b>\N{SNOWMAN}</b>"
soup = self.soup(html)
if PY3K:
self.assertEqual(html, repr(soup))
else:
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
class TestNavigableStringSubclasses(SoupTest): class TestNavigableStringSubclasses(SoupTest):
@ -1522,6 +1618,9 @@ class TestNavigableStringSubclasses(SoupTest):
soup.insert(1, doctype) soup.insert(1, doctype)
self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
def test_declaration(self):
d = Declaration("foo")
self.assertEqual("<?foo?>", d.output_ready())
class TestSoupSelector(TreeTest): class TestSoupSelector(TreeTest):
@ -1534,7 +1633,7 @@ class TestSoupSelector(TreeTest):
<link rel="stylesheet" href="blah.css" type="text/css" id="l1"> <link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head> </head>
<body> <body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy"> <div id="main" class="fancy">
<div id="inner"> <div id="inner">
<h1 id="header1">An H1</h1> <h1 id="header1">An H1</h1>
@ -1552,8 +1651,18 @@ class TestSoupSelector(TreeTest):
<a href="#" id="s2a1">span2a1</a> <a href="#" id="s2a1">span2a1</a>
</span> </span>
<span class="span3"></span> <span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span> </span>
</div> </div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p> <p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p> <p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p> <p lang="en-us" id="lang-en-us">English US</p>
@ -1565,7 +1674,7 @@ class TestSoupSelector(TreeTest):
""" """
def setUp(self): def setUp(self):
self.soup = BeautifulSoup(self.HTML) self.soup = BeautifulSoup(self.HTML, 'html.parser')
def assertSelects(self, selector, expected_ids): def assertSelects(self, selector, expected_ids):
el_ids = [el['id'] for el in self.soup.select(selector)] el_ids = [el['id'] for el in self.soup.select(selector)]
@ -1587,21 +1696,29 @@ class TestSoupSelector(TreeTest):
els = self.soup.select('title') els = self.soup.select('title')
self.assertEqual(len(els), 1) self.assertEqual(len(els), 1)
self.assertEqual(els[0].name, 'title') self.assertEqual(els[0].name, 'title')
self.assertEqual(els[0].contents, [u'The title']) self.assertEqual(els[0].contents, ['The title'])
def test_one_tag_many(self): def test_one_tag_many(self):
els = self.soup.select('div') els = self.soup.select('div')
self.assertEqual(len(els), 3) self.assertEqual(len(els), 4)
for div in els: for div in els:
self.assertEqual(div.name, 'div') self.assertEqual(div.name, 'div')
el = self.soup.select_one('div')
self.assertEqual('main', el['id'])
def test_select_one_returns_none_if_no_match(self):
match = self.soup.select_one('nonexistenttag')
self.assertEqual(None, match)
def test_tag_in_tag_one(self): def test_tag_in_tag_one(self):
els = self.soup.select('div div') els = self.soup.select('div div')
self.assertSelects('div div', ['inner']) self.assertSelects('div div', ['inner', 'data1'])
def test_tag_in_tag_many(self): def test_tag_in_tag_many(self):
for selector in ('html div', 'html body div', 'body div'): for selector in ('html div', 'html body div', 'body div'):
self.assertSelects(selector, ['main', 'inner', 'footer']) self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
def test_tag_no_match(self): def test_tag_no_match(self):
self.assertEqual(len(self.soup.select('del')), 0) self.assertEqual(len(self.soup.select('del')), 0)
@ -1609,6 +1726,20 @@ class TestSoupSelector(TreeTest):
def test_invalid_tag(self): def test_invalid_tag(self):
self.assertRaises(ValueError, self.soup.select, 'tag%t') self.assertRaises(ValueError, self.soup.select, 'tag%t')
def test_select_dashed_tag_ids(self):
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
def test_select_dashed_by_id(self):
dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
self.assertEqual(dashed[0].name, 'custom-dashed-tag')
self.assertEqual(dashed[0]['id'], 'dash2')
def test_dashed_tag_text(self):
self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
def test_select_dashed_matches_find_all(self):
self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
def test_header_tags(self): def test_header_tags(self):
self.assertSelectMultiple( self.assertSelectMultiple(
('h1', ['header1']), ('h1', ['header1']),
@ -1709,6 +1840,7 @@ class TestSoupSelector(TreeTest):
('[id^="m"]', ['me', 'main']), ('[id^="m"]', ['me', 'main']),
('div[id^="m"]', ['main']), ('div[id^="m"]', ['main']),
('a[id^="m"]', ['me']), ('a[id^="m"]', ['me']),
('div[data-tag^="dashed"]', ['data1'])
) )
def test_attribute_endswith(self): def test_attribute_endswith(self):
@ -1716,8 +1848,8 @@ class TestSoupSelector(TreeTest):
('[href$=".css"]', ['l1']), ('[href$=".css"]', ['l1']),
('link[href$=".css"]', ['l1']), ('link[href$=".css"]', ['l1']),
('link[id$="1"]', ['l1']), ('link[id$="1"]', ['l1']),
('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
('div[id$="1"]', []), ('div[id$="1"]', ['data1']),
('[id$="noending"]', []), ('[id$="noending"]', []),
) )
@ -1730,7 +1862,6 @@ class TestSoupSelector(TreeTest):
('[rel*="notstyle"]', []), ('[rel*="notstyle"]', []),
('link[rel*="notstyle"]', []), ('link[rel*="notstyle"]', []),
('link[href*="bla"]', ['l1']), ('link[href*="bla"]', ['l1']),
('a[href*="http://"]', ['bob', 'me']),
('[href*="http://"]', ['bob', 'me']), ('[href*="http://"]', ['bob', 'me']),
('[id*="p"]', ['pmulti', 'p1']), ('[id*="p"]', ['pmulti', 'p1']),
('div[id*="m"]', ['main']), ('div[id*="m"]', ['main']),
@ -1739,8 +1870,8 @@ class TestSoupSelector(TreeTest):
('[href*=".css"]', ['l1']), ('[href*=".css"]', ['l1']),
('link[href*=".css"]', ['l1']), ('link[href*=".css"]', ['l1']),
('link[id*="1"]', ['l1']), ('link[id*="1"]', ['l1']),
('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
('div[id*="1"]', []), ('div[id*="1"]', ['data1']),
('[id*="noending"]', []), ('[id*="noending"]', []),
# New for this test # New for this test
('[href*="."]', ['bob', 'me', 'l1']), ('[href*="."]', ['bob', 'me', 'l1']),
@ -1748,6 +1879,7 @@ class TestSoupSelector(TreeTest):
('link[href*="."]', ['l1']), ('link[href*="."]', ['l1']),
('div[id*="n"]', ['main', 'inner']), ('div[id*="n"]', ['main', 'inner']),
('div[id*="nn"]', ['inner']), ('div[id*="nn"]', ['inner']),
('div[data-tag*="edval"]', ['data1'])
) )
def test_attribute_exact_or_hypen(self): def test_attribute_exact_or_hypen(self):
@ -1767,18 +1899,27 @@ class TestSoupSelector(TreeTest):
('p[class]', ['p1', 'pmulti']), ('p[class]', ['p1', 'pmulti']),
('[blah]', []), ('[blah]', []),
('p[blah]', []), ('p[blah]', []),
('div[data-tag]', ['data1'])
) )
def test_unsupported_pseudoclass(self):
self.assertRaises(
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
self.assertRaises(
NotImplementedError, self.soup.select, "a:nth-of-type(a)")
def test_nth_of_type(self): def test_nth_of_type(self):
# Try to select first paragraph # Try to select first paragraph
els = self.soup.select('div#inner p:nth-of-type(1)') els = self.soup.select('div#inner p:nth-of-type(1)')
self.assertEqual(len(els), 1) self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text') self.assertEqual(els[0].string, 'Some text')
# Try to select third paragraph # Try to select third paragraph
els = self.soup.select('div#inner p:nth-of-type(3)') els = self.soup.select('div#inner p:nth-of-type(3)')
self.assertEqual(len(els), 1) self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Another') self.assertEqual(els[0].string, 'Another')
# Try to select (non-existent!) fourth paragraph # Try to select (non-existent!) fourth paragraph
els = self.soup.select('div#inner p:nth-of-type(4)') els = self.soup.select('div#inner p:nth-of-type(4)')
@ -1791,7 +1932,7 @@ class TestSoupSelector(TreeTest):
def test_nth_of_type_direct_descendant(self): def test_nth_of_type_direct_descendant(self):
els = self.soup.select('div#inner > p:nth-of-type(1)') els = self.soup.select('div#inner > p:nth-of-type(1)')
self.assertEqual(len(els), 1) self.assertEqual(len(els), 1)
self.assertEqual(els[0].string, u'Some text') self.assertEqual(els[0].string, 'Some text')
def test_id_child_selector_nth_of_type(self): def test_id_child_selector_nth_of_type(self):
self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
@ -1803,7 +1944,7 @@ class TestSoupSelector(TreeTest):
selected = inner.select("div") selected = inner.select("div")
# The <div id="inner"> tag was selected. The <div id="footer"> # The <div id="inner"> tag was selected. The <div id="footer">
# tag was not. # tag was not.
self.assertSelectsIDs(selected, ['inner']) self.assertSelectsIDs(selected, ['inner', 'data1'])
def test_overspecified_child_id(self): def test_overspecified_child_id(self):
self.assertSelects(".fancy #inner", ['inner']) self.assertSelects(".fancy #inner", ['inner'])
@ -1827,3 +1968,44 @@ class TestSoupSelector(TreeTest):
def test_sibling_combinator_wont_select_same_tag_twice(self): def test_sibling_combinator_wont_select_same_tag_twice(self):
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
# Test the selector grouping operator (the comma)
def test_multiple_select(self):
self.assertSelects('x, y', ['xid', 'yid'])
def test_multiple_select_with_no_space(self):
self.assertSelects('x,y', ['xid', 'yid'])
def test_multiple_select_with_more_space(self):
self.assertSelects('x, y', ['xid', 'yid'])
def test_multiple_select_duplicated(self):
self.assertSelects('x, x', ['xid'])
def test_multiple_select_sibling(self):
self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
def test_multiple_select_tag_and_direct_descendant(self):
self.assertSelects('x, y > z', ['xid', 'zidb'])
def test_multiple_select_direct_descendant_and_tags(self):
self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_multiple_select_indirect_descendant(self):
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
def test_invalid_multiple_select(self):
self.assertRaises(ValueError, self.soup.select, ',x, y')
self.assertRaises(ValueError, self.soup.select, 'x,,y')
def test_multiple_select_attrs(self):
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
def test_multiple_select_ids(self):
self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
def test_multiple_select_nested(self):
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])