"""Demonstrate how different parsers parse the same markup. Beautiful Soup can use any of a number of different parsers. Every parser should behave more or less the same on valid markup, and Beautiful Soup's unit tests make sure this is the case. But every parser handles invalid markup differently. Even different versions of the same parser handle invalid markup differently. So instead of unit tests I've created this educational demonstration script. The file demonstration_markup.txt contains many lines of HTML. This script tests each line of markup against every parser you have installed, and prints out how each parser sees that markup. This may help you choose a parser, or understand why Beautiful Soup presents your document the way it does. """ import os import sys from bs4 import BeautifulSoup parsers = ['html.parser'] try: from bs4.builder import _lxml parsers.append('lxml') except ImportError, e: pass try: from bs4.builder import _html5lib parsers.append('html5lib') except ImportError, e: pass class Demonstration(object): def __init__(self, markup): self.results = {} self.markup = markup def run_against(self, *parser_names): uniform_results = True previous_output = None for parser in parser_names: try: soup = BeautifulSoup(self.markup, parser) if markup.startswith("<div>"): # Extract the interesting part output = soup.div else: output = soup except Exception, e: output = "[EXCEPTION] %s" % str(e) self.results[parser] = output if previous_output is None: previous_output = output elif previous_output != output: uniform_results = False return uniform_results def dump(self): print "%s: %s" % ("Markup".rjust(13), self.markup.encode("utf8")) for parser, output in self.results.items(): print "%s: %s" % (parser.rjust(13), output.encode("utf8")) different_results = [] uniform_results = [] print "= Testing the following parsers: %s =" % ", ".join(parsers) print input_file = sys.stdin if sys.stdin.isatty(): for filename in [ "demonstration_markup.txt", os.path.join("scripts", "demonstration_markup.txt")]: if os.path.exists(filename): input_file = open(filename) for markup in input_file: demo = Demonstration(markup.decode("utf8").strip().replace("\\n", "\n")) is_uniform = demo.run_against(*parsers) if is_uniform: uniform_results.append(demo) else: different_results.append(demo) print "== Markup that's handled the same in every parser ==" print for demo in uniform_results: demo.dump() print print "== Markup that's not handled the same in every parser ==" print for demo in different_results: demo.dump() print
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#1 | 26953 | Paul Allen | Move //guest/perforce_software/p4convert to //guest/perforce_software/p4convert/main | ||
//guest/perforce_software/p4convert/docs/beautifulsoup4-4.3.2/scripts/demonstrate_parser_differences.py | |||||
#2 | 14806 | Paul Allen | Update docs and add +w. | ||
#1 | 13920 | Paul Allen | copy part 2 (no errors) | ||
//guest/paul_allen/p4convert-maven/docs/beautifulsoup4-4.3.2/scripts/demonstrate_parser_differences.py | |||||
#1 | 13895 | Paul Allen | Copying using p4convert-docbook | ||
//guest/perforce_software/doc_build/main/beautifulsoup4-4.3.2/scripts/demonstrate_parser_differences.py | |||||
#1 | 12835 | eedwards |
Upgrade BeautifulSoup from 4.1.0 to 4.3.2. This is a drop-in replacement, and there should be no difference in doc indexing results. However, docs can be indexed 2-4 times faster. |