indexer.py #3

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys, getopt, json, os.path
from pprint import pprint as pp

# import BeautifulSoup relative to where this script exists
include_path = os.path.dirname(os.path.abspath(os.path.dirname(sys.argv[0])))
sys.path.append(include_path + "/beautifulsoup4-4.3.2")
from bs4 import BeautifulSoup

def usage(level):
    """Print the command-line help text and terminate the script.

    level: the exit status handed to sys.exit() (0 for an explicit help
           request, non-zero when the command line was invalid).

    Never returns; sys.exit() raises SystemExit.
    """
    # The help text mirrors what the option parsing really accepts: only -h
    # and -i are options, and every remaining argument is a document to index.
    print("""
indexer.py [-h] -i <index file> <document file> [<document file> ...]
-h: this help
-i: the index file to write, in JSON format. Created if it does not exist,
    overwritten if it does.
<document file>: an HTML document to index, with its content in the element
    whose id is 'content'. Several documents may be given; their base
    names must be unique.
""")
    sys.exit(level)


class IndexerException(Exception):
    """Error raised by the indexer for input it refuses to process.

    In this script it is raised when the same document name is given twice
    and when a document has no content. The constructor argument is kept
    on the ``value`` attribute, and str() of the exception is repr(value).
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        # Same text as repr(self.value), produced through string formatting.
        return "{!r}".format(self.value)

# ========================================================================

# Command-line handling. Options: -h (help) and -i <index file>. Every
# remaining positional argument is the path of a document to index.

filenames = []     # base names seen so far, used to reject duplicates
documents = []     # one {"name": ..., "path": ...} record per document
index_file = ''    # path of the JSON index to write (from -i)
index = {}         # word -> {document number -> [word positions]}

try:
    options, args = getopt.getopt(sys.argv[1:], "hi:")
except getopt.GetoptError as err:
    # Report which option was rejected; the usage text alone does not say.
    print("Error: {}".format(err))
    usage(2)

for opt, arg in options:
    if opt == "-h":
        usage(0)
    elif opt == "-i":
        index_file = arg

for arg in args:
    # Documents are identified by base name, so the same base name may not
    # be given twice, even from different directories.
    filename = os.path.basename(arg.strip("/\\"))
    if filename in filenames:
        raise IndexerException(
            "{} already specified".format(filename)
        )
    filenames.append(filename)
    documents.append({"name": filename, "path": arg})

if not documents:
    print("Error: No document filename specified")
    usage(1)

if not index_file:
    print("Error: No index filename specified")
    usage(1)

# Build the word index for every document and write it out as JSON.
#
# Output layout:
#   "f": the list of document records (name, path, title)
#   "i": word -> {document number -> [positions of the word in the document]}
# A position is the token's ordinal among the whitespace-separated tokens of
# the document's content text, counted before empty tokens are discarded.

# Characters trimmed from both ends of each token before it is indexed:
# space, common punctuation, curly double quotes and the copyright sign.
strip_chars = u" .,:;(){}[]?|\u201c\u201d\xa9"

for d, doc in enumerate(documents):
    print("Indexing {}".format(doc["path"]))

    # BeautifulSoup reads the file while constructing the soup, so the
    # handle can be closed as soon as the constructor returns.
    with open(doc["path"]) as html_file:
        soup = BeautifulSoup(html_file)

    if soup.title is None:
        raise IndexerException("{} has no title".format(doc["path"]))
    # get_text() returns a plain string; keeping the NavigableString from
    # .string would hold a reference to the whole parse tree of every
    # document until the script exits.
    doc["title"] = soup.title.get_text()

    # A document without a '#content' element is reported the same way as
    # one whose content element is empty.
    content = soup.find(id="content")
    text = content.get_text() if content is not None else ""
    if not text:
        raise IndexerException("{} has no content".format(doc["path"]))

    for pos, w in enumerate(text.lower().split()):
        # Trim surrounding punctuation, then drop all straight quotes,
        # including those inside the token.
        word = w.strip(strip_chars).replace('"', '').replace("'", "")
        if not word:
            continue

        index.setdefault(word, {}).setdefault(d, []).append(pos)

index_data = {
    "f": documents,
    "i": index,
}
with open(index_file, "w") as zFile:
    json.dump(index_data, zFile)

#	Change	User	Description
#3	12835	eedwards	Upgrade BeautifulSoup from 4.1.0 to 4.3.2. This is a drop-in replacement, and there should be no difference in doc indexing results. However, docs can be indexed 2-4 times faster.
#2	12738	eedwards	Infrastructure updates to support new Swarm DocBook documentation: - Strip out XSLTHL; it does a poor job of highlighting the kinds of syntax included in our documentation, and it does an awful job when code to be highlighted contains other markup, such as xref, link, or replaceable. - Add in google-code-prettify (from Swarm) so that generated HTML has fairly decent syntax highlighting for many of the code examples we use. Further work on syntax highlighting, particularly for PDF output, is required. - Disable TOC generation in HTML; the HTML template already includes the full TOC as a sidebar on every page. - Update the xmlns attribute and image URL filtering. - Fix a bug in HTML page title XSL that often prevented the current section title from being included. This was more obvious in the Swarm documentation which chunks into pages by section, rather than by chapter. - Update the HTML indexer to process multiple HTML files in one pass for performance, and overall simplify the code structure. - Adjust sidebar TOC activation logic to allow Bootstrap's scrollSpy code to work with section-based chunking. - Re-introduce the "brand" markup in the HTML doc heading so that Swarm's documentation can include its logo. Other guides can do so if required, but none have been updated to do so yet. - Add facility for guide-specific custom JavaScript inclusion. - Add facility to apply custom CSS classes to images in documentation. This is used in Swarm documentation to provide thumbnail, popup, and "framed" images. - Adjust xref and link text to just report the chapter or section name, without the preamble text, for example: "the section called". - Add <replaceable role="bold"> to provide a second kind of emphasis to programlistings. - Added a subtle color to <filename> content, to help distinguish from other literal text. - Removed the separator rule from the first section heading while chunking by section. 
- Added Swarm's orderedlist styling so that each numbered item has a circle around the number and a vertical bar indicating the extent of the item. - Added a visual indicator when a link points to an external document. - Rewrote the JavaScript search code to search for exact and fuzzy term matches, as well as phrases. Relevance scoring is included, but likely needs some fine tuning to suit our documentation.
#1	12734	eedwards	Replace the Java-based HTML indexer with a Python-based one. The Java-based HTML indexer was based on stemming, which can reduce the size of the index notably. However, stemming produces poor search behaviour, and is language specific. For example, the word 'documentation' has a stem of 'document'. When a user searches for 'docu', 'document' is not found. Also, all punctuation was stripped, making searches for IP addresses or depot paths impossible. The Python-based indexer collects all whitespace-separated tokens. The resulting index is around 10% larger than the stemmed index, but permits much more reasonable results, particularly while the user is still typing in search terms. The Python-based indexer should be cross-platform, but has to date only been tested on Mac OSX. It should work as-is on Linux, but further work may be required on Windows or other platforms. Also, there is room for optimization, particularly when a large number of HTML documents are to be indexed. Searching multiple terms is possible, and each HTML page must match all entered terms. There is nothing fancy about handling the search terms, no conjunctions, phrase searching, etc. Depending on user feedback, we may need to add more sophistication in the future. Included in this change: - the Python indexer - removed use of Java-based indexer from common build.xml and applied the Python indexer - added additional image src filtering - better build commentary in certain targets - rewrote the client-side searching - fixed the content of the HTML <title> tags to be '<chapter> // <book>', or just '<book>' when necessary. Search results strip off the ' // <book>' to be concise. - identify results as 'pages' rather than results, as each page may contain multiple matches. - results now indicate how many matches to expect on each page. - re-enabled indexing/search for CmdRef, P4API, P4Dist, and P4Guide.