Module.php #1

<?php
/**
 * Integrates the search module with the rest of the application.
 *
 * @copyright   2011 Perforce Software. All rights reserved.
 * @license     Please see LICENSE.txt in top-level folder of this distribution.
 * @version     <release>/<patch>
 */
class Search_Module extends P4Cms_Module_Integration
{
    // the maximum recursive depth for enhancing user queries
    const  MAX_DEPTH                    = 10;

    // the required prefix for wildcard queries to avoid performance problems.
    const   MIN_PREFIX_LENGTH           = 2;
    const   MAX_RESULTS                 = 10000;

    // the QueryParserException error codes
    const  ERROR_TWO_CHARS_LEXEME       = 1;
    const  ERROR_LEXEME_MODIFIER        = 2;

    const  ACTIVE_INDEX_PATH            = 'search-index';
    const  NEW_DOCUMENT_COUNT_FILE      = 'search-newly-added-document.count';

    // The default number of new documents that triggers auto optimization
    const  DEFAULT_MAX_BUFFERED_DOCS    = 10;
    const  DEFAULT_MERGE_FACTOR         = 10;

    /**
     * @var array   the list of existing search instances (Zend_Search_Lucene_Interface objects)
     */
    protected static  $_searchInstances  = array();

    /**
     * Subscribe to the pub/sub topics the search module participates in.
     *
     * The registered handlers:
     *  - add, delete and update documents in the lucene index
     *    (p4cms.search.add, p4cms.search.delete, p4cms.search.update);
     *  - re-register the content grid's 'search' sub-form under the name
     *    'lucene' and turn its submitted query into a 'lucene' filter option
     *    (p4cms.content.grid.form, p4cms.content.grid.populate);
     *  - add the content ids matched by the 'lucene' filter option to a
     *    content record query (p4cms.content.record.query);
     *  - copy the source branch's search index to a newly added branch
     *    (p4cms.site.branch.add.postSubmit).
     */
    public static function load()
    {
        // listen for documents to be indexed.
        P4Cms_PubSub::subscribe('p4cms.search.add',
            function($document)
            {
                // if we don't have a lucene doc, bail out.
                if (!$document = Search_Module::prepareDocument($document)) {
                    return;
                }

                // add the document
                Search_Module::factory()->addDocument($document);
            }
        );

        // listen for documents to be removed from index.
        P4Cms_PubSub::subscribe('p4cms.search.delete',
            function($document)
            {
                // if we don't have a lucene doc, bail out.
                if (!$document = Search_Module::prepareDocument($document)) {
                    return;
                }

                // remove documents with matching key fields.
                $index     = Search_Module::factory();
                $keyFields = array('uri', 'contentId');
                foreach ($keyFields as $keyField) {
                    if (in_array($keyField, $document->getFieldNames())) {

                        // search for existing documents with matching key field.
                        $term = new Zend_Search_Lucene_Index_Term(
                            $document->getFieldValue($keyField),
                            $keyField
                        );

                        // remove matches.
                        foreach ($index->termDocs($term) as $id) {
                            $index->delete($id);
                        }
                    }
                }
            }
        );

        /**
         * listen for documents to be updated in the index.
         *
         * @publishes   p4cms.search.delete
         *              Perform operations when an entry is deleted from the search-index.
         *              Note: Updates to existing entries are accomplished via delete/add.
         *              Zend_Search_Lucene_Document|P4Cms_Content   $document   The entry being
         *                                                                      deleted.
         *
         * @publishes   p4cms.search.add
         *              Perform operations when an entry is added to the search index.
         *              Note: Updates to existing entries are accomplished via delete/add.
         *              Zend_Search_Lucene_Document|P4Cms_Content   $document   The entry being
         *                                                                      added.
         */
        P4Cms_PubSub::subscribe('p4cms.search.update',
            function($document)
            {
                // if we don't have a lucene doc, bail out.
                if (!$document = Search_Module::prepareDocument($document)) {
                    return;
                }

                // lucene does not have a 'update' function, so
                // we publish to the delete and add topics instead.
                P4Cms_PubSub::publish('p4cms.search.delete', $document);
                P4Cms_PubSub::publish('p4cms.search.add',    $document);
            }
        );

        // steal content's search form to use lucene
        P4Cms_PubSub::subscribe('p4cms.content.grid.form',
            function(Zend_Form $form)
            {
                $search = $form->getSubForm('search');
                if (!$search) {
                    return;
                }

                // re-register the same sub-form under the name 'lucene' so its
                // values are read by the populate handler below.
                $form->removeSubForm('search');
                $form->addSubForm($search, 'lucene');
            }
        );

        // filter content list by keyword search.
        P4Cms_PubSub::subscribe('p4cms.content.grid.populate',
            function(P4Cms_Record_Query $query, Zend_Form $form)
            {
                $values = $form->getValues();

                // extract search query.
                $searchQuery = isset($values['lucene']['query'])
                    ? $values['lucene']['query']
                    : null;

                // early exit if no query.
                if (!$searchQuery) {
                    return;
                }

                $filter   = ($query->getFilter()) ?: new P4Cms_Record_Filter;
                $existing = $filter->getOption('lucene');

                // combine the user's query with any keywords already set on the
                // filter. array keywords are appended to (the record.query handler
                // joins them with spaces); string keywords are space-concatenated.
                // note: intersecting the arrays would normally produce an empty
                // list, which the record.query handler treats as 'no filter'.
                if ($existing) {
                    $searchQuery = is_array($existing)
                                 ? array_merge($existing, array($searchQuery))
                                 : $existing . ' ' . $searchQuery;
                }

                $filter->setOption('lucene', $searchQuery);
                $query->setFilter($filter);
            }
        );

        // Allows for filtering a content query by lucene.
        // Used by creating a filter on the query with the 'lucene' option set to a string
        // or array containing keywords.
        P4Cms_PubSub::subscribe('p4cms.content.record.query',
            function(P4Cms_Record_Query $query, P4Cms_Record_Adapter $adapter)
            {
                $filter = $query->getFilter();
                if (!$filter || !$filter instanceof P4Cms_Record_Filter) {
                    return;
                }

                // see if the lucene filter option is set
                $keywords = $filter->getOption('lucene');
                if (!$keywords || (!is_string($keywords) && !is_array($keywords))) {
                    return;
                }

                if (is_array($keywords)) {
                    $keywords = implode(' ', $keywords);
                }

                // collect matching content ids.
                $ids = array();
                foreach (Search_Module::find($keywords) as $result) {
                    $document = $result->getDocument();
                    if (in_array('contentId', $document->getFieldNames())) {
                        $ids[] = $document->contentId;
                    }
                }

                // add content ids to query paths.
                $query->addPaths($ids, true);
            }
        );

        // copy the search index when a new branch is created.
        P4Cms_PubSub::subscribe(
            'p4cms.site.branch.add.postSubmit',
            function($target, $source, $adapter)
            {
                $sourcePath = $source->getDataPath() . '/' . Search_Module::ACTIVE_INDEX_PATH;
                $targetPath = $target->getDataPath() . '/' . Search_Module::ACTIVE_INDEX_PATH;

                // if a search index exists, it means the target branch has previously
                // existed. remove the old search index because the content of this branch
                // now represents the content of the source branch.
                if (is_dir($targetPath)) {
                    P4Cms_FileUtility::deleteRecursive($targetPath);
                }

                // if no existing source index, nothing to do.
                // if we proceeded and took lock on the source directory that creates
                // an empty search index (with a lock file) which breaks lucene.
                if (!is_dir($sourcePath)) {
                    return;
                }

                // lock the source branch's search index so we don't clash with writers.
                $lock = Zend_Search_Lucene_LockManager::obtainReadLock(
                    new Zend_Search_Lucene_Storage_Directory_Filesystem($sourcePath)
                );

                // copy source index files to target. release the lock even if the
                // copy fails so it is not left held after the exception propagates.
                try {
                    P4Cms_FileUtility::copyRecursive($sourcePath, $targetPath);
                } catch (Exception $e) {
                    $lock->unlock();
                    throw $e;
                }

                // all done.
                $lock->unlock();
            }
        );
    }

    /**
     * Run a user supplied search string against the default search index.
     *
     * @param   string  $query  a user provided search string.
     * @return  array   an array of Zend_Search_Lucene_Search_QueryHit objects.
     */
    public static function find($query)
    {
        // obtain the index before building the query: creating the index
        // instance installs the default analyzer and query parser operator
        // (see _getSearchIndex) which query construction depends on.
        $searchIndex = Search_Module::factory();

        return $searchIndex->find(Search_Module::stringToQuery($query));
    }

    /**
     * Get the lucene search instance for a given index folder name.
     * Instances are created on first request and re-used afterwards.
     *
     * @param  string   $indexName              the index folder name
     *                                          (defaults to ACTIVE_INDEX_PATH).
     * @return Zend_Search_Lucene_Interface     search instance for the site index.
     */
    public static function factory($indexName = null)
    {
        $name = $indexName ? $indexName : self::ACTIVE_INDEX_PATH;

        // hand back the cached instance when we already have one.
        if (array_key_exists($name, static::$_searchInstances)) {
            return static::$_searchInstances[$name];
        }

        // first request for this index: open (or create) and remember it.
        static::$_searchInstances[$name] = static::_getSearchIndex($name);

        return static::$_searchInstances[$name];
    }

    /**
     * Check whether a search instance is already cached for the given index folder name.
     *
     * @param   string  $indexName  the name of the search index folder
     *                              (defaults to ACTIVE_INDEX_PATH).
     * @return  boolean             true,  if a Zend_Search_Lucene_Interface instance is cached
     *                              false, otherwise
     */
    public static function hasSearchInstance($indexName = null)
    {
        $name = $indexName ? $indexName : self::ACTIVE_INDEX_PATH;

        return isset(static::$_searchInstances[$name])
            && static::$_searchInstances[$name] instanceof Zend_Search_Lucene_Interface;
    }

    /**
     * Destroy the static search instance references.
     * Intended primarily for testing.
     */
    public static function clearSearchInstances()
    {
        foreach (static::$_searchInstances as $instance) {
            // skip anything that is not an actual lucene index.
            if (!$instance instanceof Zend_Search_Lucene_Interface) {
                continue;
            }

            // explicitly run the index's destructor before dropping the reference.
            $instance->__destruct();
        }

        static::$_searchInstances = array();
    }

    /**
     * Build a lucene query object from a search string.
     *
     * The string is first passed through _enhanceQuery(), then parsed, and
     * the parsed query is added to a boolean query as a required subquery.
     *
     * @param   string  $search     the string based search query.
     * @return  Zend_Search_Lucene_Search_Query     the lucene query object.
     */
    public static function stringToQuery($search)
    {
        $parsed = Zend_Search_Lucene_Search_QueryParser::parse(
            static::_enhanceQuery($search)
        );

        // second argument 'true' marks the user's query as required.
        $boolean = new Zend_Search_Lucene_Search_Query_Boolean();
        $boolean->addSubquery($parsed, true);

        return $boolean;
    }

    /**
     * Enhance a user provided search query to fix common problems.
     *
     * The query is tokenized with Zend's query lexer. If the lexer rejects
     * the query with one of the errors recognised by _getQueryParserError(),
     * the offending character is repaired and the repaired query is processed
     * again recursively. Once tokenization succeeds the query is rebuilt one
     * token fragment at a time, letting _makeWordTokensWild() and
     * _fixMultiWordTokens() rewrite each fragment.
     *
     * The lexer is asked to tokenize the query as UTF-8, so the positions it
     * reports are applied as UTF-8 character offsets (not byte offsets) when
     * slicing the query string.
     *
     * @param   string  $query  the query string to enhance.
     * @param   integer $depth  the recursive depth
     * @return  string  the enhanced query string.
     * @throws  Zend_Search_Lucene_Search_QueryParserException  if the query cannot be tokenized
     *                                                          and the error is unknown to us,
     *                                                          or MAX_DEPTH has been reached.
     */
    protected static function _enhanceQuery($query, $depth = 0)
    {
        // increase the depth
        $depth++;

        // use Zend's lexer for proper parsing of search queries.
        $lexer = new Zend_Search_Lucene_Search_QueryLexer;

        // catch syntax errors known to us and try to help
        try {
            $tokens = $lexer->tokenize($query, 'UTF-8');
        } catch (Zend_Search_Lucene_Search_QueryParserException $e) {
            // re-throw exception if it's too deep
            if ($depth >= self::MAX_DEPTH) {
                throw $e;
            }

            $error = static::_getQueryParserError($e);

            // if we don't know the error, throw it
            if (empty($error)) {
                throw $e;
            }

            // the character to repair sits just before the reported position.
            $position = (int) $error['position'];
            $before   = static::_substrUtf8($query, 0, $position - 1);
            $after    = static::_substrUtf8($query, $position);

            switch ($error['code']) {
                case self::ERROR_TWO_CHARS_LEXEME:
                    // a two-character operator ('&&', '||') was typed once: double it.
                    $repair = str_repeat(static::_substrUtf8($query, $position - 1, 1), 2);
                    break;

                case self::ERROR_LEXEME_MODIFIER:
                    // a dangling modifier ('~', '^'): replace it with a space.
                    $repair = ' ';
                    break;

                default:
                    throw $e;  // re-throw any unknown query parser exceptions
            }

            return static::_enhanceQuery($before . $repair . $after, $depth);
        }

        // look at each token.
        $newQuery   = "";
        $tokenCount = count($tokens);
        for ($i = 0; $i < $tokenCount; $i++) {

            $token     = $tokens[$i];
            $prevToken = isset($tokens[$i-1]) ? $tokens[$i-1] : null;
            $nextToken = isset($tokens[$i+1]) ? $tokens[$i+1] : null;

            // extract portion of query associated with this token: everything
            // from the previous token's position up to this token's position.
            $start        = $prevToken ? $prevToken->position : 0;
            $length       = $token->position - $start;
            $token->query = static::_substrUtf8($query, $start, $length);

            // make word tokens wild by default.
            static::_makeWordTokensWild($token, $prevToken, $nextToken);

            // fix problems with multi-word tokens.
            static::_fixMultiWordTokens($token, $prevToken, $nextToken);

            $newQuery .= $token->query;
        }

        return $newQuery;
    }

    /**
     * Character-based (UTF-8) equivalent of substr() used to slice query
     * strings at the positions reported by the query lexer.
     *
     * @param   string          $string     the string to slice.
     * @param   integer         $start      the character offset to start at.
     * @param   integer|null    $length     the number of characters to take,
     *                                      null for 'up to the end of the string'.
     * @return  string  the requested portion, or an empty string if there is none.
     */
    protected static function _substrUtf8($string, $start, $length = null)
    {
        // iconv_substr has no 'to the end' default combined with an explicit
        // charset on older PHP versions, so pass the full length explicitly.
        if ($length === null) {
            $length = iconv_strlen($string, 'UTF-8');
        }

        // iconv_substr may return false where an empty string is meant.
        return (string) iconv_substr($string, $start, $length, 'UTF-8');
    }

    /**
     * Work-around poor handling of multiple-word terms.
     *
     * Terms such as foo-bar and joe's are split into several words, which
     * makes them match more documents than the user likely intends and
     * makes them incompatible with wildcard and fuzzy searches.
     *
     * Wrapping such a term in quotes avoids these problems and appears to be
     * the least intrusive change we can make to the user's query.
     *
     * @param   Zend_Search_Lucene_Search_QueryToken    $token      the token to examine for repair.
     * @param   Zend_Search_Lucene_Search_QueryToken    $prevToken  optional - the previous token if there is one.
     * @param   Zend_Search_Lucene_Search_QueryToken    $nextToken  optional - the next token if there is one.
     */
    protected static function _fixMultiWordTokens(
        Zend_Search_Lucene_Search_QueryToken $token,
        Zend_Search_Lucene_Search_QueryToken $prevToken = null,
        Zend_Search_Lucene_Search_QueryToken $nextToken = null)
    {
        // anything other than a word token is left untouched.
        if ($token->type !== Zend_Search_Lucene_Search_QueryToken::TT_WORD) {
            return;
        }

        // drop wildcard characters, then let the default analyzer split
        // what remains into its individual parts.
        $bare     = preg_replace('/[\*\?]/', '', $token->text);
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $parts    = $analyzer->tokenize($bare);

        // a single part (or none) needs no repair.
        if (count($parts) <= 1) {
            return;
        }

        // several parts: quote the query fragment of this token.
        $token->query = '"' . $token->query . '"';
    }

    /**
     * Append a trailing wildcard to word tokens so that a search for 'foo'
     * also matches 'foobar'. Users get this automatically without having to
     * type the wildcard themselves.
     *
     * The wildcard is only added when the token's query fragment is at least
     * MIN_PREFIX_LENGTH long; trailing wildcards already present are collapsed
     * into a single one.
     *
     * @param   Zend_Search_Lucene_Search_QueryToken    $token      the token to examine.
     * @param   Zend_Search_Lucene_Search_QueryToken    $prevToken  optional - the previous token if there is one.
     * @param   Zend_Search_Lucene_Search_QueryToken    $nextToken  optional - the next token if there is one.
     */
    protected static function _makeWordTokensWild(
        Zend_Search_Lucene_Search_QueryToken $token,
        Zend_Search_Lucene_Search_QueryToken $prevToken = null,
        Zend_Search_Lucene_Search_QueryToken $nextToken = null)
    {
        $isWord = ($token->type === Zend_Search_Lucene_Search_QueryToken::TT_WORD);

        // word tokens with a long enough query fragment get exactly one
        // trailing wildcard; everything else is left as is.
        if ($isWord && strlen($token->query) >= static::MIN_PREFIX_LENGTH) {
            $token->query = rtrim($token->query, '*') . "*";
        }
    }

    /**
     * Inspect a QueryParserException message and, for the errors we know
     * how to repair, extract an error code and the reported position.
     *
     * @param Zend_Search_Lucene_Search_QueryParserException $e  the exception
     * @return array   the error code and position in the following format:
     *                     array('code' => 1, 'position' => 2)
     *                 (the position is the digit string captured from the message);
     *                 array is empty if we don't know the error.
     */
    protected static function _getQueryParserError(
        Zend_Search_Lucene_Search_QueryParserException $e
    )
    {
        $message = $e->getMessage();

        // message patterns of the errors we handle, keyed by our error code.
        $patterns = array(
            // Two chars lexeme -- '&&', '||' -- error
            self::ERROR_TWO_CHARS_LEXEME => '/Two chars lexeme expected. Position is ([0-9]+)./',

            // Lexeme modifier char error -- '~' and '^'
            self::ERROR_LEXEME_MODIFIER  => '/Lexeme modifier character can be followed'
                                          . ' only by number, white space or query syntax'
                                          . ' element. Position is ([0-9]+)./',
        );

        // the first pattern that matches decides the error code.
        foreach ($patterns as $code => $pattern) {
            if (preg_match($pattern, $message, $matches)) {
                return array(
                    'code'     => $code,
                    'position' => $matches[1]
                );
            }
        }

        // not an error we know about.
        return array();
    }

    /**
     * Get a Zend_Search_Lucene instance for the named index of the active
     * site. An existing index is opened; otherwise a new one is created.
     *
     * As part of this, the global lucene defaults used by this module are
     * installed (file permissions, result set limit, minimum wildcard prefix
     * length, default analyzer and default query operator).
     *
     * @param  string $index       the name of the search index
     *                              (also the folder name).
     * @return Zend_Search_Lucene_Interface   a search instance
     * @throws Zend_Search_Exception          if $index is not a non-empty string.
     */
    protected static function _getSearchIndex($index)
    {
        // without a non-empty string name we cannot locate a search index.
        if (!is_string($index) || $index === '') {
            throw new Zend_Search_Exception(
                'Require a directory to fetch a Search index.'
            );
        }

        // give R/W only for current user and group
        Zend_Search_Lucene_Storage_Directory_Filesystem::setDefaultFilePermissions(0660);

        // limit the size of a result set and require a minimum number of
        // characters before a wildcard in a query; both help to avoid
        // performance problems resulting from queries that are too broad.
        Zend_Search_Lucene::setResultSetLimit(static::MAX_RESULTS);
        Zend_Search_Lucene_Search_Query_Wildcard::setMinPrefixLength(static::MIN_PREFIX_LENGTH);

        // use 'UTF8num' analyzer so words with numbers embedded will
        // be treated as a single token (otherwise considered multi-word).
        Zend_Search_Lucene_Analysis_Analyzer::setDefault(
            new P4Cms_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive
        );

        // make space imply AND instead of OR.
        Zend_Search_Lucene_Search_QueryParser::setDefaultOperator(
            Zend_Search_Lucene_Search_QueryParser::B_AND
        );

        // the index lives in a folder below the active site's data path.
        $path = P4Cms_Site::fetchActive()->getDataPath() . '/' . $index;

        $searchInstance = file_exists($path)
            ? Zend_Search_Lucene::open($path)
            : Zend_Search_Lucene::create($path);

        // apply performance tunables (configured values or their defaults).
        $searchInstance->setMaxBufferedDocs((int) Search_Module::getMaxBufferedDocs());
        $searchInstance->setMaxMergeDocs((int) Search_Module::getMaxMergeDocs());
        $searchInstance->setMergeFactor((int) Search_Module::getMergeFactor());

        return $searchInstance;
    }

    /**
     * Read a single option from the module's configuration.
     *
     * @param string $option  the name of the config option
     * @return mixed          the value of the option,
     *                        null, if the option does not exist
     */
    public static function getOption($option)
    {
        $config = self::getConfig();

        // work with a plain array when the config is a Zend_Config object.
        $options = ($config instanceof Zend_Config) ? $config->toArray() : $config;

        return isset($options[$option]) ? $options[$option] : null;
    }

    /**
     * Attempts to turn the given 'document' into a lucene document object.
     * Objects providing a toLuceneDocument method are converted through it;
     * anything else starts out as an empty lucene document that subscribers
     * of the prepareDocument topic may populate.
     *
     * @param   mixed   $document                   the input document to normalize to lucene
     * @return  Zend_Search_Lucene_Document|false   a lucene document object or false if we
     *                                              can't convert the input to lucene.
     *
     * @publishes   p4cms.search.prepareDocument
     *              Return the passed document after making any necessary modifications for proper
     *              indexing. Subscribers can adjust values or take responsibility for converting
     *              the document to Lucene Document format so it can be successfully indexed.
     *              Zend_Search_Lucene_Document|mixed   $document   The document to prepare for
     *                                                              indexing.
     *              mixed                               $original   The original value passed to
     *                                                              'prepareDocument'
     */
    public static function prepareDocument($document)
    {
        // keep the caller's value around; subscribers receive it as well.
        $original = $document;

        // let the object convert itself if it knows how to. a failed
        // conversion is logged and leaves the document unconverted.
        $canConvert = is_object($document) && method_exists($document, 'toLuceneDocument');
        if ($canConvert) {
            try {
                $document = $document->toLuceneDocument();
            } catch (Exception $e) {
                P4Cms_Log::logException(
                    "Failed to create Lucene document.",
                    $e
                );
            }
        }

        // still no lucene doc? start from an empty one.
        if (!$document instanceof Zend_Search_Lucene_Document) {
            $document = new Zend_Search_Lucene_Document;
        }

        // allow third-parties to influence how document is prepared for index.
        // this uses the 'filter' technique of pub/sub: the first argument given
        // to each subscriber is the return value of the previous one.
        $document = P4Cms_PubSub::filter(
            'p4cms.search.prepareDocument',
            $document,
            $original
        );

        // only a lucene document with at least one field can be indexed.
        $isUsable = $document instanceof Zend_Search_Lucene_Document
                 && count($document->getFieldNames());

        return $isUsable ? $document : false;
    }

    /**
     * Get the maximum number of documents buffered in memory at one time.
     *
     * @return  mixed   the configured 'maxBufferedDocs' option if it is set to a
     *                  non-empty value, otherwise DEFAULT_MAX_BUFFERED_DOCS.
     */
    public static function getMaxBufferedDocs()
    {
        // look the option up once; fall back to the default when unset or empty.
        return Search_Module::getOption('maxBufferedDocs') ?: self::DEFAULT_MAX_BUFFERED_DOCS;
    }

    /**
     * Get the maximum number of documents merged into an index segment by auto-optimization.
     *
     * @return  mixed   the configured 'maxMergeDocs' option if it is set to a
     *                  non-empty value, otherwise PHP_INT_MAX.
     */
    public static function getMaxMergeDocs()
    {
        // look the option up once; fall back to PHP_INT_MAX when unset or empty.
        return Search_Module::getOption('maxMergeDocs') ?: PHP_INT_MAX;
    }

    /**
     * Get the Merge Factor.
     *
     * @return  mixed   the configured 'mergeFactor' option if it is set to a
     *                  non-empty value, otherwise DEFAULT_MERGE_FACTOR.
     */
    public static function getMergeFactor()
    {
        // look the option up once; fall back to the default when unset or empty.
        return Search_Module::getOption('mergeFactor') ?: self::DEFAULT_MERGE_FACTOR;
    }
}
#	Change	User	Description
#1	16170	perforce_software	Move Chronicle files to follow new path scheme for branching.
//guest/perforce_software/chronicle/sites/all/modules/search/Module.php
#1	8972	Matt Attaway	Initial add of the Chronicle source code