Hit a GAE limit at 2k pages. Rewriting... don't use it yet :-)
Demo
You can test the plugin on GAE at http://vsttemp.appspot.com (currently down). The demo searches web2py's epydocs.
Source
Repository: http://bitbucket.org/hc/plugin_h_search
#!/usr/bin/env python
""" Search plugin for web2py (alpha)
Author : Hans Christian von Stockhausen <hc at vst.io>
Date : 2011-01-13
Source : http://bitbucket.org/hc/plugin_h_search
Demo : http://vsttemp.appspot.com (Google Appengine)
License : MIT (see LICENSE)
This module implements a simple search engine for web2py-powered sites. It is
intended for local site search on or off Google Appengine. Results are ranked
by keyword frequency, keyword position relative to the top of the page and
keyword proximity to one another.
The engine is based on "Programming Collective Intelligence" (Ch. 4).
Dependencies
------------
This module requires the following third party modules:
pyporter2 - https://github.com/mdirolf/pyporter2
BeautifulSoup - www.crummy.com/software/BeautifulSoup/
Note: The w2p plugin package already contains these for your convenience.
Example
-------
### models/mymodel.py
search = local_import('plugin_h_search')
search.create_tables(globals())
### controllers/mycontroller.py
import urllib2

plugin_search = local_import('plugin_h_search')  # bound to a name the search() action below will not shadow

@plugin_search.index(globals(), plugin_search.INDEX_DAILY)
def foo():
    " The entire HTML output of this page is indexed at most once a day "
    return dict(message='Hello World')

def bar():
    " Index arbitrary data. Note that no decorator is used "
    url = 'http://www.web2py.com'
    data = urllib2.urlopen(url).read()
    idx = plugin_search.Indexer(globals(), plugin_search.INDEX_WEEKLY)
    idx.add(data, url)
    return 'Indexed %s' % url

@cache(request.vars.get('q', ''), time_expire=3600, cache_model=cache.disk)
def search():
    return dict(results=plugin_search.search(globals(), request.vars.get('q')))
### views/mycontroller/search.html
{{extend 'layout.html'}}
{{for r in results:}}
{{=A(XML(r.title), _href=r.url)}}
{{=XML(r.snippet)}}
{{pass}}
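
### optional: a minimal search form (illustrative sketch only, not part of
### the plugin) could live in layout.html or any view
{{=FORM(INPUT(_name='q', _value=request.vars.q or ''),
        INPUT(_type='submit', _value='Search'),
        _action=URL(r=request, f='search'), _method='get')}}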
"""
# FIXME/TODO
#
# - fix title extracted from HTML (Indexer._process_html)
# - fix HTML stripper (includes too much CSS, etc.) (Indexer._process_html)
# - test with a lot more data on and off GAE
# - add meta tag description to page table
# - implement summary snippet extractor for search results
# - implement additional scoring functionality (page rank, neural net)
# - support asynchronous indexing using GAE task queue/cron?
# - paging
#
import datetime
import itertools
import logging
import operator
import string
import re
from gluon.html import URL
from gluon.sql import Field
from gluon.tools import Storage
from pyporter2 import Stemmer
from BeautifulSoup import BeautifulSoup, Comment
__author__ = 'Hans Christian v. Stockhausen <hc at vst.io>'
INDEX_ALWAYS = 0
INDEX_HOURLY = 3600
INDEX_DAILY = INDEX_HOURLY * 24
INDEX_WEEKLY = INDEX_DAILY * 7
INDEX_MONTHLY = INDEX_DAILY * 30
INDEX_YEARLY = INDEX_DAILY * 365
TABLE_PAGE = 'plugin_h_search_page'
TABLE_INDEX = 'plugin_h_search_index'
STOPWORDS = ["a", "about", "above", "after", "again", "against", "all", "am",
"an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been",
"before", "being", "below", "between", "both", "but", "by", "can't",
"cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't",
"doing", "don't", "down", "during", "each", "few", "for", "from",
"further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
"he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
"him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've",
"if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's",
"me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of",
"off", "on", "once", "only", "or", "other", "ought", "our", "ours ",
"ourselves", "out", "over", "own", "same", "shan't", "she", "she'd",
"she'll", "she's", "should", "shouldn't", "so", "some", "such", "than",
"that", "that's", "the", "their", "theirs", "them", "themselves", "then",
"there", "there's", "these", "they", "they'd", "they'll", "they're",
"they've", "this", "those", "through", "to", "too", "under", "until", "up",
"very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were",
"weren't", "what", "what's", "when", "when's", "where", "where's", "which",
"while", "who", "who's", "whom", "why", "why's", "with", "won't", "would",
"wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours",
"yourself", "yourselves"] # http://www.ranks.nl/resources/stopwords.html
log = logging.getLogger('web2py')
# Python < 2.6 as on GAE requires custom implementation of itertools.product
if not hasattr(itertools, 'product'):
    def product(*args):
        pools = map(tuple, args)
        result = [[]]
        for pool in pools:
            result = [x+[y] for x in result for y in pool]
        for prod in result:
            yield tuple(prod)
    itertools.product = product
class Indexer(object):
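    """ Extracts the title, plain text and stemmed keywords from an HTML page
    and stores them in the plugin's page and index tables, re-indexing only
    once the configured frequency has elapsed. """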
    def __init__(self, globals_, frequency):
        self.request = request = globals_['request']
        self.response = globals_['response']
        self.db = globals_['db']
        self.url = URL(r=request, args=request.args, vars=request.get_vars)
        self.frequency = frequency

    def add(self, data, url=None):
        " Add data to search index "
        if self.request.extension == 'html' or url:
            if url:  # set explicitly when indexing manually (no decorator)
                self.url = url
            if self._index_expired():
                self._process(data)
            else:
                log.info('Not reindexing %s as not yet expired.' % self.url)
        else:
            log.warn('Non-HTML resource at %s not added to index.' % self.url)
    def _process(self, data):
        soup = BeautifulSoup(data)
        # remove <script>, <style> and <!-- comments -->
        for tag in ('script', 'style'):
            for t in soup(tag):
                t.extract()
        for comment in soup.findAll(text=lambda t: isinstance(t, Comment)):
            comment.extract()
        # extract page title and strip newlines, if any
        title = soup.html.head.title.string or 'Untitled'
        title = re.sub('\n', ' ', title).strip()
        # strip HTML and remove newlines
        contents = ' '.join(soup.findAll(text=True))
        contents = re.sub('\n', ' ', contents)
        # split the stripped text (not the raw HTML) into lowercase words
        r = re.compile(r'\W+')
        words = (w.lower() for w in r.split(contents) if w != '')
        # remove stopwords
        words = (w for w in words if w not in STOPWORDS)
        # stem words
        stem = Stemmer.Stemmer('english').stemWord
        words = itertools.imap(stem, words)
        self._update_index(title, contents, words)
    def _index_expired(self):
        db = self.db
        record = db(db[TABLE_PAGE].url==self.url).select().first()
        if record:
            indexed_on = record.indexed_on
            now = datetime.datetime.now()
            delta = datetime.timedelta(seconds=self.frequency)
            has_expired = indexed_on < (now - delta)
        else:
            has_expired = True  # not previously indexed
        return has_expired
    def _update_index(self, title, contents, words):
        db = self.db
        if self.request.env.web2py_runtime_gae:
            # GAE does not cascade deletes, so remove page and index rows explicitly
            for r in db(db[TABLE_PAGE].url==self.url).select():
                db(db[TABLE_PAGE].id==r.id).delete()
                db(db[TABLE_INDEX].ref_page==r.id).delete()
        else:
            # delete cascades to index table automatically on non GAE
            db(db[TABLE_PAGE].url==self.url).delete()
        page_id = db[TABLE_PAGE].insert(url=self.url, title=title,
            contents=contents)
        db[TABLE_INDEX].insert(ref_page=page_id,
            keywords=list(words)[:4990])  # GAE index limit 5000
class Searcher(object):
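    """ Runs keyword queries against the index and yields ranked results as
    Storage objects (title, snippet, url) with the search terms highlighted. """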
    def __init__(self, globals_):
        self.db = globals_['db']

    def search(self, search_query, namespace=None, limit=15):
        results = self._query_index(search_query, namespace)
        if results:
            scores = Scorer(results.values()).score()
            db = self.db
            # rank all matches first, then keep the top `limit` pages
            ranked = sorted(zip(scores, results.keys()), reverse=True)[:limit]
            for score, ref_page in ranked:
                record = db[TABLE_PAGE][ref_page]
                s = Storage()
                s.title = self._highlight_matches(record.title)
                s.snippet = self._get_snippet(record.contents)
                s.url = record.url
                yield s
        # with no matches the generator simply yields nothing
    def _query_index(self, search_query, namespace):
        db = self.db
        stemmer = Stemmer.Stemmer('english')
        self.words = words = [stemmer.stemWord(w) for w in
            search_query.lower().split() if w not in STOPWORDS]
        if words:
            filter_ = db[TABLE_INDEX].keywords.contains
            query = reduce(operator.and_, [filter_(w) for w in words])
            if namespace:
                query &= db[TABLE_INDEX].namespace == namespace
            results = dict()
            for row in db(query).select():
                results[row.ref_page] = [x for x in enumerate(row.keywords)
                    if x[1] in words]  # x is the wordpos tuple (position, word)
            return results
    def _get_snippet(self, text):
        PRECONTEXT = 25
        POSTCONTEXT = 150
        if not hasattr(self, 'regex'):
            # match any search word followed by some text, once per word
            p = (r'(?:%s).+?' % '|'.join(self.words)) * len(self.words)
            p = r'\b[\w\s]{0,%s}' % PRECONTEXT + p + r'.{0,%s}\b' % POSTCONTEXT
            self.regex = re.compile(p, re.I | re.S)
        best_len = sum(map(len, self.words)) + PRECONTEXT + POSTCONTEXT
        try:
            # prefer the candidate snippet closest to the ideal length
            snippet = sorted(self.regex.findall(text),
                key=lambda x: abs(len(x)-best_len))[0][:400]
        except IndexError:
            snippet = ''
        return self._highlight_matches(snippet)

    def _highlight_matches(self, text):
        r = re.compile(r'(%s)' % '|'.join(self.words), re.IGNORECASE)
        return r.subn(r'<b>\1</b>', text)[0]
class Scorer(object):
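    """ Ranks pages by keyword frequency, earliest keyword position and
    keyword proximity; each partial score is normalized to the 0..1 range
    and score() returns their weighted sum per page. """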
    def __init__(self, word_positions):
        self.word_positions = word_positions

    def score(self, frequency_weight=1, position_weight=1, proximity_weight=1):
        return [
            fs*frequency_weight + ps*position_weight + pr*proximity_weight for
            fs, ps, pr in zip(
                self._get_frequency_score(),
                self._get_position_score(),
                self._get_proximity_score())]

    def _get_frequency_score(self):
        " The higher the keyword frequency the better the ranking "
        scores = [len(wp) for wp in self.word_positions]
        return Scorer.normalize_score(scores, high_is_better=True)

    def _get_position_score(self):
        " The sooner keywords appear in the source the better the ranking "
        scores = []
        for wp in self.word_positions:
            min_pos = dict()
            for pos, word in wp:
                min_pos[word] = min(pos, min_pos.get(word, pos))
            scores.append(sum(min_pos.values()))
        return Scorer.normalize_score(scores, high_is_better=False)

    def _get_proximity_score(self):
        " The closer keywords are to one another the better the ranking "
        scores = []
        for wp in self.word_positions:
            positions = dict()
            for pos, word in wp:
                positions.setdefault(word, []).append(pos)
            score = float('inf')
            for p in itertools.product(*positions.values()):
                distance = sum([abs(p[i]-p[i-1]) for i in range(1, len(p))])
                score = min(distance, score)
            scores.append(score)
        return Scorer.normalize_score(scores, high_is_better=False)

    @staticmethod
    def normalize_score(scores, high_is_better):
        " Return ranking scores between 1 (relevant) and 0 "
        small = 0.00001  # used instead of zero to avoid division by zero
        if high_is_better:
            maxscore = max(scores) or small
            normalized = (float(score)/maxscore for score in scores)
        else:
            minscore = float(min(scores))
            normalized = (minscore/max(score, small) for score in scores)
        return normalized
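
# Illustration (hypothetical input): Scorer takes one list of (position, word)
# tuples per candidate page, as built by Searcher._query_index. For
#     Scorer([[(0, 'web2py'), (5, 'search')],
#             [(40, 'web2py'), (90, 'search')]]).score()
# both pages tie on frequency, but the first page ranks higher because its
# keywords appear earlier in the page and closer to one another.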
# -----------------------------------------------------------------------------
# API - use these functions from your model/controller as per module docstring
# -----------------------------------------------------------------------------
def index(globals_, frequency=INDEX_DAILY):
    " Decorator to index an action's output "
    def wrapper(action):
        def f():
            data = globals_['response'].render(action())
            Indexer(globals_, frequency).add(data)
            return data
        return f
    return wrapper

def search(globals_, query, namespace=None):
    " Use from within a controller action to query the index "
    if query:
        return Searcher(globals_).search(query, namespace)
    return []  # empty or missing query: give the view something iterable
def create_tables(globals_):
    " Use in model to define DB tables "
    db = globals_['db']
    request = globals_['request']
    db.define_table(TABLE_PAGE,
        Field('url', required=True),
        Field('indexed_on', 'datetime', default=request.now),
        Field('title'),
        Field('contents', 'text'))
    db.define_table(TABLE_INDEX,
        Field('ref_page', db[TABLE_PAGE]),
        Field('keywords', 'list:string'),
        Field('namespace'))