Hit a GAE limit at 2k pages. Rewriting... don't use it yet :-)
Demo
You can test the plugin on GAE at http://vsttemp.appspot.com (currently down). The demo searches web2py's epydocs.
Source
Repository: http://bitbucket.org/hc/plugin_h_search
#!/usr/bin/env python
""" Search plugin for web2py (alpha)
Author : Hans Christian von Stockhausen <hc at vst.io>
Date : 2011-01-13
Source : http://bitbucket.org/hc/plugin_h_search
Demo : http://vsttemp.appspot.com (Google Appengine)
License : MIT (see LICENSE)
This module implements a simple search engine for web2py-powered sites. It is
intended for local site search on or off Google Appengine. Results are ranked
by keyword frequency, keyword position relative to the top of the page and
keyword proximity to one another.
The engine is based on "Programming Collective Intelligence" (Ch. 4).
Dependencies
------------
This module requires the following third party modules:
pyporter2 - https://github.com/mdirolf/pyporter2
BeautifulSoup - www.crummy.com/software/BeautifulSoup/
Note: The w2p plugin package already contains these for your convenience.
Example
-------
### models/mymodel.py
search = local_import('plugin_h_search')
search.create_tables(globals())
### controllers/mycontroller.py
import urllib2

plugin_search = local_import('plugin_h_search')  # bound to a name the search() action below will not shadow

@plugin_search.index(globals(), plugin_search.INDEX_DAILY)
def foo():
    " The entire HTML output of this page is indexed at most once a day "
    return dict(message='Hello World')

def bar():
    " Index arbitrary data. Note that no decorator is used "
    url = 'http://www.web2py.com'
    data = urllib2.urlopen(url).read()
    idx = plugin_search.Indexer(globals(), plugin_search.INDEX_WEEKLY)
    idx.add(data, url)
    return 'Indexed %s' % url

@cache(request.vars.get('q', ''), time_expire=3600, cache_model=cache.disk)
def search():
    return dict(results=plugin_search.search(globals(), request.vars.get('q')))
### views/mycontroller/search.html
{{extend 'layout.html'}}
{{for r in results:}}
{{=A(XML(r.title), _href=r.url)}}
{{=XML(r.snippet)}}
{{pass}}
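
### optional: a minimal search form (illustrative sketch only, not part of
### the plugin) could live in layout.html or any view
{{=FORM(INPUT(_name='q', _value=request.vars.q or ''),
        INPUT(_type='submit', _value='Search'),
        _action=URL(r=request, f='search'), _method='get')}}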
"""
# FIXME/TODO
#
# - fix title extracted from HTML (Indexer._process_html)
# - fix HTML stripper (includes too much CSS, etc.) (Indexer._process_html)
# - test with a lot more data on and off GAE
# - add meta tag description to page table
# - implement summary snippet extractor for search results
# - implement additional scoring functionality (page rank, neural net)
# - support asynchronous indexing using GAE task queue/cron?
# - paging
#
import datetime
import itertools
import logging
import operator
import string
import re
from gluon.html import URL
from gluon.sql import Field
from gluon.tools import Storage
from pyporter2 import Stemmer
from BeautifulSoup import BeautifulSoup, Comment
__author__ = 'Hans Christian v. Stockhausen <hc at vst.io>'
INDEX_ALWAYS = 0
INDEX_HOURLY = 3600
INDEX_DAILY = INDEX_HOURLY * 24
INDEX_WEEKLY = INDEX_DAILY * 7
INDEX_MONTHLY = INDEX_DAILY * 30
INDEX_YEARLY = INDEX_DAILY * 365
TABLE_PAGE = 'plugin_h_search_page'
TABLE_INDEX = 'plugin_h_search_index'
STOPWORDS = ["a", "about", "above", "after", "again", "against", "all", "am",
"an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been",
"before", "being", "below", "between", "both", "but", "by", "can't",
"cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't",
"doing", "don't", "down", "during", "each", "few", "for", "from",
"further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
"he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
"him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've",
"if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's",
"me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of",
"off", "on", "once", "only", "or", "other", "ought", "our", "ours ",
"ourselves", "out", "over", "own", "same", "shan't", "she", "she'd",
"she'll", "she's", "should", "shouldn't", "so", "some", "such", "than",
"that", "that's", "the", "their", "theirs", "them", "themselves", "then",
"there", "there's", "these", "they", "they'd", "they'll", "they're",
"they've", "this", "those", "through", "to", "too", "under", "until", "up",
"very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were",
"weren't", "what", "what's", "when", "when's", "where", "where's", "which",
"while", "who", "who's", "whom", "why", "why's", "with", "won't", "would",
"wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours",
"yourself", "yourselves"] # http://www.ranks.nl/resources/stopwords.html
log = logging.getLogger('web2py')
# Python < 2.6 as on GAE requires custom implementation of itertools.product
if not hasattr(itertools, 'product'):
    def product(*args):
        pools = map(tuple, args)
        result = [[]]
        for pool in pools:
            result = [x+[y] for x in result for y in pool]
        for prod in result:
            yield tuple(prod)
    itertools.product = product
class Indexer(object):
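    """ Extracts the title, plain text and stemmed keywords from an HTML page
    and stores them in the plugin's page and index tables, re-indexing only
    once the configured frequency has elapsed. """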
    def __init__(self, globals_, frequency):
        self.request = request = globals_['request']
        self.response = globals_['response']
        self.db = globals_['db']
        self.url = URL(r=request, args=request.args, vars=request.get_vars)
        self.frequency = frequency

    def add(self, data, url=None):
        " Add data to search index "
        if self.request.extension == 'html' or url:
            if url:  # set explicitly when indexing manually (no decorator)
                self.url = url
            if self._index_expired():
                self._process(data)
            else:
                log.info('Not reindexing %s as not yet expired.' % self.url)
        else:
            log.warn('Non-HTML resource at %s not added to index.' % self.url)
    def _process(self, data):
        soup = BeautifulSoup(data)
        # remove <script>, <style> and <!-- comments -->
        for tag in ('script', 'style'):
            for t in soup(tag):
                t.extract()
        for comment in soup.findAll(text=lambda t: isinstance(t, Comment)):
            comment.extract()
        # extract page title and strip newlines, if any
        title = soup.html.head.title.string or 'Untitled'
        title = re.sub('\n', ' ', title).strip()
        # strip HTML and remove newlines
        contents = ' '.join(soup.findAll(text=True))
        contents = re.sub('\n', ' ', contents)
        # split the stripped text (not the raw HTML) into lowercase words
        r = re.compile(r'\W+')
        words = (w.lower() for w in r.split(contents) if w != '')
        # remove stopwords
        words = (w for w in words if w not in STOPWORDS)
        # stem words
        stem = Stemmer.Stemmer('english').stemWord
        words = itertools.imap(stem, words)
        self._update_index(title, contents, words)
    def _index_expired(self):
        db = self.db
        record = db(db[TABLE_PAGE].url==self.url).select().first()
        if record:
            indexed_on = record.indexed_on
            now = datetime.datetime.now()
            delta = datetime.timedelta(seconds=self.frequency)
            has_expired = indexed_on < (now - delta)
        else:
            has_expired = True  # not previously indexed
        return has_expired
    def _update_index(self, title, contents, words):
        db = self.db
        if self.request.env.web2py_runtime_gae:
            # GAE does not cascade deletes, so remove page and index rows explicitly
            for r in db(db[TABLE_PAGE].url==self.url).select():
                db(db[TABLE_PAGE].id==r.id).delete()
                db(db[TABLE_INDEX].ref_page==r.id).delete()
        else:
            # delete cascades to index table automatically on non GAE
            db(db[TABLE_PAGE].url==self.url).delete()
        page_id = db[TABLE_PAGE].insert(url=self.url, title=title,
            contents=contents)
        db[TABLE_INDEX].insert(ref_page=page_id,
            keywords=list(words)[:4990])  # GAE index limit 5000
class Searcher(object):
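    """ Runs keyword queries against the index and yields ranked results as
    Storage objects (title, snippet, url) with the search terms highlighted. """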
    def __init__(self, globals_):
        self.db = globals_['db']

    def search(self, search_query, namespace=None, limit=15):
        results = self._query_index(search_query, namespace)
        if results:
            scores = Scorer(results.values()).score()
            db = self.db
            # rank all matches first, then keep the top `limit` pages
            ranked = sorted(zip(scores, results.keys()), reverse=True)[:limit]
            for score, ref_page in ranked:
                record = db[TABLE_PAGE][ref_page]
                s = Storage()
                s.title = self._highlight_matches(record.title)
                s.snippet = self._get_snippet(record.contents)
                s.url = record.url
                yield s
        # with no matches the generator simply yields nothing
    def _query_index(self, search_query, namespace):
        db = self.db
        stemmer = Stemmer.Stemmer('english')
        self.words = words = [stemmer.stemWord(w) for w in
            search_query.lower().split() if w not in STOPWORDS]
        if words:
            filter_ = db[TABLE_INDEX].keywords.contains
            query = reduce(operator.and_, [filter_(w) for w in words])
            if namespace:
                query &= db[TABLE_INDEX].namespace == namespace
            results = dict()
            for row in db(query).select():
                results[row.ref_page] = [x for x in enumerate(row.keywords)
                    if x[1] in words]  # x is the wordpos tuple (position, word)
            return results
    def _get_snippet(self, text):
        PRECONTEXT = 25
        POSTCONTEXT = 150
        if not hasattr(self, 'regex'):
            # match any search word followed by some text, once per word
            p = (r'(?:%s).+?' % '|'.join(self.words)) * len(self.words)
            p = r'\b[\w\s]{0,%s}' % PRECONTEXT + p + r'.{0,%s}\b' % POSTCONTEXT
            self.regex = re.compile(p, re.I | re.S)
        best_len = sum(map(len, self.words)) + PRECONTEXT + POSTCONTEXT
        try:
            # prefer the candidate snippet closest to the ideal length
            snippet = sorted(self.regex.findall(text),
                key=lambda x: abs(len(x)-best_len))[0][:400]
        except IndexError:
            snippet = ''
        return self._highlight_matches(snippet)

    def _highlight_matches(self, text):
        r = re.compile(r'(%s)' % '|'.join(self.words), re.IGNORECASE)
        return r.subn(r'<b>\1</b>', text)[0]
class Scorer(object):
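    """ Ranks pages by keyword frequency, earliest keyword position and
    keyword proximity; each partial score is normalized to the 0..1 range
    and score() returns their weighted sum per page. """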
    def __init__(self, word_positions):
        self.word_positions = word_positions

    def score(self, frequency_weight=1, position_weight=1, proximity_weight=1):
        return [
            fs*frequency_weight + ps*position_weight + pr*proximity_weight for
            fs, ps, pr in zip(
                self._get_frequency_score(),
                self._get_position_score(),
                self._get_proximity_score())]

    def _get_frequency_score(self):
        " The higher the keyword frequency the better the ranking "
        scores = [len(wp) for wp in self.word_positions]
        return Scorer.normalize_score(scores, high_is_better=True)

    def _get_position_score(self):
        " The sooner keywords appear in the source the better the ranking "
        scores = []
        for wp in self.word_positions:
            min_pos = dict()
            for pos, word in wp:
                min_pos[word] = min(pos, min_pos.get(word, pos))
            scores.append(sum(min_pos.values()))
        return Scorer.normalize_score(scores, high_is_better=False)

    def _get_proximity_score(self):
        " The closer keywords are to one another the better the ranking "
        scores = []
        for wp in self.word_positions:
            positions = dict()
            for pos, word in wp:
                positions.setdefault(word, []).append(pos)
            score = float('inf')
            for p in itertools.product(*positions.values()):
                distance = sum([abs(p[i]-p[i-1]) for i in range(1, len(p))])
                score = min(distance, score)
            scores.append(score)
        return Scorer.normalize_score(scores, high_is_better=False)

    @staticmethod
    def normalize_score(scores, high_is_better):
        " Return ranking scores between 1 (relevant) and 0 "
        small = 0.00001  # used instead of zero to avoid division by zero
        if high_is_better:
            maxscore = max(scores) or small
            normalized = (float(score)/maxscore for score in scores)
        else:
            minscore = float(min(scores))
            normalized = (minscore/max(score, small) for score in scores)
        return normalized
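
# Illustration (hypothetical input): Scorer takes one list of (position, word)
# tuples per candidate page, as built by Searcher._query_index. For
#     Scorer([[(0, 'web2py'), (5, 'search')],
#             [(40, 'web2py'), (90, 'search')]]).score()
# both pages tie on frequency, but the first page ranks higher because its
# keywords appear earlier in the page and closer to one another.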
# -----------------------------------------------------------------------------
# API - use these functions from your model/controller as per module docstring
# -----------------------------------------------------------------------------
def index(globals_, frequency=INDEX_DAILY):
    " Decorator to index an action's output "
    def wrapper(action):
        def f():
            data = globals_['response'].render(action())
            Indexer(globals_, frequency).add(data)
            return data
        return f
    return wrapper

def search(globals_, query, namespace=None):
    " Use from within a controller action to query the index "
    if query:
        return Searcher(globals_).search(query, namespace)
    return []  # empty or missing query: give the view something iterable
def create_tables(globals_):
    " Use in model to define DB tables "
    db = globals_['db']
    request = globals_['request']
    db.define_table(TABLE_PAGE,
        Field('url', required=True),
        Field('indexed_on', 'datetime', default=request.now),
        Field('title'),
        Field('contents', 'text'))
    db.define_table(TABLE_INDEX,
        Field('ref_page', db[TABLE_PAGE]),
        Field('keywords', 'list:string'),
        Field('namespace'))