# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
"""
The classes in this module encode and decode posting information for a field.
The field format essentially determines what information is stored about each
occurrence of a term.
"""
from collections import defaultdict
from pickle import dumps, loads
from whoosh.analysis import entoken, unstopped
from whoosh.system import (
_FLOAT_SIZE,
_INT_SIZE,
emptybytes,
pack_float,
pack_uint,
unpack_float,
unpack_uint,
)
# Format base class
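# The Format base class definition is not included in this excerpt. The sketch
# below is a minimal stand-in (an assumption, not the verbatim original) for
# the interface the concrete classes rely on: a field_boost attribute set in
# __init__, a word_values() hook, and supports()/decoder()/decode_as(), which
# dispatch to the decode_* methods defined by each subclass.
class Format:
    """Abstract base class representing a storage format for a field or
    vector. A Format decides what information (frequency, positions,
    characters, boosts) is stored about each occurrence of a term and how it
    is encoded into the posting value.
    """

    # Size in bytes of an encoded posting value, or -1 if the size varies.
    posting_size = -1

    def __init__(self, field_boost=1.0, **options):
        self.field_boost = field_boost
        self.options = options

    def __repr__(self):
        return "%s(boost=%s)" % (self.__class__.__name__, self.field_boost)

    def fixed_value_size(self):
        # Return the fixed value size, or None if values are variable-length.
        return None if self.posting_size < 0 else self.posting_size

    def word_values(self, value, analyzer, **kwargs):
        # Subclasses yield (word, frequency, weight, encoded_value) tuples.
        raise NotImplementedError

    def supports(self, name):
        # True if this format can decode the named information, e.g.
        # fmt.supports("positions").
        return hasattr(self, "decode_" + name)

    def decoder(self, name):
        # Return the bound decode_<name> method.
        return getattr(self, "decode_" + name)

    def decode_as(self, astype, valuestring):
        # Decode the value string as the named kind of information.
        return self.decoder(astype)(valuestring)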
# Concrete field classes
# TODO: As a legacy behavior, most of these formats store the frequency but not
# the weight in the value string, so if you use field or term boosts,
# postreader.value_as("weight") will not match postreader.weight()
def tokens(value, analyzer, kwargs):
    """Analyzes ``value`` with ``analyzer`` (or, if ``value`` is already a
    pre-tokenized list or tuple, wraps it with entoken()) and returns a
    generator of the non-stopped tokens.
    """
    if isinstance(value, (tuple, list)):
        gen = entoken(value, **kwargs)
    else:
        gen = analyzer(value, **kwargs)
    return unstopped(gen)
class Existence(Format):
"""Only indexes whether a given term occurred in a given document; it does
not store frequencies or positions. This is useful for fields that should
    be searchable but not scorable, such as a file path.
Supports: frequency, weight (always reports frequency = 1).
"""
posting_size = 0
__inittypes__ = {"field_boost": float}
def __init__(self, field_boost=1.0, **options):
self.field_boost = field_boost
self.options = options
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
wordset = {t.text for t in tokens(value, analyzer, kwargs)}
return ((w, 1, fb, emptybytes) for w in wordset)
def encode(self, value):
return emptybytes
def decode_frequency(self, valuestring):
return 1
def decode_weight(self, valuestring):
return self.field_boost
def combine(self, vs):
return emptybytes
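# Illustrative usage sketch (assumes whoosh.analysis.StandardAnalyzer is
# available; the results shown follow from the code above):
#
#     from whoosh.analysis import StandardAnalyzer
#     fmt = Existence(field_boost=2.0)
#     sorted(fmt.word_values("alfa bravo alfa", StandardAnalyzer()))
#     # -> [('alfa', 1, 2.0, b''), ('bravo', 1, 2.0, b'')]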
class Frequency(Format):
"""Stores frequency information for each posting.
Supports: frequency, weight.
"""
posting_size = _INT_SIZE
__inittypes__ = {"field_boost": float, "boost_as_freq": bool}
def __init__(self, field_boost=1.0, boost_as_freq=False, **options):
"""
:param field_boost: A constant boost factor to scale to the score of
all queries matching terms in this field.
"""
assert isinstance(field_boost, float)
self.field_boost = field_boost
self.options = options
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
length = 0
freqs = defaultdict(int)
weights = defaultdict(float)
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
length += 1
freqs[t.text] += 1
weights[t.text] += t.boost
wvs = ((w, freq, weights[w] * fb, pack_uint(freq)) for w, freq in freqs.items())
return wvs
def decode_frequency(self, valuestring):
return unpack_uint(valuestring)[0]
def decode_weight(self, valuestring):
freq = unpack_uint(valuestring)[0]
return freq * self.field_boost
def combine(self, vs):
        return pack_uint(sum(self.decode_frequency(v) for v in vs))
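# Illustrative usage sketch (assumes whoosh.analysis.StandardAnalyzer is
# available; the printed values follow from the code above):
#
#     from whoosh.analysis import StandardAnalyzer
#     fmt = Frequency()
#     for w, freq, weight, v in sorted(fmt.word_values("alfa bravo alfa",
#                                                      StandardAnalyzer())):
#         print(w, freq, weight, fmt.decode_frequency(v), fmt.decode_weight(v))
#     # alfa 2 2.0 2 2.0
#     # bravo 1 1.0 1 1.0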
class Positions(Format):
"""Stores position information in each posting, to allow phrase searching
and "near" queries.
Supports: frequency, weight, positions, position_boosts (always reports
position boost = 1.0).
"""
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
poses = defaultdict(list)
weights = defaultdict(float)
kwargs["positions"] = True
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
poses[t.text].append(t.pos)
weights[t.text] += t.boost
for w, poslist in poses.items():
value = self.encode(poslist)
yield (w, len(poslist), weights[w] * fb, value)
def encode(self, poslist):
deltas = []
base = 0
for pos in poslist:
deltas.append(pos - base)
base = pos
return pack_uint(len(deltas)) + dumps(deltas, 2)
def decode_positions(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE:])
position = 0
positions = []
for code in codes:
position += code
positions.append(position)
return positions
def decode_frequency(self, valuestring):
return unpack_uint(valuestring[:_INT_SIZE])[0]
def decode_weight(self, valuestring):
return self.decode_frequency(valuestring) * self.field_boost
def decode_position_boosts(self, valuestring):
return [(pos, 1) for pos in self.decode_positions(valuestring)]
def combine(self, vs):
s = set()
for v in vs:
s.update(self.decode_positions(v))
return self.encode(sorted(s))
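# Illustrative round-trip sketch: positions are stored as deltas against the
# previous position, and the decoders simply re-accumulate them.
#
#     fmt = Positions()
#     v = fmt.encode([1, 5, 20])
#     fmt.decode_positions(v)   # -> [1, 5, 20]
#     fmt.decode_frequency(v)   # -> 3
#     fmt.decode_weight(v)      # -> 3.0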
class Characters(Positions):
"""Stores token position and character start and end information for each
posting.
Supports: frequency, weight, positions, position_boosts (always reports
position boost = 1.0), characters.
"""
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
seen = defaultdict(list)
weights = defaultdict(float)
kwargs["positions"] = True
kwargs["chars"] = True
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
seen[t.text].append((t.pos, t.startchar, t.endchar))
weights[t.text] += t.boost
for w, poslist in seen.items():
value = self.encode(poslist)
yield (w, len(poslist), weights[w] * fb, value)
def encode(self, poslist):
deltas = []
posbase = 0
charbase = 0
for pos, startchar, endchar in poslist:
deltas.append((pos - posbase, startchar - charbase, endchar - startchar))
posbase = pos
charbase = endchar
return pack_uint(len(deltas)) + dumps(deltas, 2)
def decode_characters(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE:])
position = 0
endchar = 0
posns_chars = []
for code in codes:
position = code[0] + position
startchar = code[1] + endchar
endchar = code[2] + startchar
posns_chars.append((position, startchar, endchar))
return posns_chars
def decode_positions(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE:])
position = 0
posns = []
for code in codes:
position = code[0] + position
posns.append(position)
return posns
def combine(self, vs):
s = {}
for v in vs:
for pos, sc, ec in self.decode_characters(v):
if pos in s:
                    old_sc, old_ec = s[pos]
s[pos] = (min(sc, old_sc), max(ec, old_ec))
else:
s[pos] = (sc, ec)
poses = [(pos, s[pos][0], s[pos][1]) for pos in sorted(s.keys())]
return self.encode(poses)
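# Illustrative round-trip sketch: each entry is (position, startchar, endchar),
# stored as deltas against the previous position and end character.
#
#     fmt = Characters()
#     v = fmt.encode([(0, 0, 4), (2, 11, 15)])
#     fmt.decode_characters(v)  # -> [(0, 0, 4), (2, 11, 15)]
#     fmt.decode_positions(v)   # -> [0, 2]
#     fmt.decode_frequency(v)   # -> 2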
class PositionBoosts(Positions):
"""A format that stores positions and per-position boost information
in each posting.
Supports: frequency, weight, positions, position_boosts.
"""
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
seen = defaultdict(list)
kwargs["positions"] = True
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
pos = t.pos
boost = t.boost
seen[t.text].append((pos, boost))
for w, poses in seen.items():
value = self.encode(poses)
yield (w, len(poses), sum(p[1] for p in poses) * fb, value)
def encode(self, poses):
codes = []
base = 0
summedboost = 0
for pos, boost in poses:
summedboost += boost
codes.append((pos - base, boost))
base = pos
return pack_uint(len(poses)) + pack_float(summedboost) + dumps(codes, 2)
def decode_position_boosts(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE :])
position = 0
posns_boosts = []
for code in codes:
position = code[0] + position
posns_boosts.append((position, code[1]))
return posns_boosts
def decode_positions(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE :])
position = 0
posns = []
for code in codes:
position = code[0] + position
posns.append(position)
return posns
def decode_weight(self, v):
summedboost = unpack_float(v[_INT_SIZE : _INT_SIZE + _FLOAT_SIZE])[0]
return summedboost * self.field_boost
def combine(self, vs):
s = defaultdict(float)
for v in vs:
for pos, boost in self.decode_position_boosts(v):
s[pos] += boost
return self.encode(sorted(s.items()))
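# Illustrative round-trip sketch: the value stores the posting count, the
# summed boost, and then (position delta, boost) pairs.
#
#     fmt = PositionBoosts()
#     v = fmt.encode([(0, 1.0), (3, 2.0), (5, 1.0)])
#     fmt.decode_position_boosts(v)  # -> [(0, 1.0), (3, 2.0), (5, 1.0)]
#     fmt.decode_positions(v)        # -> [0, 3, 5]
#     fmt.decode_weight(v)           # -> 4.0  (summed boost * field_boost)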
class CharacterBoosts(Characters):
"""A format that stores positions, character start and end, and
per-position boost information in each posting.
Supports: frequency, weight, positions, position_boosts, characters,
character_boosts.
"""
def word_values(self, value, analyzer, **kwargs):
seen = defaultdict(list)
kwargs["positions"] = True
kwargs["chars"] = True
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))
for w, poses in seen.items():
value, summedboost = self.encode(poses)
yield (w, len(poses), summedboost, value)
def encode(self, poses):
fb = self.field_boost
# posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
codes = []
posbase = 0
charbase = 0
summedboost = 0
for pos, startchar, endchar, boost in poses:
codes.append(
(pos - posbase, startchar - charbase, endchar - startchar, boost)
)
posbase = pos
charbase = endchar
summedboost += boost
return (
(pack_uint(len(poses)) + pack_float(summedboost * fb) + dumps(codes, 2)),
summedboost,
)
def decode_character_boosts(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE :])
position = 0
endchar = 0
posn_char_boosts = []
for code in codes:
position = position + code[0]
startchar = endchar + code[1]
endchar = startchar + code[2]
posn_char_boosts.append((position, startchar, endchar, code[3]))
return posn_char_boosts
def decode_positions(self, valuestring):
return [item[0] for item in self.decode_character_boosts(valuestring)]
def decode_characters(self, valuestring):
return [
(pos, startchar, endchar)
for pos, startchar, endchar, _ in self.decode_character_boosts(valuestring)
]
def decode_position_boosts(self, valuestring):
return [
(pos, boost)
for pos, _, _, boost in self.decode_character_boosts(valuestring)
]
def combine(self, vs):
s = {}
for v in vs:
for pos, sc, ec, boost in self.decode_character_boosts(v):
if pos in s:
                    old_sc, old_ec, old_boost = s[pos]
s[pos] = (min(sc, old_sc), max(ec, old_ec), old_boost + boost)
else:
s[pos] = (sc, ec, boost)
poses = [(pos, sc, ec, boost) for pos, (sc, ec, boost) in sorted(s.items())]
return self.encode(poses)[0] # encode() returns value, summedboost
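# A small, optional smoke test. This is an illustrative sketch added for this
# excerpt (not part of the original module). It assumes
# whoosh.analysis.StandardAnalyzer is importable, as the imports at the top of
# the module suggest, and prints what each format stores for one field value.
if __name__ == "__main__":
    from whoosh.analysis import StandardAnalyzer

    ana = StandardAnalyzer()
    sample = "alfa bravo alfa"
    for fmt in (Existence(), Frequency(), Positions(), Characters(),
                PositionBoosts(), CharacterBoosts()):
        print(type(fmt).__name__)
        for word, freq, weight, value in sorted(fmt.word_values(sample, ana)):
            line = "    %s freq=%d weight=%.1f" % (word, freq, weight)
            if hasattr(fmt, "decode_positions"):
                # Position-aware formats can recover positions from the value.
                line += " positions=%s" % (fmt.decode_positions(value),)
            print(line)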