# Copyright 2009 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
"""
The classes in this module encode and decode posting information for a field.
The field format essentially determines what information is stored about each
occurrence of a term.
"""
from collections import defaultdict
from pickle import dumps, loads
from whoosh.analysis import entoken, unstopped
from whoosh.system import (
_FLOAT_SIZE,
_INT_SIZE,
emptybytes,
pack_float,
pack_uint,
unpack_float,
unpack_uint,
)
# Format base class
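# The Format base class definition is not included in this excerpt. The sketch
# below is a minimal stand-in (an assumption, not the verbatim original) for
# the interface the concrete classes rely on: a field_boost attribute set in
# __init__, a word_values() hook, and supports()/decoder()/decode_as(), which
# dispatch to the decode_* methods defined by each subclass.
class Format:
    """Abstract base class representing a storage format for a field or
    vector. A Format decides what information (frequency, positions,
    characters, boosts) is stored about each occurrence of a term and how it
    is encoded into the posting value.
    """

    # Size in bytes of an encoded posting value, or -1 if the size varies.
    posting_size = -1

    def __init__(self, field_boost=1.0, **options):
        self.field_boost = field_boost
        self.options = options

    def __repr__(self):
        return "%s(boost=%s)" % (self.__class__.__name__, self.field_boost)

    def fixed_value_size(self):
        # Return the fixed value size, or None if values are variable-length.
        return None if self.posting_size < 0 else self.posting_size

    def word_values(self, value, analyzer, **kwargs):
        # Subclasses yield (word, frequency, weight, encoded_value) tuples.
        raise NotImplementedError

    def supports(self, name):
        # True if this format can decode the named information, e.g.
        # fmt.supports("positions").
        return hasattr(self, "decode_" + name)

    def decoder(self, name):
        # Return the bound decode_<name> method.
        return getattr(self, "decode_" + name)

    def decode_as(self, astype, valuestring):
        # Decode the value string as the named kind of information.
        return self.decoder(astype)(valuestring)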
# Concrete field classes
# TODO: As a legacy behavior, most of these formats store the frequency but not
# the weight in the value string, so if you use field or term boosts,
# postreader.value_as("weight") will not match postreader.weight()
def tokens(value, analyzer, kwargs):
    """Analyzes ``value`` with ``analyzer`` (or, if ``value`` is already a
    pre-tokenized list or tuple, wraps it with entoken()) and returns a
    generator of the non-stopped tokens.
    """
    if isinstance(value, (tuple, list)):
        gen = entoken(value, **kwargs)
    else:
        gen = analyzer(value, **kwargs)
    return unstopped(gen)
class Existence(Format):
"""Only indexes whether a given term occurred in a given document; it does
not store frequencies or positions. This is useful for fields that should
    be searchable but not scorable, such as a file path.
Supports: frequency, weight (always reports frequency = 1).
"""
posting_size = 0
__inittypes__ = {"field_boost": float}
def __init__(self, field_boost=1.0, **options):
self.field_boost = field_boost
self.options = options
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
wordset = {t.text for t in tokens(value, analyzer, kwargs)}
return ((w, 1, fb, emptybytes) for w in wordset)
def encode(self, value):
return emptybytes
def decode_frequency(self, valuestring):
return 1
def decode_weight(self, valuestring):
return self.field_boost
def combine(self, vs):
return emptybytes
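# Illustrative usage sketch (assumes whoosh.analysis.StandardAnalyzer is
# available; the results shown follow from the code above):
#
#     from whoosh.analysis import StandardAnalyzer
#     fmt = Existence(field_boost=2.0)
#     sorted(fmt.word_values("alfa bravo alfa", StandardAnalyzer()))
#     # -> [('alfa', 1, 2.0, b''), ('bravo', 1, 2.0, b'')]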
class Frequency(Format):
"""Stores frequency information for each posting.
Supports: frequency, weight.
"""
posting_size = _INT_SIZE
__inittypes__ = {"field_boost": float, "boost_as_freq": bool}
def __init__(self, field_boost=1.0, boost_as_freq=False, **options):
"""
:param field_boost: A constant boost factor to scale to the score of
all queries matching terms in this field.
"""
assert isinstance(field_boost, float)
self.field_boost = field_boost
self.options = options
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
length = 0
freqs = defaultdict(int)
weights = defaultdict(float)
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
length += 1
freqs[t.text] += 1
weights[t.text] += t.boost
wvs = ((w, freq, weights[w] * fb, pack_uint(freq)) for w, freq in freqs.items())
return wvs
def decode_frequency(self, valuestring):
return unpack_uint(valuestring)[0]
def decode_weight(self, valuestring):
freq = unpack_uint(valuestring)[0]
return freq * self.field_boost
def combine(self, vs):
        return pack_uint(sum(self.decode_frequency(v) for v in vs))
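# Illustrative usage sketch (assumes whoosh.analysis.StandardAnalyzer is
# available; the printed values follow from the code above):
#
#     from whoosh.analysis import StandardAnalyzer
#     fmt = Frequency()
#     for w, freq, weight, v in sorted(fmt.word_values("alfa bravo alfa",
#                                                      StandardAnalyzer())):
#         print(w, freq, weight, fmt.decode_frequency(v), fmt.decode_weight(v))
#     # alfa 2 2.0 2 2.0
#     # bravo 1 1.0 1 1.0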
class Positions(Format):
"""Stores position information in each posting, to allow phrase searching
and "near" queries.
Supports: frequency, weight, positions, position_boosts (always reports
position boost = 1.0).
"""
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
poses = defaultdict(list)
weights = defaultdict(float)
kwargs["positions"] = True
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
poses[t.text].append(t.pos)
weights[t.text] += t.boost
for w, poslist in poses.items():
value = self.encode(poslist)
yield (w, len(poslist), weights[w] * fb, value)
def encode(self, poslist):
deltas = []
base = 0
for pos in poslist:
deltas.append(pos - base)
base = pos
return pack_uint(len(deltas)) + dumps(deltas, 2)
def decode_positions(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE:])
position = 0
positions = []
for code in codes:
position += code
positions.append(position)
return positions
def decode_frequency(self, valuestring):
return unpack_uint(valuestring[:_INT_SIZE])[0]
def decode_weight(self, valuestring):
return self.decode_frequency(valuestring) * self.field_boost
def decode_position_boosts(self, valuestring):
return [(pos, 1) for pos in self.decode_positions(valuestring)]
def combine(self, vs):
s = set()
for v in vs:
s.update(self.decode_positions(v))
return self.encode(sorted(s))
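# Illustrative round-trip sketch: positions are stored as deltas against the
# previous position, and the decoders simply re-accumulate them.
#
#     fmt = Positions()
#     v = fmt.encode([1, 5, 20])
#     fmt.decode_positions(v)   # -> [1, 5, 20]
#     fmt.decode_frequency(v)   # -> 3
#     fmt.decode_weight(v)      # -> 3.0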
class Characters(Positions):
"""Stores token position and character start and end information for each
posting.
Supports: frequency, weight, positions, position_boosts (always reports
position boost = 1.0), characters.
"""
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
seen = defaultdict(list)
weights = defaultdict(float)
kwargs["positions"] = True
kwargs["chars"] = True
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
seen[t.text].append((t.pos, t.startchar, t.endchar))
weights[t.text] += t.boost
for w, poslist in seen.items():
value = self.encode(poslist)
yield (w, len(poslist), weights[w] * fb, value)
def encode(self, poslist):
deltas = []
posbase = 0
charbase = 0
for pos, startchar, endchar in poslist:
deltas.append((pos - posbase, startchar - charbase, endchar - startchar))
posbase = pos
charbase = endchar
return pack_uint(len(deltas)) + dumps(deltas, 2)
def decode_characters(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE:])
position = 0
endchar = 0
posns_chars = []
for code in codes:
position = code[0] + position
startchar = code[1] + endchar
endchar = code[2] + startchar
posns_chars.append((position, startchar, endchar))
return posns_chars
def decode_positions(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE:])
position = 0
posns = []
for code in codes:
position = code[0] + position
posns.append(position)
return posns
def combine(self, vs):
s = {}
for v in vs:
for pos, sc, ec in self.decode_characters(v):
if pos in s:
                    old_sc, old_ec = s[pos]
s[pos] = (min(sc, old_sc), max(ec, old_ec))
else:
s[pos] = (sc, ec)
poses = [(pos, s[pos][0], s[pos][1]) for pos in sorted(s.keys())]
return self.encode(poses)
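# Illustrative round-trip sketch: each entry is (position, startchar, endchar),
# stored as deltas against the previous position and end character.
#
#     fmt = Characters()
#     v = fmt.encode([(0, 0, 4), (2, 11, 15)])
#     fmt.decode_characters(v)  # -> [(0, 0, 4), (2, 11, 15)]
#     fmt.decode_positions(v)   # -> [0, 2]
#     fmt.decode_frequency(v)   # -> 2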
class PositionBoosts(Positions):
"""A format that stores positions and per-position boost information
in each posting.
Supports: frequency, weight, positions, position_boosts.
"""
def word_values(self, value, analyzer, **kwargs):
fb = self.field_boost
seen = defaultdict(list)
kwargs["positions"] = True
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
pos = t.pos
boost = t.boost
seen[t.text].append((pos, boost))
for w, poses in seen.items():
value = self.encode(poses)
yield (w, len(poses), sum(p[1] for p in poses) * fb, value)
def encode(self, poses):
codes = []
base = 0
summedboost = 0
for pos, boost in poses:
summedboost += boost
codes.append((pos - base, boost))
base = pos
return pack_uint(len(poses)) + pack_float(summedboost) + dumps(codes, 2)
def decode_position_boosts(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE :])
position = 0
posns_boosts = []
for code in codes:
position = code[0] + position
posns_boosts.append((position, code[1]))
return posns_boosts
def decode_positions(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE :])
position = 0
posns = []
for code in codes:
position = code[0] + position
posns.append(position)
return posns
def decode_weight(self, v):
summedboost = unpack_float(v[_INT_SIZE : _INT_SIZE + _FLOAT_SIZE])[0]
return summedboost * self.field_boost
def combine(self, vs):
s = defaultdict(float)
for v in vs:
for pos, boost in self.decode_position_boosts(v):
s[pos] += boost
return self.encode(sorted(s.items()))
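# Illustrative round-trip sketch: the value stores the posting count, the
# summed boost, and then (position delta, boost) pairs.
#
#     fmt = PositionBoosts()
#     v = fmt.encode([(0, 1.0), (3, 2.0), (5, 1.0)])
#     fmt.decode_position_boosts(v)  # -> [(0, 1.0), (3, 2.0), (5, 1.0)]
#     fmt.decode_positions(v)        # -> [0, 3, 5]
#     fmt.decode_weight(v)           # -> 4.0  (summed boost * field_boost)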
class CharacterBoosts(Characters):
"""A format that stores positions, character start and end, and
per-position boost information in each posting.
Supports: frequency, weight, positions, position_boosts, characters,
character_boosts.
"""
def word_values(self, value, analyzer, **kwargs):
seen = defaultdict(list)
kwargs["positions"] = True
kwargs["chars"] = True
kwargs["boosts"] = True
for t in tokens(value, analyzer, kwargs):
seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))
for w, poses in seen.items():
value, summedboost = self.encode(poses)
yield (w, len(poses), summedboost, value)
def encode(self, poses):
fb = self.field_boost
# posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
codes = []
posbase = 0
charbase = 0
summedboost = 0
for pos, startchar, endchar, boost in poses:
codes.append(
(pos - posbase, startchar - charbase, endchar - startchar, boost)
)
posbase = pos
charbase = endchar
summedboost += boost
return (
(pack_uint(len(poses)) + pack_float(summedboost * fb) + dumps(codes, 2)),
summedboost,
)
def decode_character_boosts(self, valuestring):
if not valuestring.endswith(b"."):
valuestring += b"."
codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE :])
position = 0
endchar = 0
posn_char_boosts = []
for code in codes:
position = position + code[0]
startchar = endchar + code[1]
endchar = startchar + code[2]
posn_char_boosts.append((position, startchar, endchar, code[3]))
return posn_char_boosts
def decode_positions(self, valuestring):
return [item[0] for item in self.decode_character_boosts(valuestring)]
def decode_characters(self, valuestring):
return [
(pos, startchar, endchar)
for pos, startchar, endchar, _ in self.decode_character_boosts(valuestring)
]
def decode_position_boosts(self, valuestring):
return [
(pos, boost)
for pos, _, _, boost in self.decode_character_boosts(valuestring)
]
def combine(self, vs):
s = {}
for v in vs:
for pos, sc, ec, boost in self.decode_character_boosts(v):
if pos in s:
                    old_sc, old_ec, old_boost = s[pos]
s[pos] = (min(sc, old_sc), max(ec, old_ec), old_boost + boost)
else:
s[pos] = (sc, ec, boost)
poses = [(pos, sc, ec, boost) for pos, (sc, ec, boost) in sorted(s.items())]
return self.encode(poses)[0] # encode() returns value, summedboost
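# A small, optional smoke test. This is an illustrative sketch added for this
# excerpt (not part of the original module). It assumes
# whoosh.analysis.StandardAnalyzer is importable, as the imports at the top of
# the module suggest, and prints what each format stores for one field value.
if __name__ == "__main__":
    from whoosh.analysis import StandardAnalyzer

    ana = StandardAnalyzer()
    sample = "alfa bravo alfa"
    for fmt in (Existence(), Frequency(), Positions(), Characters(),
                PositionBoosts(), CharacterBoosts()):
        print(type(fmt).__name__)
        for word, freq, weight, value in sorted(fmt.word_values(sample, ana)):
            line = "    %s freq=%d weight=%.1f" % (word, freq, weight)
            if hasattr(fmt, "decode_positions"):
                # Position-aware formats can recover positions from the value.
                line += " positions=%s" % (fmt.decode_positions(value),)
            print(line)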