Source code for whoosh.analysis.analyzers

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.acore import Composable, CompositionError
from whoosh.analysis.filters import STOP_WORDS, LowercaseFilter, StopFilter
from whoosh.analysis.intraword import IntraWordFilter
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.tokenizers import (
    CommaSeparatedTokenizer,
    IDTokenizer,
    RegexTokenizer,
    SpaceSeparatedTokenizer,
    Tokenizer,
    default_pattern,
)
from whoosh.lang.porter import stem

# Analyzers


class Analyzer(Composable):
    """Abstract base class for analyzers."""

    def __repr__(self):
        return f"{self.__class__.__name__}()"

    def __eq__(self, other):
        return (
            other
            and self.__class__ is other.__class__
            and self.__dict__ == other.__dict__
        )

    def __call__(self, value, **kwargs):
        raise NotImplementedError

    def clean(self):
        # This method is intentionally left empty.
        pass

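# Illustrative sketch (not part of the original module): a concrete Analyzer
# implements __call__ to take a string and return a generator of tokens. In
# practice you rarely subclass Analyzer directly; composing a tokenizer with
# filters (see CompositeAnalyzer below) is the usual approach.
class _ExampleWhitespaceAnalyzer(Analyzer):
    def __call__(self, value, **kwargs):
        # Delegate tokenization to a whitespace tokenizer
        return SpaceSeparatedTokenizer()(value, **kwargs)
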

class CompositeAnalyzer(Analyzer):
    def __init__(self, *composables):
        self.items = []

        for comp in composables:
            if isinstance(comp, CompositeAnalyzer):
                self.items.extend(comp.items)
            else:
                self.items.append(comp)

        # Tokenizers must start a chain, and then only filters after that
        # (because analyzers take a string and return a generator of tokens,
        # and filters take and return generators of tokens)
        for item in self.items[1:]:
            if isinstance(item, Tokenizer):
                raise CompositionError(
                    f"Only one tokenizer allowed at the start of the analyzer: {self.items}"
                )

    def __repr__(self):
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join(repr(item) for item in self.items),
        )

    def __call__(self, value, no_morph=False, **kwargs):
        items = self.items
        # Start with tokenizer
        gen = items[0](value, **kwargs)
        # Run filters
        for item in items[1:]:
            if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
                gen = item(gen)
        return gen

    def __getitem__(self, item):
        return self.items.__getitem__(item)

    def __len__(self):
        return len(self.items)

    def __eq__(self, other):
        return other and self.__class__ is other.__class__ and self.items == other.items

    def clean(self):
        for item in self.items:
            if hasattr(item, "clean"):
                item.clean()

    def has_morph(self):
        return any(item.is_morph for item in self.items)

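# Illustrative sketch (not part of the original module): the | operator
# inherited from Composable chains a tokenizer and filters into a
# CompositeAnalyzer. Passing no_morph=True at call time skips morphological
# filters such as StemFilter, e.g. when unstemmed tokens are wanted.
def _example_composition():
    ana = RegexTokenizer() | LowercaseFilter() | StemFilter()
    assert isinstance(ana, CompositeAnalyzer) and ana.has_morph()
    stemmed = [t.text for t in ana("Rendering shaded")]  # ["render", "shade"]
    plain = [t.text for t in ana("Rendering shaded", no_morph=True)]
    return stemmed, plain  # plain == ["rendering", "shaded"]
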

# Functions that return composed analyzers


def IDAnalyzer(lowercase=False):
    """Deprecated; use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    """

    tokenizer = IDTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


def KeywordAnalyzer(lowercase=False, commas=False):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: whether to lowercase the tokens.
    :param commas: if True, items are separated by commas rather than
        whitespace.
    """

    if commas:
        tokenizer = CommaSeparatedTokenizer()
    else:
        tokenizer = SpaceSeparatedTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


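# Illustrative sketch (not part of the original module): with commas=True the
# analyzer splits on commas and strips surrounding whitespace, so multi-word
# items survive as single tokens.
def _example_keyword_commas():
    ana = KeywordAnalyzer(lowercase=True, commas=True)
    # expected: ["big data", "search engines", "python"]
    return [t.text for t in ana("Big Data, Search Engines , Python")]

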
def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
    """Deprecated; use a RegexTokenizer directly."""

    return RegexTokenizer(expression=expression, gaps=gaps)


def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()


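# Illustrative sketch (not part of the original module): with gaps=True the
# expression matches the separators instead of the tokens, so the text is
# split on the pattern.
def _example_simple_gaps():
    ana = SimpleAnalyzer(expression=r"[,;]", gaps=True)
    # expected: ["one", "two", "three"]
    return [t.text for t in ana("One,Two;THREE")]

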
def StandardAnalyzer(
    expression=default_pattern, stoplist=STOP_WORDS, minsize=2, maxsize=None, gaps=False
):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain


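# Illustrative sketch (not part of the original module): stoplist=None skips
# the StopFilter entirely, so stop words and short words are kept.
def _example_standard_no_stopwords():
    ana = StandardAnalyzer(stoplist=None)
    # expected: ["testing", "is", "testing", "and", "testing"]
    return [t.text for t in ana("Testing is testing and testing")]

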
def StemmingAnalyzer(
    expression=default_pattern,
    stoplist=STOP_WORDS,
    minsize=2,
    maxsize=None,
    gaps=False,
    stemfn=stem,
    ignore=None,
    cachesize=50000,
):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize)


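# Illustrative sketch (not part of the original module): words in the ignore
# collection pass through the StemFilter unchanged.
def _example_stemming_ignore():
    ana = StemmingAnalyzer(ignore=["testing"])
    # expected: ["testing", "testing", "testing"] instead of ["test", ...]
    return [t.text for t in ana("Testing is testing and testing")]

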
def FancyAnalyzer(
    expression=r"\s+",
    stoplist=STOP_WORDS,
    minsize=2,
    gaps=True,
    splitwords=True,
    splitnums=True,
    mergewords=False,
    mergenums=False,
):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract
        tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return (
        RegexTokenizer(expression=expression, gaps=gaps)
        | IntraWordFilter(
            splitwords=splitwords,
            splitnums=splitnums,
            mergewords=mergewords,
            mergenums=mergenums,
        )
        | LowercaseFilter()
        | StopFilter(stoplist=stoplist, minsize=minsize)
    )


def LanguageAnalyzer(lang, expression=default_pattern, gaps=False, cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param lang: the language code of the stop word list and stemming
        function to use, e.g. "es".
    :param expression: The regular expression pattern to use to extract
        tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords

    # Make the start of the chain
    chain = RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()

    # Add a stop word filter
    try:
        chain = chain | StopFilter(lang=lang)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain


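# Illustrative sketch (not part of the original module): analyzers are
# normally attached to a schema field, for example via whoosh.fields.TEXT.
def _example_schema_usage():
    from whoosh.fields import TEXT, Schema

    return Schema(content=TEXT(analyzer=LanguageAnalyzer("es")))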