"""Morphological analysis filters (stemming and Double Metaphone) for Whoosh."""

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.filters import Filter
from whoosh.lang.dmetaphone import double_metaphone
from whoosh.lang.porter import stem
from whoosh.util.cache import lfu_cache, unbound_cache


class StemFilter(Filter):
    """Stems (removes suffixes from) the text of tokens using the Porter
    stemming algorithm. Stemming attempts to reduce multiple forms of the same
    root word (for example, "rendering", "renders", "rendered", etc.) to a
    single word in the index.

    >>> stemmer = RegexTokenizer() | StemFilter()
    >>> [token.text for token in stemmer("fundamentally willows")]
    ["fundament", "willow"]

    You can pass your own stemming function to the StemFilter. The default
    is the Porter stemming algorithm for English.

    >>> stemfilter = StemFilter(stem_function)

    You can also use one of the Snowball stemming functions by passing the
    `lang` keyword argument.

    >>> stemfilter = StemFilter(lang="ru")

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` to check if a given language
    has a stemming function available.

    By default, this class wraps an LRU cache around the stemming function.
    The ``cachesize`` keyword argument sets the size of the cache. To make the
    cache unbounded (the class caches every input), use ``cachesize=-1``. To
    disable caching, use ``cachesize=None``.

    If you compile and install the py-stemmer library, the
    :class:`PyStemmerFilter` provides slightly easier access to the language
    stemmers in that library.
    """

    __inittypes__ = {"stemfn": object, "ignore": list}

    is_morph = True

    def __init__(self, stemfn=stem, lang=None, ignore=None, cachesize=50000):
        """
        :param stemfn: the function to use for stemming.
        :param lang: if not None, overrides the stemfn with a language stemmer
            from the ``whoosh.lang.snowball`` package.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache. Use ``-1`` for
            an unbounded cache, or ``None`` for no caching.
        """
        self.stemfn = stemfn
        self.lang = lang
        self.ignore = frozenset() if ignore is None else frozenset(ignore)
        self.cachesize = cachesize
        # clear() sets the _stem attr to a cached wrapper around self.stemfn
        self.clear()

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state
        return {k: self.__dict__[k] for k in self.__dict__ if k != "_stem"}

    def __setstate__(self, state):
        # Check for old instances of StemFilter class, which didn't have a
        # cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 50000
        if "ignores" in state:
            # Very old pickles used the attribute name "ignores"
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "lang" not in state:
            self.lang = None
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self.clear()

    def clear(self):
        """(Re)creates the ``_stem`` attribute: the (possibly cache-wrapped)
        stemming function that is actually applied to token text.
        """
        if self.lang:
            from whoosh.lang import stemmer_for_language

            stemfn = stemmer_for_language(self.lang)
        else:
            stemfn = self.stemfn

        if isinstance(self.cachesize, int) and self.cachesize != 0:
            if self.cachesize < 0:
                self._stem = unbound_cache(stemfn)
            elif self.cachesize > 1:
                self._stem = lfu_cache(self.cachesize)(stemfn)
            else:
                # BUG FIX: previously ``cachesize == 1`` fell through without
                # ever assigning ``_stem``, causing an AttributeError on first
                # use. A one-entry cache is pointless, so use the bare
                # function.
                self._stem = stemfn
        else:
            # cachesize is None (or 0): caching disabled
            self._stem = stemfn

    def cache_info(self):
        """Returns the cache statistics of the wrapped stemming function, or
        None if caching is disabled (``cachesize`` of None, 0, 1, or -1).
        """
        # BUG FIX: guard the comparison so that ``cachesize=None`` (a
        # documented way to disable caching) doesn't raise TypeError on
        # ``None <= 1`` under Python 3.
        if not isinstance(self.cachesize, int) or self.cachesize <= 1:
            return None
        return self._stem.cache_info()

    def __eq__(self, other):
        return (
            other and self.__class__ is other.__class__ and self.stemfn == other.stemfn
        )

    def __call__(self, tokens):
        # Hoist attribute lookups out of the token loop
        stemfn = self._stem
        ignore = self.ignore

        for t in tokens:
            if not t.stopped:
                text = t.text
                if text not in ignore:
                    t.text = stemfn(text)
            yield t
class PyStemmerFilter(StemFilter):
    """This is a simple subclass of StemFilter that works with the py-stemmer
    third-party library. You must have the py-stemmer library installed to use
    this filter.

    >>> PyStemmerFilter("spanish")
    """

    def __init__(self, lang="english", ignore=None, cachesize=10000):
        """
        :param lang: a string identifying the stemming algorithm to use. You
            can get a list of available algorithms by with the
            :meth:`PyStemmerFilter.algorithms` method. The identification
            strings are directly from the py-stemmer library.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache.
        """
        self.lang = lang
        self.ignore = frozenset(ignore) if ignore is not None else frozenset()
        self.cachesize = cachesize
        # Unlike StemFilter, the cache lives inside the py-stemmer object
        self._stem = self._get_stemmer_fn()

    def algorithms(self):
        """Returns a list of stemming algorithms provided by the py-stemmer
        library.
        """
        import Stemmer  # type: ignore @UnresolvedImport

        return Stemmer.algorithms()

    def cache_info(self):
        # py-stemmer manages its own cache internally; no stats available
        return None

    def _get_stemmer_fn(self):
        # Build a py-stemmer Stemmer for the configured language and return
        # its bound stemWord method as the stemming function
        import Stemmer  # type: ignore @UnresolvedImport

        engine = Stemmer.Stemmer(self.lang)
        engine.maxCacheSize = self.cachesize
        return engine.stemWord

    def __getstate__(self):
        # Can't pickle a dynamic function, so we have to remove the _stem
        # attribute from the state
        state = dict(self.__dict__)
        state.pop("_stem", None)
        return state

    def __setstate__(self, state):
        # Check for old instances of StemFilter class, which didn't have a
        # cachesize attribute and pickled the cache attribute
        if "cachesize" not in state:
            self.cachesize = 10000
        if "ignores" in state:
            # Very old pickles stored the attribute under "ignores"
            self.ignore = state["ignores"]
        elif "ignore" not in state:
            self.ignore = frozenset()
        if "cache" in state:
            del state["cache"]

        self.__dict__.update(state)
        # Set the _stem attribute
        self._stem = self._get_stemmer_fn()
class DoubleMetaphoneFilter(Filter):
    """Transforms the text of the tokens using Lawrence Philips's Double
    Metaphone algorithm. This algorithm attempts to encode words in such a way
    that similar-sounding words reduce to the same code. This may be useful for
    fields containing the names of people and places, and other uses where
    tolerance of spelling differences is desireable.
    """

    is_morph = True

    def __init__(self, primary_boost=1.0, secondary_boost=0.5, combine=False):
        """
        :param primary_boost: the boost to apply to the token containing the
            primary code.
        :param secondary_boost: the boost to apply to the token containing the
            secondary code, if any.
        :param combine: if True, the original unencoded tokens are kept in the
            stream, preceding the encoded tokens.
        """
        self.primary_boost = primary_boost
        self.secondary_boost = secondary_boost
        self.combine = combine

    def __eq__(self, other):
        return (
            other
            and self.__class__ is other.__class__
            and self.primary_boost == other.primary_boost
        )

    def __call__(self, tokens):
        # Hoist configuration lookups out of the token loop
        boosts = (self.primary_boost, self.secondary_boost)
        keep_original = self.combine

        for token in tokens:
            if keep_original:
                # Emit the unencoded token before its encodings
                yield token

            base_boost = token.boost
            # double_metaphone returns a (primary, secondary) pair; the same
            # token object is reused (text/boost overwritten) for each
            # non-empty code, primary first.
            for code, boost in zip(double_metaphone(token.text), boosts):
                if code:
                    token.text = code
                    token.boost = base_boost * boost
                    yield token