# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.analysis.acore import Composable, CompositionError
from whoosh.analysis.filters import STOP_WORDS, LowercaseFilter, StopFilter
from whoosh.analysis.intraword import IntraWordFilter
from whoosh.analysis.morph import StemFilter
from whoosh.analysis.tokenizers import (
    CommaSeparatedTokenizer,
    IDTokenizer,
    RegexTokenizer,
    SpaceSeparatedTokenizer,
    Tokenizer,
    default_pattern,
)
from whoosh.lang.porter import stem


# Analyzers
class Analyzer(Composable):
    """Abstract base class for analyzers."""

    def __repr__(self):
        return f"{self.__class__.__name__}()"

    def __eq__(self, other):
        return (
            other
            and self.__class__ is other.__class__
            and self.__dict__ == other.__dict__
        )

    def __call__(self, value, **kwargs):
        raise NotImplementedError

    def clean(self):
        # This method is intentionally left empty.
        pass


class CompositeAnalyzer(Analyzer):
    def __init__(self, *composables):
        self.items = []

        for comp in composables:
            if isinstance(comp, CompositeAnalyzer):
                self.items.extend(comp.items)
            else:
                self.items.append(comp)

        # A tokenizer must start the chain, followed only by filters
        # (because tokenizers take a string and return a generator of tokens,
        # and filters take and return generators of tokens)
        for item in self.items[1:]:
            if isinstance(item, Tokenizer):
                raise CompositionError(
                    f"Only one tokenizer allowed at the start of the analyzer: {self.items}"
                )

    def __repr__(self):
        return "{}({})".format(
            self.__class__.__name__,
            ", ".join(repr(item) for item in self.items),
        )

    def __call__(self, value, no_morph=False, **kwargs):
        items = self.items
        # Start with tokenizer
        gen = items[0](value, **kwargs)
        # Run filters
        for item in items[1:]:
            if not (no_morph and hasattr(item, "is_morph") and item.is_morph):
                gen = item(gen)
        return gen

    def __getitem__(self, item):
        return self.items.__getitem__(item)

    def __len__(self):
        return len(self.items)

    def __eq__(self, other):
        return (
            other
            and self.__class__ is other.__class__
            and self.items == other.items
        )

    def clean(self):
        for item in self.items:
            if hasattr(item, "clean"):
                item.clean()

    def has_morph(self):
        return any(item.is_morph for item in self.items)
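

# A minimal composition sketch (illustrative only; the helper name
# ``_composition_example`` is not part of the library): the ``|`` operator
# builds a CompositeAnalyzer, and a tokenizer anywhere but the first position
# raises CompositionError.
def _composition_example():
    ana = RegexTokenizer() | LowercaseFilter() | StopFilter()
    assert isinstance(ana, CompositeAnalyzer)
    assert len(ana) == 3  # one tokenizer followed by two filters

    # Stop words ("the", "of", "this") are removed and the rest lowercased.
    tokens = [t.text for t in ana("The RENDERING of this text")]
    # expected: ["rendering", "text"]

    try:
        # A second tokenizer after the first item is rejected at build time.
        RegexTokenizer() | LowercaseFilter() | RegexTokenizer()
    except CompositionError:
        pass

    return tokens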


# Functions that return composed analyzers

def IDAnalyzer(lowercase=False):
    """Deprecated, just use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    """

    tokenizer = IDTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer


def KeywordAnalyzer(lowercase=False, commas=False):
    """Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: whether to lowercase the tokens.
    :param commas: if True, items are separated by commas rather than
        whitespace.
    """

    if commas:
        tokenizer = CommaSeparatedTokenizer()
    else:
        tokenizer = SpaceSeparatedTokenizer()
    if lowercase:
        tokenizer = tokenizer | LowercaseFilter()
    return tokenizer
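

# Illustrative sketch (the helper name is hypothetical): with ``commas=True``
# the analyzer splits on commas and strips surrounding whitespace, so
# multi-word items survive as single tokens.
def _keyword_commas_example():
    ana = KeywordAnalyzer(lowercase=True, commas=True)
    return [t.text for t in ana("Python, Full-Text Search,  Whoosh")]
    # expected: ["python", "full-text search", "whoosh"]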


def RegexAnalyzer(expression=r"\w+(\.?\w+)*", gaps=False):
    """Deprecated, just use a RegexTokenizer directly."""

    return RegexTokenizer(expression=expression, gaps=gaps)


def SimpleAnalyzer(expression=default_pattern, gaps=False):
    """Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()


def StandardAnalyzer(
    expression=default_pattern, stoplist=STOP_WORDS, minsize=2, maxsize=None, gaps=False
):
    """Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain
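

# Sketch of how the optional stop filter changes the returned chain (the
# helper name is hypothetical): with ``stoplist=None`` only the tokenizer and
# LowercaseFilter are composed, so stop words are kept.
def _standard_analyzer_example():
    with_stops = StandardAnalyzer()
    no_stops = StandardAnalyzer(stoplist=None)
    assert len(with_stops) == 3 and len(no_stops) == 2
    return [t.text for t in no_stops("A test of the analyzer")]
    # expected: ["a", "test", "of", "the", "analyzer"]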


def StemmingAnalyzer(
    expression=default_pattern,
    stoplist=STOP_WORDS,
    minsize=2,
    maxsize=None,
    gaps=False,
    stemfn=stem,
    ignore=None,
    cachesize=50000,
):
    """Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    """

    ret = RegexTokenizer(expression=expression, gaps=gaps)
    chain = ret | LowercaseFilter()
    if stoplist is not None:
        chain = chain | StopFilter(stoplist=stoplist, minsize=minsize, maxsize=maxsize)
    return chain | StemFilter(stemfn=stemfn, ignore=ignore, cachesize=cachesize)
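

# Sketch (hypothetical helper): StemFilter is a morphological filter, so
# calling the composed analyzer with ``no_morph=True`` skips the stemming
# step; see CompositeAnalyzer.__call__ above.
def _stemming_no_morph_example():
    ana = StemmingAnalyzer()
    assert ana.has_morph()
    stemmed = [t.text for t in ana("renderings")]  # expected: ["render"]
    unstemmed = [t.text for t in ana("renderings", no_morph=True)]  # expected: ["renderings"]
    return stemmed, unstemmed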


def FancyAnalyzer(
    expression=r"\s+",
    stoplist=STOP_WORDS,
    minsize=2,
    gaps=True,
    splitwords=True,
    splitnums=True,
    mergewords=False,
    mergenums=False,
):
    """Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    """

    return (
        RegexTokenizer(expression=expression, gaps=gaps)
        | IntraWordFilter(
            splitwords=splitwords,
            splitnums=splitnums,
            mergewords=mergewords,
            mergenums=mergenums,
        )
        | LowercaseFilter()
        | StopFilter(stoplist=stoplist, minsize=minsize)
    )
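

# Sketch (hypothetical helper): FancyAnalyzer splits on whitespace first and
# relies on IntraWordFilter to break apart camelCase and underscore_joined
# words before lowercasing and stop-word removal.
def _fancy_analyzer_example():
    ana = FancyAnalyzer()
    assert len(ana) == 4  # tokenizer, intra-word, lowercase, stop filters
    return [t.text for t in ana("Call getInt or get_real")]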


def LanguageAnalyzer(lang, expression=default_pattern, gaps=False, cachesize=50000):
    """Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    """

    from whoosh.lang import NoStemmer, NoStopWords

    # Make the start of the chain
    chain = RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()

    # Add a stop word filter
    try:
        chain = chain | StopFilter(lang=lang)
    except NoStopWords:
        pass

    # Add a stemming filter
    try:
        chain = chain | StemFilter(lang=lang, cachesize=cachesize)
    except NoStemmer:
        pass

    return chain
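

# Sketch (hypothetical helper): the try/except blocks above mean a language
# with no stop list or no stemmer still yields a usable chain, just with
# fewer filters.
def _language_analyzer_example():
    spanish = LanguageAnalyzer("es")  # Spanish has both stop words and a stemmer
    assert spanish.has_morph()
    return [t.text for t in spanish("Por el mar corren las liebres")]
    # expected (per the docstring above): ['mar', 'corr', 'liebr']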