Source code for whoosh.analysis.ngrams

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.analysis.acore import Token
from whoosh.analysis.filters import Filter, LowercaseFilter
from whoosh.analysis.tokenizers import RegexTokenizer, Tokenizer

# Tokenizer


class NgramTokenizer(Tokenizer):
    """Splits input text into N-grams instead of words.

    >>> ngt = NgramTokenizer(4)
    >>> [token.text for token in ngt("hi there")]
    ["hi t", "i th", " the", "ther", "here"]

    Note that this tokenizer does NOT use a regular expression to extract
    words, so the grams emitted by it will contain whitespace, punctuation,
    etc. You may want to massage the input or add a custom filter to this
    tokenizer's output.

    Alternatively, if you only want sub-word grams without whitespace, you
    could combine a RegexTokenizer with NgramFilter instead.
    """

    __inittypes__ = {"minsize": int, "maxsize": int}

    def __init__(self, minsize, maxsize=None):
        """
        :param minsize: The minimum size of the N-grams.
        :param maxsize: The maximum size of the N-grams. If you omit this
            parameter, maxsize == minsize.
        """

        self.min = minsize
        self.max = maxsize or minsize

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.min == other.min and self.max == other.max:
                return True
        return False

    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        assert isinstance(value, str), f"{value!r} is not unicode"

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query":
            size = min(self.max, inlen)
            for start in range(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            for start in range(0, inlen - self.min + 1):
                for size in range(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end
                    yield t
                    pos += 1
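
# A minimal usage sketch (illustrative only, not part of the original
# module): in index mode the tokenizer emits every gram size from minsize
# to maxsize at each start position, while mode="query" emits only grams
# of the single largest size that fits in the value.
def _demo_ngram_tokenizer():  # pragma: no cover - hypothetical example
    ngt = NgramTokenizer(2, 3)
    # Index mode: all sizes 2-3 at each start position.
    assert [t.text for t in ngt("abcd")] == ["ab", "abc", "bc", "bcd", "cd"]
    # Query mode: only the largest size that fits (3 here).
    assert [t.text for t in ngt("abcd", mode="query")] == ["abc", "bcd"]
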
# Filter
class NgramFilter(Filter):
    """Splits token text into N-grams.

    >>> rext = RegexTokenizer()
    >>> stream = rext("hello there")
    >>> ngf = NgramFilter(4)
    >>> [token.text for token in ngf(stream)]
    ["hell", "ello", "ther", "here"]
    """

    __inittypes__ = {"minsize": int, "maxsize": int}

    def __init__(self, minsize, maxsize=None, at=None):
        """
        :param minsize: The minimum size of the N-grams.
        :param maxsize: The maximum size of the N-grams. If you omit this
            parameter, maxsize == minsize.
        :param at: If 'start', only take N-grams from the start of each word.
            If 'end', only take N-grams from the end of each word. Otherwise,
            take all N-grams from the word (the default).
        """

        self.min = minsize
        self.max = maxsize or minsize
        self.at = 0
        if at == "start":
            self.at = -1
        elif at == "end":
            self.at = 1

    def __eq__(self, other):
        # Compare 'at' as well, so filters that anchor their grams
        # differently don't compare equal.
        return (
            other
            and self.__class__ is other.__class__
            and self.min == other.min
            and self.max == other.max
            and self.at == other.at
        )

    def __call__(self, tokens):
        assert hasattr(tokens, "__iter__")
        at = self.at
        for t in tokens:
            text = t.text
            if len(text) < self.min:
                continue

            chars = t.chars
            if chars:
                startchar = t.startchar
            # Token positions don't mean much for N-grams, so we'll leave
            # the token's original position untouched.

            if t.mode == "query":
                size = min(self.max, len(text))
                if at == -1:
                    t.text = text[:size]
                    if chars:
                        t.endchar = startchar + size
                    yield t
                elif at == 1:
                    t.text = text[-size:]
                    if chars:
                        t.startchar = t.endchar - size
                    yield t
                else:
                    for start in range(0, len(text) - size + 1):
                        t.text = text[start : start + size]
                        if chars:
                            t.startchar = startchar + start
                            t.endchar = startchar + start + size
                        yield t
            else:
                if at == -1:
                    limit = min(self.max, len(text))
                    for size in range(self.min, limit + 1):
                        t.text = text[:size]
                        if chars:
                            t.endchar = startchar + size
                        yield t
                elif at == 1:
                    if chars:
                        original_startchar = t.startchar
                    start = max(0, len(text) - self.max)
                    for i in range(start, len(text) - self.min + 1):
                        t.text = text[i:]
                        if chars:
                            t.startchar = original_startchar + i
                        yield t
                else:
                    for start in range(0, len(text) - self.min + 1):
                        for size in range(self.min, self.max + 1):
                            end = start + size
                            if end > len(text):
                                continue
                            t.text = text[start:end]
                            if chars:
                                t.startchar = startchar + start
                                t.endchar = startchar + end
                            yield t
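
# A minimal usage sketch (illustrative only, not part of the original
# module), showing how the at= modes anchor the grams within each word.
def _demo_ngram_filter():  # pragma: no cover - hypothetical example
    rext = RegexTokenizer()
    # at="start" keeps only grams anchored at the beginning of each word...
    ngf = NgramFilter(3, at="start")
    assert [t.text for t in ngf(rext("hello there"))] == ["hel", "the"]
    # ...while at="end" keeps only grams anchored at the end.
    ngf = NgramFilter(3, at="end")
    assert [t.text for t in ngf(rext("hello there"))] == ["llo", "ere"]
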
# Analyzers
def NgramAnalyzer(minsize, maxsize=None):
    """Composes an NgramTokenizer and a LowercaseFilter.

    >>> ana = NgramAnalyzer(4)
    >>> [token.text for token in ana("hi there")]
    ["hi t", "i th", " the", "ther", "here"]
    """

    return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()
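
# A minimal usage sketch (illustrative only, not part of the original
# module): grams from NgramAnalyzer cross word boundaries and come out
# lowercased.
def _demo_ngram_analyzer():  # pragma: no cover - hypothetical example
    ana = NgramAnalyzer(3)
    assert [t.text for t in ana("Hi There")] == [
        "hi ", "i t", " th", "the", "her", "ere"
    ]
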
def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None):
    """Composes a tokenizer (RegexTokenizer by default), a LowercaseFilter,
    and an NgramFilter, so that N-grams are taken from individual words
    rather than from the raw text.

    >>> ana = NgramWordAnalyzer(4)
    >>> [token.text for token in ana("hi there")]
    ["ther", "here"]
    """

    if not tokenizer:
        tokenizer = RegexTokenizer()
    return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at)
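
# A minimal comparison sketch (illustrative only, not part of the original
# module): unlike NgramAnalyzer, NgramWordAnalyzer never emits a gram that
# spans the whitespace between two words.
def _demo_ngram_word_analyzer():  # pragma: no cover - hypothetical example
    assert [t.text for t in NgramAnalyzer(4)("hi there")] == [
        "hi t", "i th", " the", "ther", "here"
    ]
    assert [t.text for t in NgramWordAnalyzer(4)("hi there")] == [
        "ther", "here"
    ]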