Source code for whoosh.matching.binary

# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.matching import mcore


[docs]class BiMatcher(mcore.Matcher): """Base class for matchers that combine the results of two sub-matchers in some way. """ def __init__(self, a, b): super().__init__() self.a = a self.b = b def reset(self): self.a.reset() self.b.reset() def __repr__(self): return f"{self.__class__.__name__}({self.a!r}, {self.b!r})" def children(self): return [self.a, self.b] def copy(self): return self.__class__(self.a.copy(), self.b.copy()) def depth(self): return 1 + max(self.a.depth(), self.b.depth()) def skip_to(self, id): if not self.is_active(): raise mcore.ReadTooFar ra = self.a.skip_to(id) rb = self.b.skip_to(id) return ra or rb def supports_block_quality(self): return self.a.supports_block_quality() and self.b.supports_block_quality() def supports(self, astype): return self.a.supports(astype) and self.b.supports(astype)
[docs]class AdditiveBiMatcher(BiMatcher): """Base class for binary matchers where the scores of the sub-matchers are added together. """ def max_quality(self): q = 0.0 if self.a.is_active(): q += self.a.max_quality() if self.b.is_active(): q += self.b.max_quality() return q def block_quality(self): bq = 0.0 if self.a.is_active(): bq += self.a.block_quality() if self.b.is_active(): bq += self.b.block_quality() return bq def weight(self): return self.a.weight() + self.b.weight() def score(self): return self.a.score() + self.b.score() def __eq__(self, other): return self.__class__ is type(other) def __lt__(self, other): return type(other) is self.__class__ def __ne__(self, other): return not self.__eq__(other) def __gt__(self, other): return not (self.__lt__(other) or self.__eq__(other)) def __le__(self, other): return self.__eq__(other) or self.__lt__(other) def __ge__(self, other): return self.__eq__(other) or self.__gt__(other)
[docs]class UnionMatcher(AdditiveBiMatcher): """Matches the union (OR) of the postings in the two sub-matchers.""" _id = None def replace(self, minquality=0): a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() # If neither sub-matcher on its own has a high enough max quality to # contribute, convert to an intersection matcher if minquality and a_active and b_active: a_max = a.max_quality() b_max = b.max_quality() if a_max < minquality and b_max < minquality: return IntersectionMatcher(a, b).replace(minquality) elif a_max < minquality: return AndMaybeMatcher(b, a) elif b_max < minquality: return AndMaybeMatcher(a, b) # If one or both of the sub-matchers are inactive, convert if not (a_active or b_active): return mcore.NullMatcher() elif not a_active: return b.replace(minquality) elif not b_active: return a.replace(minquality) a = a.replace(minquality - b.max_quality() if minquality else 0) b = b.replace(minquality - a.max_quality() if minquality else 0) # If one of the sub-matchers changed, return a new union if a is not self.a or b is not self.b: return self.__class__(a, b) else: self._id = None return self def is_active(self): return self.a.is_active() or self.b.is_active() def skip_to(self, id): self._id = None ra = rb = False if self.a.is_active(): ra = self.a.skip_to(id) if self.b.is_active(): rb = self.b.skip_to(id) return ra or rb def id(self): _id = self._id if _id is not None: return _id a = self.a b = self.b if not a.is_active(): _id = b.id() elif not b.is_active(): _id = a.id() else: _id = min(a.id(), b.id()) self._id = _id return _id # Using sets is faster in most cases, but could potentially use a lot of # memory. Comment out this method override to not use sets. # def all_ids(self): # return iter(sorted(set(self.a.all_ids()) | set(self.b.all_ids()))) def next(self): self._id = None a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() # Shortcut when one matcher is inactive if not (a_active or b_active): raise mcore.ReadTooFar elif not a_active: return b.next() elif not b_active: return a.next() a_id = a.id() b_id = b.id() ar = br = None # After all that, here's the actual implementation if a_id <= b_id: ar = a.next() if b_id <= a_id: br = b.next() return ar or br def spans(self): if not self.a.is_active(): return self.b.spans() if not self.b.is_active(): return self.a.spans() id_a = self.a.id() id_b = self.b.id() if id_a < id_b: return self.a.spans() elif id_b < id_a: return self.b.spans() else: return sorted(set(self.a.spans()) | set(self.b.spans())) def weight(self): a = self.a b = self.b if not a.is_active(): return b.weight() if not b.is_active(): return a.weight() id_a = a.id() id_b = b.id() if id_a < id_b: return a.weight() elif id_b < id_a: return b.weight() else: return a.weight() + b.weight() def score(self): a = self.a b = self.b if not a.is_active(): return b.score() if not b.is_active(): return a.score() id_a = a.id() id_b = b.id() if id_a < id_b: return a.score() elif id_b < id_a: return b.score() else: return a.score() + b.score() def skip_to_quality(self, minquality): self._id = None a = self.a b = self.b if not (a.is_active() or b.is_active()): raise mcore.ReadTooFar # Short circuit if one matcher is inactive if not a.is_active(): return b.skip_to_quality(minquality) elif not b.is_active(): return a.skip_to_quality(minquality) skipped = 0 aq = a.block_quality() bq = b.block_quality() while a.is_active() and b.is_active() and aq + bq < minquality: if aq < bq: skipped += a.skip_to_quality(minquality - bq) aq = a.block_quality() else: skipped += b.skip_to_quality(minquality - aq) bq = b.block_quality() return skipped
[docs]class DisjunctionMaxMatcher(UnionMatcher): """Matches the union (OR) of two sub-matchers. Where both sub-matchers match the same posting, returns the weight/score of the higher-scoring posting. """ # TODO: this class inherits from AdditiveBiMatcher (through UnionMatcher) # but it does not add the scores of the sub-matchers together (it # overrides all methods that perform addition). Need to clean up the # inheritance. def __init__(self, a, b, tiebreak=0.0): super().__init__(a, b) self.tiebreak = tiebreak def copy(self): return self.__class__(self.a.copy(), self.b.copy(), tiebreak=self.tiebreak) def replace(self, minquality=0): a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() # DisMax takes the max of the sub-matcher qualities instead of adding # them, so we need special logic here if minquality and a_active and b_active: a_max = a.max_quality() b_max = b.max_quality() if a_max < minquality and b_max < minquality: # If neither sub-matcher has a high enough max quality to # contribute, return an inactive matcher return mcore.NullMatcher() elif b_max < minquality: # If the b matcher can't contribute, return a return a.replace(minquality) elif a_max < minquality: # If the a matcher can't contribute, return b return b.replace(minquality) if not (a_active or b_active): return mcore.NullMatcher() elif not a_active: return b.replace(minquality) elif not b_active: return a.replace(minquality) # We CAN pass the minquality down here, since we don't add the two # scores together a = a.replace(minquality) b = b.replace(minquality) a_active = a.is_active() b_active = b.is_active() # It's kind of tedious to check for inactive sub-matchers all over # again here after we replace them, but it's probably better than # returning a replacement with an inactive sub-matcher if not (a_active and b_active): return mcore.NullMatcher() elif not a_active: return b elif not b_active: return a elif a is not self.a or b is not self.b: # If one of the sub-matchers changed, return a new DisMax return self.__class__(a, b) else: return self def score(self): if not self.a.is_active(): return self.b.score() elif not self.b.is_active(): return self.a.score() else: return max(self.a.score(), self.b.score()) def max_quality(self): return max(self.a.max_quality(), self.b.max_quality()) def block_quality(self): return max(self.a.block_quality(), self.b.block_quality()) def skip_to_quality(self, minquality): a = self.a b = self.b # Short circuit if one matcher is inactive if not a.is_active(): sk = b.skip_to_quality(minquality) return sk elif not b.is_active(): return a.skip_to_quality(minquality) skipped = 0 aq = a.block_quality() bq = b.block_quality() while a.is_active() and b.is_active() and max(aq, bq) < minquality: if aq < minquality: skipped += a.skip_to_quality(minquality) aq = a.block_quality() if bq < minquality: skipped += b.skip_to_quality(minquality) bq = b.block_quality() return skipped
[docs]class IntersectionMatcher(AdditiveBiMatcher): """Matches the intersection (AND) of the postings in the two sub-matchers.""" def __init__(self, a, b): super().__init__(a, b) self._find_first() def reset(self): self.a.reset() self.b.reset() self._find_first() def _find_first(self): if self.a.is_active() and self.b.is_active() and self.a.id() != self.b.id(): self._find_next() def replace(self, minquality=0): a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() if not (a_active and b_active): # Intersection matcher requires that both sub-matchers be active return mcore.NullMatcher() if minquality: a_max = a.max_quality() b_max = b.max_quality() if a_max + b_max < minquality: # If the combined quality of the sub-matchers can't contribute, # return an inactive matcher return mcore.NullMatcher() # Require that the replacements be able to contribute results # higher than the minquality a_min = minquality - b_max b_min = minquality - a_max else: a_min = b_min = 0 a = a.replace(a_min) b = b.replace(b_min) a_active = a.is_active() b_active = b.is_active() if not (a_active or b_active): return mcore.NullMatcher() elif not a_active: return b elif not b_active: return a elif a is not self.a or b is not self.b: return self.__class__(a, b) else: return self def is_active(self): return self.a.is_active() and self.b.is_active() def _find_next(self): a = self.a b = self.b a_id = a.id() b_id = b.id() assert a_id != b_id r = False while a.is_active() and b.is_active() and a_id != b_id: if a_id < b_id: ra = a.skip_to(b_id) if not a.is_active(): return r = r or ra a_id = a.id() else: rb = b.skip_to(a_id) if not b.is_active(): return r = r or rb b_id = b.id() return r def id(self): return self.a.id() # Using sets is faster in some cases, but could potentially use a lot of # memory def all_ids(self): return iter(sorted(set(self.a.all_ids()) & set(self.b.all_ids()))) def skip_to(self, id): if not self.is_active(): raise mcore.ReadTooFar ra = self.a.skip_to(id) rb = self.b.skip_to(id) if self.is_active(): rn = False if self.a.id() != self.b.id(): rn = self._find_next() return ra or rb or rn def skip_to_quality(self, minquality): a = self.a b = self.b minquality = minquality skipped = 0 aq = a.block_quality() bq = b.block_quality() while a.is_active() and b.is_active() and aq + bq < minquality: if aq < bq: # If the block quality of A is less than B, skip A ahead until # it can contribute at least the balance of the required min # quality when added to B sk = a.skip_to_quality(minquality - bq) skipped += sk if not sk and a.is_active(): # The matcher couldn't skip ahead for some reason, so just # advance and try again a.next() else: # And vice-versa sk = b.skip_to_quality(minquality - aq) skipped += sk if not sk and b.is_active(): b.next() if not a.is_active() or not b.is_active(): # One of the matchers is exhausted break if a.id() != b.id(): # We want to always leave in a state where the matchers are at # the same document, so call _find_next() to sync them self._find_next() # Get the block qualities at the new matcher positions aq = a.block_quality() bq = b.block_quality() return skipped def next(self): if not self.is_active(): raise mcore.ReadTooFar # We must assume that the ids are equal whenever next() is called (they # should have been made equal by _find_next), so advance them both ar = self.a.next() if self.is_active(): nr = self._find_next() return ar or nr def spans(self): return sorted(set(self.a.spans()) | set(self.b.spans()))
[docs]class AndNotMatcher(BiMatcher): """Matches the postings in the first sub-matcher that are NOT present in the second sub-matcher. """ def __init__(self, a, b): super().__init__(a, b) self._find_first() def reset(self): self.a.reset() self.b.reset() self._find_first() def _find_first(self): if self.a.is_active() and self.b.is_active() and self.a.id() == self.b.id(): self._find_next() def is_active(self): return self.a.is_active() def _find_next(self): pos = self.a neg = self.b if not neg.is_active(): return pos_id = pos.id() r = False if neg.id() < pos_id: neg.skip_to(pos_id) while pos.is_active() and neg.is_active() and pos_id == neg.id(): nr = pos.next() if not pos.is_active(): break r = r or nr pos_id = pos.id() neg.skip_to(pos_id) return r def supports_block_quality(self): return self.a.supports_block_quality() def replace(self, minquality=0): if not self.a.is_active(): # The a matcher is required, so if it's inactive, return an # inactive matcher return mcore.NullMatcher() elif minquality and self.a.max_quality() < minquality: # If the quality of the required matcher isn't high enough to # contribute, return an inactive matcher return mcore.NullMatcher() elif not self.b.is_active(): # If the prohibited matcher is inactive, convert to just the # required matcher return self.a.replace(minquality) a = self.a.replace(minquality) b = self.b.replace() if a is not self.a or b is not self.b: # If one of the sub-matchers was replaced, return a new AndNot return self.__class__(a, b) else: return self def max_quality(self): return self.a.max_quality() def block_quality(self): return self.a.block_quality() def skip_to_quality(self, minquality): skipped = self.a.skip_to_quality(minquality) self._find_next() return skipped def id(self): return self.a.id() def next(self): if not self.a.is_active(): raise mcore.ReadTooFar ar = self.a.next() nr = False if self.a.is_active() and self.b.is_active(): nr = self._find_next() return ar or nr def skip_to(self, id): if not self.a.is_active(): raise mcore.ReadTooFar if id < self.a.id(): return self.a.skip_to(id) if self.b.is_active(): self.b.skip_to(id) self._find_next() def weight(self): return self.a.weight() def score(self): return self.a.score() def supports(self, astype): return self.a.supports(astype) def value(self): return self.a.value() def value_as(self, astype): return self.a.value_as(astype)
[docs]class AndMaybeMatcher(AdditiveBiMatcher): """Matches postings in the first sub-matcher, and if the same posting is in the second sub-matcher, adds their scores. """ def __init__(self, a, b): AdditiveBiMatcher.__init__(self, a, b) self._first_b() def reset(self): self.a.reset() self.b.reset() self._first_b() def _first_b(self): a = self.a b = self.b if a.is_active() and b.is_active() and a.id() != b.id(): b.skip_to(a.id()) def is_active(self): return self.a.is_active() def id(self): return self.a.id() def next(self): if not self.a.is_active(): raise mcore.ReadTooFar ar = self.a.next() br = False if self.a.is_active() and self.b.is_active(): br = self.b.skip_to(self.a.id()) return ar or br def skip_to(self, id): if not self.a.is_active(): raise mcore.ReadTooFar ra = self.a.skip_to(id) rb = False if self.a.is_active() and self.b.is_active(): rb = self.b.skip_to(id) return ra or rb def replace(self, minquality=0): a = self.a b = self.b a_active = a.is_active() b_active = b.is_active() if not a_active: return mcore.NullMatcher() elif minquality and b_active: if a.max_quality() + b.max_quality() < minquality: # If the combined max quality of the sub-matchers isn't high # enough to possibly contribute, return an inactive matcher return mcore.NullMatcher() elif a.max_quality() < minquality: # If the max quality of the main sub-matcher isn't high enough # to ever contribute without the optional sub- matcher, change # into an IntersectionMatcher return IntersectionMatcher(self.a, self.b) elif not b_active: return a.replace(minquality) new_a = a.replace(minquality - b.max_quality()) new_b = b.replace(minquality - a.max_quality()) if new_a is not a or new_b is not b: # If one of the sub-matchers changed, return a new AndMaybe return self.__class__(new_a, new_b) else: return self def skip_to_quality(self, minquality): a = self.a b = self.b minquality = minquality if not a.is_active(): raise mcore.ReadTooFar if not b.is_active(): return a.skip_to_quality(minquality) skipped = 0 aq = a.block_quality() bq = b.block_quality() while a.is_active() and b.is_active() and aq + bq < minquality: if aq < bq: skipped += a.skip_to_quality(minquality - bq) aq = a.block_quality() else: skipped += b.skip_to_quality(minquality - aq) bq = b.block_quality() return skipped def weight(self): if self.a.id() == self.b.id(): return self.a.weight() + self.b.weight() else: return self.a.weight() def score(self): if self.b.is_active() and self.a.id() == self.b.id(): return self.a.score() + self.b.score() else: return self.a.score() def supports(self, astype): return self.a.supports(astype) def value(self): return self.a.value() def value_as(self, astype): return self.a.value_as(astype)