# Copyright 2010 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.matching import mcore
[docs]class BiMatcher(mcore.Matcher):
"""Base class for matchers that combine the results of two sub-matchers in
some way.
"""
def __init__(self, a, b):
super().__init__()
self.a = a
self.b = b
def reset(self):
self.a.reset()
self.b.reset()
def __repr__(self):
return f"{self.__class__.__name__}({self.a!r}, {self.b!r})"
def children(self):
return [self.a, self.b]
def copy(self):
return self.__class__(self.a.copy(), self.b.copy())
def depth(self):
return 1 + max(self.a.depth(), self.b.depth())
def skip_to(self, id):
if not self.is_active():
raise mcore.ReadTooFar
ra = self.a.skip_to(id)
rb = self.b.skip_to(id)
return ra or rb
def supports_block_quality(self):
return self.a.supports_block_quality() and self.b.supports_block_quality()
def supports(self, astype):
return self.a.supports(astype) and self.b.supports(astype)
[docs]class AdditiveBiMatcher(BiMatcher):
"""Base class for binary matchers where the scores of the sub-matchers are
added together.
"""
def max_quality(self):
q = 0.0
if self.a.is_active():
q += self.a.max_quality()
if self.b.is_active():
q += self.b.max_quality()
return q
def block_quality(self):
bq = 0.0
if self.a.is_active():
bq += self.a.block_quality()
if self.b.is_active():
bq += self.b.block_quality()
return bq
def weight(self):
return self.a.weight() + self.b.weight()
def score(self):
return self.a.score() + self.b.score()
def __eq__(self, other):
return self.__class__ is type(other)
def __lt__(self, other):
return type(other) is self.__class__
def __ne__(self, other):
return not self.__eq__(other)
def __gt__(self, other):
return not (self.__lt__(other) or self.__eq__(other))
def __le__(self, other):
return self.__eq__(other) or self.__lt__(other)
def __ge__(self, other):
return self.__eq__(other) or self.__gt__(other)
[docs]class UnionMatcher(AdditiveBiMatcher):
"""Matches the union (OR) of the postings in the two sub-matchers."""
_id = None
def replace(self, minquality=0):
a = self.a
b = self.b
a_active = a.is_active()
b_active = b.is_active()
# If neither sub-matcher on its own has a high enough max quality to
# contribute, convert to an intersection matcher
if minquality and a_active and b_active:
a_max = a.max_quality()
b_max = b.max_quality()
if a_max < minquality and b_max < minquality:
return IntersectionMatcher(a, b).replace(minquality)
elif a_max < minquality:
return AndMaybeMatcher(b, a)
elif b_max < minquality:
return AndMaybeMatcher(a, b)
# If one or both of the sub-matchers are inactive, convert
if not (a_active or b_active):
return mcore.NullMatcher()
elif not a_active:
return b.replace(minquality)
elif not b_active:
return a.replace(minquality)
a = a.replace(minquality - b.max_quality() if minquality else 0)
b = b.replace(minquality - a.max_quality() if minquality else 0)
# If one of the sub-matchers changed, return a new union
if a is not self.a or b is not self.b:
return self.__class__(a, b)
else:
self._id = None
return self
def is_active(self):
return self.a.is_active() or self.b.is_active()
def skip_to(self, id):
self._id = None
ra = rb = False
if self.a.is_active():
ra = self.a.skip_to(id)
if self.b.is_active():
rb = self.b.skip_to(id)
return ra or rb
def id(self):
_id = self._id
if _id is not None:
return _id
a = self.a
b = self.b
if not a.is_active():
_id = b.id()
elif not b.is_active():
_id = a.id()
else:
_id = min(a.id(), b.id())
self._id = _id
return _id
# Using sets is faster in most cases, but could potentially use a lot of
# memory. Comment out this method override to not use sets.
# def all_ids(self):
# return iter(sorted(set(self.a.all_ids()) | set(self.b.all_ids())))
def next(self):
self._id = None
a = self.a
b = self.b
a_active = a.is_active()
b_active = b.is_active()
# Shortcut when one matcher is inactive
if not (a_active or b_active):
raise mcore.ReadTooFar
elif not a_active:
return b.next()
elif not b_active:
return a.next()
a_id = a.id()
b_id = b.id()
ar = br = None
# After all that, here's the actual implementation
if a_id <= b_id:
ar = a.next()
if b_id <= a_id:
br = b.next()
return ar or br
def spans(self):
if not self.a.is_active():
return self.b.spans()
if not self.b.is_active():
return self.a.spans()
id_a = self.a.id()
id_b = self.b.id()
if id_a < id_b:
return self.a.spans()
elif id_b < id_a:
return self.b.spans()
else:
return sorted(set(self.a.spans()) | set(self.b.spans()))
def weight(self):
a = self.a
b = self.b
if not a.is_active():
return b.weight()
if not b.is_active():
return a.weight()
id_a = a.id()
id_b = b.id()
if id_a < id_b:
return a.weight()
elif id_b < id_a:
return b.weight()
else:
return a.weight() + b.weight()
def score(self):
a = self.a
b = self.b
if not a.is_active():
return b.score()
if not b.is_active():
return a.score()
id_a = a.id()
id_b = b.id()
if id_a < id_b:
return a.score()
elif id_b < id_a:
return b.score()
else:
return a.score() + b.score()
def skip_to_quality(self, minquality):
self._id = None
a = self.a
b = self.b
if not (a.is_active() or b.is_active()):
raise mcore.ReadTooFar
# Short circuit if one matcher is inactive
if not a.is_active():
return b.skip_to_quality(minquality)
elif not b.is_active():
return a.skip_to_quality(minquality)
skipped = 0
aq = a.block_quality()
bq = b.block_quality()
while a.is_active() and b.is_active() and aq + bq < minquality:
if aq < bq:
skipped += a.skip_to_quality(minquality - bq)
aq = a.block_quality()
else:
skipped += b.skip_to_quality(minquality - aq)
bq = b.block_quality()
return skipped
[docs]class DisjunctionMaxMatcher(UnionMatcher):
"""Matches the union (OR) of two sub-matchers. Where both sub-matchers
match the same posting, returns the weight/score of the higher-scoring
posting.
"""
# TODO: this class inherits from AdditiveBiMatcher (through UnionMatcher)
# but it does not add the scores of the sub-matchers together (it
# overrides all methods that perform addition). Need to clean up the
# inheritance.
def __init__(self, a, b, tiebreak=0.0):
super().__init__(a, b)
self.tiebreak = tiebreak
def copy(self):
return self.__class__(self.a.copy(), self.b.copy(), tiebreak=self.tiebreak)
def replace(self, minquality=0):
a = self.a
b = self.b
a_active = a.is_active()
b_active = b.is_active()
# DisMax takes the max of the sub-matcher qualities instead of adding
# them, so we need special logic here
if minquality and a_active and b_active:
a_max = a.max_quality()
b_max = b.max_quality()
if a_max < minquality and b_max < minquality:
# If neither sub-matcher has a high enough max quality to
# contribute, return an inactive matcher
return mcore.NullMatcher()
elif b_max < minquality:
# If the b matcher can't contribute, return a
return a.replace(minquality)
elif a_max < minquality:
# If the a matcher can't contribute, return b
return b.replace(minquality)
if not (a_active or b_active):
return mcore.NullMatcher()
elif not a_active:
return b.replace(minquality)
elif not b_active:
return a.replace(minquality)
# We CAN pass the minquality down here, since we don't add the two
# scores together
a = a.replace(minquality)
b = b.replace(minquality)
a_active = a.is_active()
b_active = b.is_active()
# It's kind of tedious to check for inactive sub-matchers all over
# again here after we replace them, but it's probably better than
# returning a replacement with an inactive sub-matcher
if not (a_active and b_active):
return mcore.NullMatcher()
elif not a_active:
return b
elif not b_active:
return a
elif a is not self.a or b is not self.b:
# If one of the sub-matchers changed, return a new DisMax
return self.__class__(a, b)
else:
return self
def score(self):
if not self.a.is_active():
return self.b.score()
elif not self.b.is_active():
return self.a.score()
else:
return max(self.a.score(), self.b.score())
def max_quality(self):
return max(self.a.max_quality(), self.b.max_quality())
def block_quality(self):
return max(self.a.block_quality(), self.b.block_quality())
def skip_to_quality(self, minquality):
a = self.a
b = self.b
# Short circuit if one matcher is inactive
if not a.is_active():
sk = b.skip_to_quality(minquality)
return sk
elif not b.is_active():
return a.skip_to_quality(minquality)
skipped = 0
aq = a.block_quality()
bq = b.block_quality()
while a.is_active() and b.is_active() and max(aq, bq) < minquality:
if aq < minquality:
skipped += a.skip_to_quality(minquality)
aq = a.block_quality()
if bq < minquality:
skipped += b.skip_to_quality(minquality)
bq = b.block_quality()
return skipped
[docs]class IntersectionMatcher(AdditiveBiMatcher):
"""Matches the intersection (AND) of the postings in the two sub-matchers."""
def __init__(self, a, b):
super().__init__(a, b)
self._find_first()
def reset(self):
self.a.reset()
self.b.reset()
self._find_first()
def _find_first(self):
if self.a.is_active() and self.b.is_active() and self.a.id() != self.b.id():
self._find_next()
def replace(self, minquality=0):
a = self.a
b = self.b
a_active = a.is_active()
b_active = b.is_active()
if not (a_active and b_active):
# Intersection matcher requires that both sub-matchers be active
return mcore.NullMatcher()
if minquality:
a_max = a.max_quality()
b_max = b.max_quality()
if a_max + b_max < minquality:
# If the combined quality of the sub-matchers can't contribute,
# return an inactive matcher
return mcore.NullMatcher()
# Require that the replacements be able to contribute results
# higher than the minquality
a_min = minquality - b_max
b_min = minquality - a_max
else:
a_min = b_min = 0
a = a.replace(a_min)
b = b.replace(b_min)
a_active = a.is_active()
b_active = b.is_active()
if not (a_active or b_active):
return mcore.NullMatcher()
elif not a_active:
return b
elif not b_active:
return a
elif a is not self.a or b is not self.b:
return self.__class__(a, b)
else:
return self
def is_active(self):
return self.a.is_active() and self.b.is_active()
def _find_next(self):
a = self.a
b = self.b
a_id = a.id()
b_id = b.id()
assert a_id != b_id
r = False
while a.is_active() and b.is_active() and a_id != b_id:
if a_id < b_id:
ra = a.skip_to(b_id)
if not a.is_active():
return
r = r or ra
a_id = a.id()
else:
rb = b.skip_to(a_id)
if not b.is_active():
return
r = r or rb
b_id = b.id()
return r
def id(self):
return self.a.id()
# Using sets is faster in some cases, but could potentially use a lot of
# memory
def all_ids(self):
return iter(sorted(set(self.a.all_ids()) & set(self.b.all_ids())))
def skip_to(self, id):
if not self.is_active():
raise mcore.ReadTooFar
ra = self.a.skip_to(id)
rb = self.b.skip_to(id)
if self.is_active():
rn = False
if self.a.id() != self.b.id():
rn = self._find_next()
return ra or rb or rn
def skip_to_quality(self, minquality):
a = self.a
b = self.b
minquality = minquality
skipped = 0
aq = a.block_quality()
bq = b.block_quality()
while a.is_active() and b.is_active() and aq + bq < minquality:
if aq < bq:
# If the block quality of A is less than B, skip A ahead until
# it can contribute at least the balance of the required min
# quality when added to B
sk = a.skip_to_quality(minquality - bq)
skipped += sk
if not sk and a.is_active():
# The matcher couldn't skip ahead for some reason, so just
# advance and try again
a.next()
else:
# And vice-versa
sk = b.skip_to_quality(minquality - aq)
skipped += sk
if not sk and b.is_active():
b.next()
if not a.is_active() or not b.is_active():
# One of the matchers is exhausted
break
if a.id() != b.id():
# We want to always leave in a state where the matchers are at
# the same document, so call _find_next() to sync them
self._find_next()
# Get the block qualities at the new matcher positions
aq = a.block_quality()
bq = b.block_quality()
return skipped
def next(self):
if not self.is_active():
raise mcore.ReadTooFar
# We must assume that the ids are equal whenever next() is called (they
# should have been made equal by _find_next), so advance them both
ar = self.a.next()
if self.is_active():
nr = self._find_next()
return ar or nr
def spans(self):
return sorted(set(self.a.spans()) | set(self.b.spans()))
[docs]class AndNotMatcher(BiMatcher):
"""Matches the postings in the first sub-matcher that are NOT present in
the second sub-matcher.
"""
def __init__(self, a, b):
super().__init__(a, b)
self._find_first()
def reset(self):
self.a.reset()
self.b.reset()
self._find_first()
def _find_first(self):
if self.a.is_active() and self.b.is_active() and self.a.id() == self.b.id():
self._find_next()
def is_active(self):
return self.a.is_active()
def _find_next(self):
pos = self.a
neg = self.b
if not neg.is_active():
return
pos_id = pos.id()
r = False
if neg.id() < pos_id:
neg.skip_to(pos_id)
while pos.is_active() and neg.is_active() and pos_id == neg.id():
nr = pos.next()
if not pos.is_active():
break
r = r or nr
pos_id = pos.id()
neg.skip_to(pos_id)
return r
def supports_block_quality(self):
return self.a.supports_block_quality()
def replace(self, minquality=0):
if not self.a.is_active():
# The a matcher is required, so if it's inactive, return an
# inactive matcher
return mcore.NullMatcher()
elif minquality and self.a.max_quality() < minquality:
# If the quality of the required matcher isn't high enough to
# contribute, return an inactive matcher
return mcore.NullMatcher()
elif not self.b.is_active():
# If the prohibited matcher is inactive, convert to just the
# required matcher
return self.a.replace(minquality)
a = self.a.replace(minquality)
b = self.b.replace()
if a is not self.a or b is not self.b:
# If one of the sub-matchers was replaced, return a new AndNot
return self.__class__(a, b)
else:
return self
def max_quality(self):
return self.a.max_quality()
def block_quality(self):
return self.a.block_quality()
def skip_to_quality(self, minquality):
skipped = self.a.skip_to_quality(minquality)
self._find_next()
return skipped
def id(self):
return self.a.id()
def next(self):
if not self.a.is_active():
raise mcore.ReadTooFar
ar = self.a.next()
nr = False
if self.a.is_active() and self.b.is_active():
nr = self._find_next()
return ar or nr
def skip_to(self, id):
if not self.a.is_active():
raise mcore.ReadTooFar
if id < self.a.id():
return
self.a.skip_to(id)
if self.b.is_active():
self.b.skip_to(id)
self._find_next()
def weight(self):
return self.a.weight()
def score(self):
return self.a.score()
def supports(self, astype):
return self.a.supports(astype)
def value(self):
return self.a.value()
def value_as(self, astype):
return self.a.value_as(astype)
[docs]class AndMaybeMatcher(AdditiveBiMatcher):
"""Matches postings in the first sub-matcher, and if the same posting is
in the second sub-matcher, adds their scores.
"""
def __init__(self, a, b):
AdditiveBiMatcher.__init__(self, a, b)
self._first_b()
def reset(self):
self.a.reset()
self.b.reset()
self._first_b()
def _first_b(self):
a = self.a
b = self.b
if a.is_active() and b.is_active() and a.id() != b.id():
b.skip_to(a.id())
def is_active(self):
return self.a.is_active()
def id(self):
return self.a.id()
def next(self):
if not self.a.is_active():
raise mcore.ReadTooFar
ar = self.a.next()
br = False
if self.a.is_active() and self.b.is_active():
br = self.b.skip_to(self.a.id())
return ar or br
def skip_to(self, id):
if not self.a.is_active():
raise mcore.ReadTooFar
ra = self.a.skip_to(id)
rb = False
if self.a.is_active() and self.b.is_active():
rb = self.b.skip_to(id)
return ra or rb
def replace(self, minquality=0):
a = self.a
b = self.b
a_active = a.is_active()
b_active = b.is_active()
if not a_active:
return mcore.NullMatcher()
elif minquality and b_active:
if a.max_quality() + b.max_quality() < minquality:
# If the combined max quality of the sub-matchers isn't high
# enough to possibly contribute, return an inactive matcher
return mcore.NullMatcher()
elif a.max_quality() < minquality:
# If the max quality of the main sub-matcher isn't high enough
# to ever contribute without the optional sub- matcher, change
# into an IntersectionMatcher
return IntersectionMatcher(self.a, self.b)
elif not b_active:
return a.replace(minquality)
new_a = a.replace(minquality - b.max_quality())
new_b = b.replace(minquality - a.max_quality())
if new_a is not a or new_b is not b:
# If one of the sub-matchers changed, return a new AndMaybe
return self.__class__(new_a, new_b)
else:
return self
def skip_to_quality(self, minquality):
a = self.a
b = self.b
minquality = minquality
if not a.is_active():
raise mcore.ReadTooFar
if not b.is_active():
return a.skip_to_quality(minquality)
skipped = 0
aq = a.block_quality()
bq = b.block_quality()
while a.is_active() and b.is_active() and aq + bq < minquality:
if aq < bq:
skipped += a.skip_to_quality(minquality - bq)
aq = a.block_quality()
else:
skipped += b.skip_to_quality(minquality - aq)
bq = b.block_quality()
return skipped
def weight(self):
if self.a.id() == self.b.id():
return self.a.weight() + self.b.weight()
else:
return self.a.weight()
def score(self):
if self.b.is_active() and self.a.id() == self.b.id():
return self.a.score() + self.b.score()
else:
return self.a.score()
def supports(self, astype):
return self.a.supports(astype)
def value(self):
return self.a.value()
def value_as(self, astype):
return self.a.value_as(astype)