# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh.query import compound, qcore, terms, wrappers
from whoosh.util.times import datetime_to_long
class RangeMixin:
    """Behavior shared by :class:`TermRange` and :class:`NumericRange`.

    The host class is expected to set the attributes ``fieldname``,
    ``start``, ``end``, ``startexcl``, ``endexcl``, ``boost`` and
    ``constantscore``; the methods here only read them.
    """

    def __repr__(self):
        return "{}({!r}, {!r}, {!r}, {}, {}, boost={}, constantscore={})".format(
            self.__class__.__name__,
            self.fieldname,
            self.start,
            self.end,
            self.startexcl,
            self.endexcl,
            self.boost,
            self.constantscore,
        )

    def __str__(self):
        # Curly brackets mark an exclusive bound, square brackets an
        # inclusive one; an open (None) bound is rendered as an empty string.
        startchar = "{" if self.startexcl else "["
        endchar = "}" if self.endexcl else "]"
        start = "" if self.start is None else self.start
        end = "" if self.end is None else self.end
        return f"{self.fieldname}:{startchar}{start} TO {end}{endchar}"

    def __eq__(self, other):
        """Return True if ``other`` is the exact same class and all range
        attributes are equal.

        Always returns a real ``bool``. The class check comes first so that
        ``None`` or any other foreign object compares unequal without its
        truthiness being evaluated.
        """
        if self.__class__ is not other.__class__:
            return False
        return bool(
            self.fieldname == other.fieldname
            and self.start == other.start
            and self.end == other.end
            and self.startexcl == other.startexcl
            and self.endexcl == other.endexcl
            and self.boost == other.boost
            and self.constantscore == other.constantscore
        )

    def __hash__(self):
        # ``constantscore`` is not mixed in. That is still consistent with
        # __eq__: objects that compare equal always produce the same hash.
        return (
            hash(self.fieldname)
            ^ hash(self.start)
            ^ hash(self.startexcl)
            ^ hash(self.end)
            ^ hash(self.endexcl)
            ^ hash(self.boost)
        )

    def is_range(self):
        """Return True; objects using this mixin are range queries."""
        return True

    def _comparable_start(self):
        """Return the start bound as a ``(value, tiebreak)`` tuple.

        An exclusive start gets tiebreak ``1`` so that it sorts after an
        inclusive start with the same value. An open start is represented by
        ``qcore.Lowest`` (presumably a sentinel that compares below every
        value -- defined outside this file).
        """
        if self.start is None:
            return (qcore.Lowest, 0)
        else:
            second = 1 if self.startexcl else 0
            return (self.start, second)

    def _comparable_end(self):
        """Return the end bound as a ``(value, tiebreak)`` tuple.

        An exclusive end gets tiebreak ``-1`` so that it sorts before an
        inclusive end with the same value. An open end is represented by
        ``qcore.Highest`` (presumably a sentinel that compares above every
        value -- defined outside this file).
        """
        if self.end is None:
            return (qcore.Highest, 0)
        else:
            second = -1 if self.endexcl else 0
            return (self.end, second)

    def overlaps(self, other):
        """Return True if ``other`` is a ``TermRange`` on the same field
        whose range shares at least one bound position with this one.

        NOTE(review): the type check is against ``TermRange`` specifically,
        so a ``NumericRange`` never overlaps another ``NumericRange``.
        Confirm this is intended before widening the check.
        """
        if not isinstance(other, TermRange):
            return False
        if self.fieldname != other.fieldname:
            return False

        start1 = self._comparable_start()
        start2 = other._comparable_start()
        end1 = self._comparable_end()
        end2 = other._comparable_end()

        # The ranges overlap if either bound of one lies inside the other.
        return (
            (start1 >= start2 and start1 <= end2)
            or (end1 >= start2 and end1 <= end2)
            or (start2 >= start1 and start2 <= end1)
            or (end2 >= start1 and end2 <= end1)
        )

    def merge(self, other, intersect=True):
        """Combine this range with ``other`` into a new range object.

        :param other: Another range object on the same field.
        :param intersect: When the ranges only partially overlap, True
            produces the intersection and False produces the union.
        :returns: A new instance of ``self.__class__`` with the merged
            bounds, the larger of the two boosts, and ``constantscore`` set
            if either input has it set.
        """
        assert self.fieldname == other.fieldname

        start1 = self._comparable_start()
        start2 = other._comparable_start()
        end1 = self._comparable_end()
        end2 = other._comparable_end()

        # NOTE(review): when one range fully contains the other, the
        # containing (wider) range is returned regardless of ``intersect``.
        if start1 >= start2 and end1 <= end2:
            start = start2
            end = end2
        elif start2 >= start1 and end2 <= end1:
            start = start1
            end = end1
        elif intersect:
            start = max(start1, start2)
            end = min(end1, end2)
        else:
            start = min(start1, start2)
            end = max(end1, end2)

        # Translate the comparable tuples back into constructor arguments.
        startval = None if start[0] is qcore.Lowest else start[0]
        startexcl = start[1] == 1
        endval = None if end[0] is qcore.Highest else end[0]
        endexcl = end[1] == -1

        boost = max(self.boost, other.boost)
        constantscore = self.constantscore or other.constantscore

        return self.__class__(
            self.fieldname,
            startval,
            endval,
            startexcl,
            endexcl,
            boost=boost,
            constantscore=constantscore,
        )
class TermRange(RangeMixin, terms.MultiTerm):
    """Matches documents containing any terms in a given range.

    >>> # Match documents where the indexed "id" field is greater than or equal
    >>> # to 'apple' and less than or equal to 'pear'.
    >>> TermRange("id", u"apple", u"pear")
    """

    def __init__(
        self,
        fieldname,
        start,
        end,
        startexcl=False,
        endexcl=False,
        boost=1.0,
        constantscore=True,
    ):
        """
        :param fieldname: The name of the field to search.
        :param start: Match terms equal to or greater than this.
        :param end: Match terms equal to or less than this.
        :param startexcl: If True, the range start is exclusive. If False, the
            range start is inclusive.
        :param endexcl: If True, the range end is exclusive. If False, the
            range end is inclusive.
        :param boost: Boost factor that should be applied to the raw score of
            results matched by this query.
        :param constantscore: Stored on the instance as ``constantscore``;
            presumably selects constant scoring in the ``MultiTerm`` base
            class (not visible here -- confirm).
        """

        self.fieldname = fieldname
        self.start = start
        self.end = end
        self.startexcl = startexcl
        self.endexcl = endexcl
        self.boost = boost
        self.constantscore = constantscore

    def normalize(self):
        """Return the simplest query equivalent to this range.

        * An unbounded range becomes ``Every`` on the field.
        * A range whose start equals its end becomes a single ``Term``, or
          ``qcore.NullQuery`` if either bound is exclusive.
        * Anything else becomes a fresh ``TermRange`` with the same settings.
        """
        if self.start in ("", None) and self.end in ("\uffff", None):
            # Imported here rather than at module level (presumably to avoid
            # a circular import -- confirm).
            from whoosh.query import Every

            return Every(self.fieldname, boost=self.boost)
        elif self.start == self.end:
            if self.startexcl or self.endexcl:
                return qcore.NullQuery
            return terms.Term(self.fieldname, self.start, boost=self.boost)
        else:
            # Pass constantscore through so that normalizing does not
            # silently reset a constantscore=False query to the default.
            return TermRange(
                self.fieldname,
                self.start,
                self.end,
                self.startexcl,
                self.endexcl,
                boost=self.boost,
                constantscore=self.constantscore,
            )

    # def replace(self, fieldname, oldtext, newtext):
    #    q = self.copy()
    #    if q.fieldname == fieldname:
    #        if q.start == oldtext:
    #            q.start = newtext
    #        if q.end == oldtext:
    #            q.end = newtext
    #    return q

    def _btexts(self, ixreader):
        """Yield the byte-string terms of this field that fall in the range.

        Yields nothing if either bound cannot be converted to bytes by the
        field (``to_bytes`` raises ``ValueError``).

        :param ixreader: Reader providing ``schema`` and ``terms_from()``.
        """
        fieldname = self.fieldname
        field = ixreader.schema[fieldname]
        startexcl = self.startexcl
        endexcl = self.endexcl

        if self.start is None:
            start = b""
        else:
            try:
                start = field.to_bytes(self.start)
            except ValueError:
                return

        if self.end is None:
            # Used as the upper sentinel for an open-ended range.
            end = b"\xFF\xFF\xFF\xFF"
        else:
            try:
                end = field.to_bytes(self.end)
            except ValueError:
                return

        for fname, t in ixreader.terms_from(fieldname, start):
            if fname != fieldname:
                # Stop once the iteration leaves this field.
                break
            if t == start and startexcl:
                continue
            if t == end and endexcl:
                break
            if t > end:
                break
            yield t
class NumericRange(RangeMixin, qcore.Query):
    """
    A range query for NUMERIC fields. Takes advantage of tiered indexing
    to speed up large ranges by matching at a high resolution at the edges of
    the range and a low resolution in the middle.

    Example Usage:
        # Match numbers from 10 to 5925 in the "number" field.
        nr = NumericRange("number", 10, 5925)

    Methods:
        simplify(self, ixreader):
            Compiles the range query and calls ``simplify`` on the result.
        estimate_size(self, ixreader):
            Compiles the range query and calls ``estimate_size`` on the
            result.
        estimate_min_size(self, ixreader):
            Compiles the range query and calls ``estimate_min_size`` on the
            result.
        docs(self, searcher):
            Compiles the range query and calls ``docs`` on the result.
        _compile_query(self, ixreader):
            Prepares the start and end values, generates subqueries for the
            different resolutions, and combines them into a single query.
        matcher(self, searcher, context=None):
            Compiles the range query and calls ``matcher`` on the result.

    Fields:
        fieldname: The name of the field to search.
        start: Match terms equal to or greater than this number. This should
            be a number type, not a string.
        end: Match terms equal to or less than this number. This should be a
            number type, not a string.
        startexcl: If True, the range start is exclusive. If False, the range
            start is inclusive.
        endexcl: If True, the range end is exclusive. If False, the range end
            is inclusive.
        boost: Boost factor that should be applied to the raw score of
            results matched by this query.
        constantscore: If True, the compiled query returns a constant score
            (the value of the boost keyword argument) instead of actually
            scoring the matched terms. This gives a nice speed boost and
            won't affect the results in most cases since numeric ranges will
            almost always be used as a filter.
    """

    def __init__(
        self,
        fieldname,
        start,
        end,
        startexcl=False,
        endexcl=False,
        boost=1.0,
        constantscore=True,
    ):
        """
        :param fieldname: The name of the field to search.
        :param start: Match terms equal to or greater than this number. This
            should be a number type, not a string.
        :param end: Match terms equal to or less than this number. This should
            be a number type, not a string.
        :param startexcl: If True, the range start is exclusive. If False, the
            range start is inclusive.
        :param endexcl: If True, the range end is exclusive. If False, the
            range end is inclusive.
        :param boost: Boost factor that should be applied to the raw score of
            results matched by this query.
        :param constantscore: If True, the compiled query returns a constant
            score (the value of the ``boost`` keyword argument) instead of
            actually scoring the matched terms. This gives a nice speed boost
            and won't affect the results in most cases since numeric ranges
            will almost always be used as a filter.
        """

        self.fieldname = fieldname
        self.start = start
        self.end = end
        self.startexcl = startexcl
        self.endexcl = endexcl
        self.boost = boost
        self.constantscore = constantscore

        # NumericRange should raise an error if the start and end parameters
        # are not numeric. Some of the old tests fail if this is enabled. We
        # need to confirm if this is a bug or not.
        # if not isinstance(self.start, (int, float)):
        #     raise ValueError("NumericRange: start parameter must be numeric")
        # if not isinstance(self.end, (int, float)):
        #     raise ValueError("NumericRange: end parameter must be numeric")

    def simplify(self, ixreader):
        """Compile against ``ixreader`` and simplify the compiled query."""
        return self._compile_query(ixreader).simplify(ixreader)

    def estimate_size(self, ixreader):
        """Compile against ``ixreader`` and return the compiled query's size
        estimate."""
        return self._compile_query(ixreader).estimate_size(ixreader)

    def estimate_min_size(self, ixreader):
        """Compile against ``ixreader`` and return the compiled query's
        minimum size estimate."""
        return self._compile_query(ixreader).estimate_min_size(ixreader)

    def docs(self, searcher):
        """Compile against the searcher's reader and return the compiled
        query's ``docs(searcher)`` result."""
        q = self._compile_query(searcher.reader())
        return q.docs(searcher)

    def _compile_query(self, ixreader):
        """Translate this numeric range into term-level queries.

        :param ixreader: Reader whose schema holds this query's field.
        :returns: A single ``Term``/``TermRange``, an ``Or`` of them, or
            ``qcore.NullQuery`` when no tiered ranges are produced. The
            result is wrapped in ``ConstantScoreQuery`` if
            ``self.constantscore`` is set.
        :raises ValueError: If the field is not a ``NUMERIC`` field.
        """
        # Imported here rather than at module level (presumably to avoid
        # circular imports -- confirm).
        from whoosh.fields import NUMERIC
        from whoosh.util.numeric import tiered_ranges

        field = ixreader.schema[self.fieldname]
        if not isinstance(field, NUMERIC):
            raise ValueError(f"NumericRange: field {self.fieldname} is not numeric")

        start = self.start
        if start is not None:
            start = field.prepare_number(start)
        end = self.end
        if end is not None:
            end = field.prepare_number(end)

        subqueries = []
        stb = field.sortable_to_bytes
        # Get the term ranges for the different resolutions
        ranges = tiered_ranges(
            field.numtype,
            field.bits,
            field.signed,
            start,
            end,
            field.shift_step,
            self.startexcl,
            self.endexcl,
        )
        for startnum, endnum, shift in ranges:
            if startnum == endnum:
                # A one-value range needs only a single term.
                subq = terms.Term(self.fieldname, stb(startnum, shift))
            else:
                startbytes = stb(startnum, shift)
                endbytes = stb(endnum, shift)
                subq = TermRange(self.fieldname, startbytes, endbytes)
            subqueries.append(subq)

        # NOTE(review): a lone subquery is used as-is, so ``self.boost`` only
        # reaches it through the ConstantScoreQuery wrapper below. Confirm
        # that dropping the boost when constantscore is False is intended.
        if len(subqueries) == 1:
            q = subqueries[0]
        elif subqueries:
            q = compound.Or(subqueries, boost=self.boost)
        else:
            return qcore.NullQuery

        if self.constantscore:
            q = wrappers.ConstantScoreQuery(q, self.boost)
        return q

    def matcher(self, searcher, context=None):
        """Compile against the searcher's reader and return the compiled
        query's matcher."""
        q = self._compile_query(searcher.reader())
        return q.matcher(searcher, context)
class DateRange(NumericRange):
    """This is a very thin subclass of :class:`NumericRange` that only
    overrides the initializer and ``__repr__()`` methods to work with datetime
    objects instead of numbers. Internally this object converts the datetime
    objects it's created with to numbers and otherwise acts like a
    ``NumericRange`` query.

    >>> DateRange("date", datetime(2010, 11, 3, 3, 0),
    ...           datetime(2010, 11, 3, 17, 59))
    """

    def __init__(
        self,
        fieldname,
        start,
        end,
        startexcl=False,
        endexcl=False,
        boost=1.0,
        constantscore=True,
    ):
        """
        :param fieldname: The name of the field to search.
        :param start: Datetime for the start of the range, or a false value
            such as ``None``, which is passed through unconverted.
        :param end: Datetime for the end of the range, or a false value such
            as ``None``, which is passed through unconverted.
        :param startexcl: If True, the range start is exclusive.
        :param endexcl: If True, the range end is exclusive.
        :param boost: Passed through to ``NumericRange``.
        :param constantscore: Passed through to ``NumericRange``.
        """
        # Keep the original datetime objects so __repr__ can show them.
        self.startdate = start
        self.enddate = end

        # Only truthy bounds are converted with datetime_to_long.
        if start:
            start = datetime_to_long(start)
        if end:
            end = datetime_to_long(end)

        super().__init__(
            fieldname,
            start,
            end,
            startexcl=startexcl,
            endexcl=endexcl,
            boost=boost,
            constantscore=constantscore,
        )

    def __repr__(self):
        # Shows the original datetime bounds rather than the converted
        # numbers; unlike RangeMixin.__repr__, constantscore is not included.
        return "{}({!r}, {!r}, {!r}, {}, {}, boost={})".format(
            self.__class__.__name__,
            self.fieldname,
            self.startdate,
            self.enddate,
            self.startexcl,
            self.endexcl,
            self.boost,
        )