# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
from whoosh import matching
from whoosh.query import qcore
from whoosh.query.wrappers import WrappingQuery


class NestedParent(WrappingQuery):
"""A query that allows you to search for "nested" documents, where you can
index (possibly multiple levels of) "parent" and "child" documents using
the :meth:`~whoosh.writing.IndexWriter.group` and/or
:meth:`~whoosh.writing.IndexWriter.start_group` methods of a
:class:`whoosh.writing.IndexWriter` to indicate that hierarchically related
documents should be kept together::
schema = fields.Schema(type=fields.ID, text=fields.TEXT(stored=True))
with ix.writer() as w:
# Say we're indexing chapters (type=chap) and each chapter has a
# number of paragraphs (type=p)
with w.group():
w.add_document(type="chap", text="Chapter 1")
w.add_document(type="p", text="Able baker")
w.add_document(type="p", text="Bright morning")
with w.group():
w.add_document(type="chap", text="Chapter 2")
w.add_document(type="p", text="Car trip")
w.add_document(type="p", text="Dog eared")
w.add_document(type="p", text="Every day")
with w.group():
w.add_document(type="chap", text="Chapter 3")
w.add_document(type="p", text="Fine day")
The ``NestedParent`` query wraps two sub-queries: the "parent query"
matches a class of "parent documents". The "sub query" matches nested
documents you want to find. For each "sub document" the "sub query" finds,
this query acts as if it found the corresponding "parent document".
    >>> with ix.searcher() as s:
    ...     r = s.search(query.NestedParent(query.Term("type", "chap"),
    ...                                     query.Term("text", "day")))
    ...     for hit in r:
    ...         print(hit["text"])
    ...
    Chapter 2
    Chapter 3
    """
    def __init__(self, parents, subq, per_parent_limit=None, score_fn=sum):
        """
        :param parents: a query, DocIdSet object, or Results object
            representing the documents you want to use as the "parent"
            documents. Where the sub-query matches, the corresponding document
            in these results will be returned as the match.
        :param subq: a query matching the information you want to find.
        :param per_parent_limit: a maximum number of "sub documents" to search
            per parent. The default is None, meaning no limit.
        :param score_fn: a function to use to combine the scores of matching
            sub-documents to calculate the score returned for the parent
            document. The default is ``sum``, that is, add up the scores of the
            sub-documents.
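
        For example, a sketch of scoring each chapter by its best-matching
        paragraph instead of the sum of its paragraphs (``chapter_q`` and
        ``text_q`` are placeholder queries, not part of this module)::

            chapter_q = query.Term("type", "chap")
            text_q = query.Term("text", "day")
            q = NestedParent(chapter_q, text_q, score_fn=max)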
"""
self.parents = parents
self.child = subq
self.per_parent_limit = per_parent_limit
self.score_fn = score_fn
    def normalize(self):
        p = self.parents
        if isinstance(p, qcore.Query):
            p = p.normalize()
        q = self.child.normalize()
        if p is qcore.NullQuery or q is qcore.NullQuery:
            return qcore.NullQuery
        return self.__class__(p, q)

    def requires(self):
        return self.child.requires()
    def matcher(self, searcher, context=None):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher
        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedParentMatcher(
            bits, m, self.per_parent_limit, searcher.doc_count_all(), self.score_fn
        )
    def deletion_docs(self, searcher):
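        # Yield the doc numbers of every document in each group (the parent
        # and all docs up to the next parent) that contains at least one
        # matching child, presumably so a delete-by-query removes whole
        # groups at once rather than individual children.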
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return
        m = self.child.matcher(searcher, searcher.boolean_context())
        maxdoc = searcher.doc_count_all()
        while m.is_active():
            docnum = m.id()
            parentdoc = bits.before(docnum + 1)
            nextparent = bits.after(docnum) or maxdoc
            yield from range(parentdoc, nextparent)
            m.skip_to(nextparent)
    class NestedParentMatcher(matching.Matcher):
        def __init__(self, comb, child, per_parent_limit, maxdoc, score_fn):
            self.comb = comb
            self.child = child
            self.per_parent_limit = per_parent_limit
            self.maxdoc = maxdoc
            self.score_fn = score_fn
            self._nextdoc = None
            if self.child.is_active():
                self._gather()

        def is_active(self):
            return self._nextdoc is not None

        def supports_block_quality(self):
            return False
        def _gather(self):
            # This is where the magic happens ;)
            child = self.child
            pplimit = self.per_parent_limit

            # The next document returned by this matcher is the parent of the
            # child's current document. We don't have to worry about whether
            # the parent is deleted, because the query that gave us the parents
            # wouldn't return deleted documents.
            self._nextdoc = self.comb.before(child.id() + 1)
            # The next parent after the child matcher's current document
            nextparent = self.comb.after(child.id()) or self.maxdoc

            # Combine the scores of all matching documents under the parent
            # using score_fn (sum by default)
            count = 1
            scores = []
            while child.is_active() and child.id() < nextparent:
                if pplimit and count > pplimit:
                    child.skip_to(nextparent)
                    break
                scores.append(child.score())
                child.next()
                count += 1

            score = self.score_fn(scores) if scores else 0
            self._nextscore = score
        def id(self):
            return self._nextdoc

        def score(self):
            return self._nextscore

        def reset(self):
            self.child.reset()
            self._gather()

        def next(self):
            if self.child.is_active():
                self._gather()
            else:
                if self._nextdoc is None:
                    raise matching.ReadTooFar
                else:
                    self._nextdoc = None
        def skip_to(self, id):
            self.child.skip_to(id)
            if self.child.is_active():
                self._gather()
            else:
                # The skip exhausted the child matcher, so deactivate this
                # matcher instead of gathering (mirrors next() above)
                self._nextdoc = None
        def value(self):
            raise NotImplementedError(self.__class__)

        def spans(self):
            return []


class NestedChildren(WrappingQuery):
"""This is the reverse of a :class:`NestedParent` query: instead of taking
a query that matches children but returns the parent, this query matches
parents but returns the children.
This is useful, for example, to search for an album title and return the
songs in the album::
schema = fields.Schema(type=fields.ID(stored=True),
album_name=fields.TEXT(stored=True),
track_num=fields.NUMERIC(stored=True),
track_name=fields.TEXT(stored=True),
lyrics=fields.TEXT)
ix = RamStorage().create_index(schema)
# Indexing
with ix.writer() as w:
# For each album, index a "group" of a parent "album" document and
# multiple child "track" documents.
with w.group():
w.add_document(type="album",
artist="The Cure", album_name="Disintegration")
w.add_document(type="track", track_num=1,
track_name="Plainsong")
w.add_document(type="track", track_num=2,
track_name="Pictures of You")
# ...
# ...
# Find songs where the song name has "heaven" in the title and the
# album the song is on has "hell" in the title
qp = QueryParser("lyrics", ix.schema)
with ix.searcher() as s:
# A query that matches all parents
all_albums = qp.parse("type:album")
# A query that matches the parents we want
albums_with_hell = qp.parse("album_name:hell")
# A query that matches the desired albums but returns the tracks
songs_on_hell_albums = NestedChildren(all_albums, albums_with_hell)
# A query that matches tracks with heaven in the title
songs_with_heaven = qp.parse("track_name:heaven")
# A query that finds tracks with heaven in the title on albums
# with hell in the title
q = query.And([songs_on_hell_albums, songs_with_heaven])
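
            # A sketch of running the query and reading the stored track
            # fields (output depends on what was actually indexed)
            for hit in s.search(q):
                print(hit["track_num"], hit["track_name"])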
"""
    def __init__(self, parents, subq, boost=1.0):
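        """
        :param parents: a query, DocIdSet object, or Results object
            representing all of the "parent" documents in the index.
        :param subq: a query matching the parent documents whose child
            documents you want returned.
        :param boost: the score assigned to each returned child document
            (this query does not score children individually).
        """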
        self.parents = parents
        self.child = subq
        self.boost = boost
    def matcher(self, searcher, context=None):
        bits = searcher._filter_to_comb(self.parents)
        if not bits:
            return matching.NullMatcher
        m = self.child.matcher(searcher, context)
        if not m.is_active():
            return matching.NullMatcher

        return self.NestedChildMatcher(
            bits,
            m,
            searcher.doc_count_all(),
            searcher.reader().is_deleted,
            boost=self.boost,
        )
    class NestedChildMatcher(matching.WrappingMatcher):
        def __init__(
            self, parent_comb, wanted_parent_matcher, limit, is_deleted, boost=1.0
        ):
            self.parent_comb = parent_comb
            self.child = wanted_parent_matcher
            self.limit = limit
            self.is_deleted = is_deleted
            self.boost = boost
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def __repr__(self):
            return f"{self.__class__.__name__}({self.parent_comb!r}, {self.child!r})"

        def reset(self):
            self.child.reset()
            self._reset()

        def _reset(self):
            self._nextchild = -1
            self._nextparent = -1
            self._find_next_children()

        def is_active(self):
            return self._nextchild < self._nextparent

        def replace(self, minquality=0):
            return self
        def _find_next_children(self):
            # "comb" contains the doc IDs of all parent documents
            comb = self.parent_comb
            # "m" is the matcher for "wanted" parents
            m = self.child
            # Last doc ID + 1
            limit = self.limit
            # A function that returns True if a doc ID is deleted
            is_deleted = self.is_deleted

            nextchild = self._nextchild
            nextparent = self._nextparent

            while m.is_active():
                # Move the "child id" to the document after the current match
                nextchild = m.id() + 1
                # Move the parent matcher to the next match
                m.next()

                # Find the next parent document (matching or not) after this
                nextparent = comb.after(nextchild)
                if nextparent is None:
                    nextparent = limit

                # Skip any deleted child documents
                while is_deleted(nextchild):
                    nextchild += 1

                # If skipping deleted documents put us to or past the next
                # parent doc, go again
                if nextchild >= nextparent:
                    continue
                else:
                    # Otherwise, we're done
                    break

            self._nextchild = nextchild
            self._nextparent = nextparent
        def id(self):
            return self._nextchild

        def all_ids(self):
            while self.is_active():
                yield self.id()
                self.next()

        def next(self):
            is_deleted = self.is_deleted
            limit = self.limit
            nextparent = self._nextparent

            # Go to the next document
            nextchild = self._nextchild
            nextchild += 1

            # Skip over any deleted child documents
            while nextchild < nextparent and is_deleted(nextchild):
                nextchild += 1
            self._nextchild = nextchild

            # If we're at or past the next parent doc, go to the next set of
            # children
            if nextchild >= limit:
                return
            elif nextchild >= nextparent:
                self._find_next_children()
        def skip_to(self, docid):
            comb = self.parent_comb
            wanted = self.child

            # self._nextchild is the "current" matching child ID
            if docid <= self._nextchild:
                return

            # self._nextparent is the next parent ID (matching or not)
            if docid < self._nextparent:
                # Just iterate
                while self.is_active() and self.id() < docid:
                    self.next()
            elif wanted.is_active():
                # Find the parent before the target ID
                pid = comb.before(docid)
                # Skip the parent matcher to that ID
                wanted.skip_to(pid)

                # If that made the matcher inactive, then we're done
                if not wanted.is_active():
                    self._nextchild = self._nextparent = self.limit
                else:
                    # Reestablish for the next child after the next matching
                    # parent
                    self._find_next_children()
            else:
                self._nextchild = self._nextparent = self.limit
        def value(self):
            raise NotImplementedError(self.__class__)
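
        # Child documents are not scored individually; every returned child
        # simply gets this query's boost as its score.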
        def score(self):
            return self.boost

        def spans(self):
            return []